From e70f5617f1125e1f39a75d7a8c92ddda86a8056d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 11:02:12 -0700
Subject: [PATCH 0001/1208] tgsi_to_nir: Fix translation of TXF on MSAA
 targets.

Noticed while trying to add GL_ARB_texture_multisample support to vc4.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 061f39ac6f3..065bbf050c2 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1078,7 +1078,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       samp = 2;
       break;
    case TGSI_OPCODE_TXF:
-      op = nir_texop_txf;
+      if (tgsi_inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
+          tgsi_inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+         op = nir_texop_txf_ms;
+      } else {
+         op = nir_texop_txf;
+      }
       num_srcs = 2;
       break;
    case TGSI_OPCODE_TXD:
@@ -1178,7 +1183,10 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
 
    if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
       instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
-      instr->src[src_number].src_type = nir_tex_src_lod;
+      if (op == nir_texop_txf_ms)
+         instr->src[src_number].src_type = nir_tex_src_ms_index;
+      else
+         instr->src[src_number].src_type = nir_tex_src_lod;
       src_number++;
    }
 

From fc0da629b502bb072b945932bae0477eb9b62bd5 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 20 Jun 2015 15:30:19 -0700
Subject: [PATCH 0002/1208] vc4: Fix printfs for blit fallbacks.

---
 src/gallium/drivers/vc4/vc4_blit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index d29e2c9c318..e52a1941730 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
         struct vc4_context *vc4 = vc4_context(ctx);
 
         if (!util_blitter_is_blit_supported(vc4->blitter, info)) {
-                fprintf(stderr, "blit unsupported %s -> %s",
+                fprintf(stderr, "blit unsupported %s -> %s\n",
                     util_format_short_name(info->src.resource->format),
                     util_format_short_name(info->dst.resource->format));
                 return false;
@@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
             info.dst.resource->nr_samples <= 1 &&
             !util_format_is_depth_or_stencil(info.src.resource->format) &&
             !util_format_is_pure_integer(info.src.resource->format)) {
-                fprintf(stderr, "color resolve unimplemented");
+                fprintf(stderr, "color resolve unimplemented\n");
                 return;
         }
 
@@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
         }
 
         if (info.mask & PIPE_MASK_S) {
-                fprintf(stderr, "cannot blit stencil, skipping");
+                fprintf(stderr, "cannot blit stencil, skipping\n");
                 info.mask &= ~PIPE_MASK_S;
         }
 

From 19056d04296444afefe71ad8094d327ed38967bf Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Jun 2015 17:31:24 -0700
Subject: [PATCH 0003/1208] vc4: Reuse (and extend) the packet.h sizes for
 dumping.

---
 src/gallium/drivers/vc4/kernel/vc4_packet.h |  7 ++
 src/gallium/drivers/vc4/vc4_cl_dump.c       | 82 ++++++++++-----------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h
index 88cfc0fa9f0..8e6f2a1ac2c 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -88,16 +88,22 @@ enum vc4_packet {
 #define VC4_PACKET_START_TILE_BINNING_SIZE				1
 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE				1
 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE				1
+#define VC4_PACKET_BRANCH_SIZE						5
 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE				5
 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE				1
 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE			1
+#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE			5
 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE			7
 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE			7
 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE				14
 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE				10
+#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE				1
+#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE			1
 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE				2
 #define VC4_PACKET_GL_SHADER_STATE_SIZE					5
 #define VC4_PACKET_NV_SHADER_STATE_SIZE					5
+#define VC4_PACKET_VG_SHADER_STATE_SIZE					5
 #define VC4_PACKET_CONFIGURATION_BITS_SIZE				4
 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE				5
 #define VC4_PACKET_POINT_SIZE_SIZE					5
@@ -106,6 +112,7 @@ enum vc4_packet {
 #define VC4_PACKET_DEPTH_OFFSET_SIZE					5
 #define VC4_PACKET_CLIP_WINDOW_SIZE					9
 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE					5
+#define VC4_PACKET_Z_CLIPPING_SIZE					9
 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE				9
 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE				9
 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE			16
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 69055081daa..4cc197acd77 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -291,63 +291,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset)
                 offset, hw_offset, handles[0], handles[1]);
 }
 
-#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name }
-#define PACKET(name, size) [name] = { #name, size, NULL }
+#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name }
+#define PACKET(name) [name] = { #name, name ## _SIZE, NULL }
 
 static const struct packet_info {
         const char *name;
         uint8_t size;
         void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset);
 } packet_info[] = {
-        PACKET(VC4_PACKET_HALT, 1),
-        PACKET(VC4_PACKET_NOP, 1),
+        PACKET(VC4_PACKET_HALT),
+        PACKET(VC4_PACKET_NOP),
 
-        PACKET(VC4_PACKET_FLUSH, 1),
-        PACKET(VC4_PACKET_FLUSH_ALL, 1),
-        PACKET(VC4_PACKET_START_TILE_BINNING, 1),
-        PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1),
-        PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1),
+        PACKET(VC4_PACKET_FLUSH),
+        PACKET(VC4_PACKET_FLUSH_ALL),
+        PACKET(VC4_PACKET_START_TILE_BINNING),
+        PACKET(VC4_PACKET_INCREMENT_SEMAPHORE),
+        PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE),
 
-        PACKET(VC4_PACKET_BRANCH, 5),
-        PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5),
+        PACKET(VC4_PACKET_BRANCH),
+        PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST),
 
-        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1),
-        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1),
-        PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5),
-        PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5),
-        PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7),
-        PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7),
+        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER),
+        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF),
+        PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER),
+        PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER),
+        PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL),
+        PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
-        PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14),
-        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10),
+        PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE),
+        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),
 
-        PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48),
-        PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49),
+        PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE),
+        PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE),
 
-        PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2),
+        PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT),
 
-        PACKET(VC4_PACKET_GL_SHADER_STATE, 5),
-        PACKET(VC4_PACKET_NV_SHADER_STATE, 5),
-        PACKET(VC4_PACKET_VG_SHADER_STATE, 5),
+        PACKET(VC4_PACKET_GL_SHADER_STATE),
+        PACKET(VC4_PACKET_NV_SHADER_STATE),
+        PACKET(VC4_PACKET_VG_SHADER_STATE),
 
-        PACKET(VC4_PACKET_CONFIGURATION_BITS, 4),
-        PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5),
-        PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5),
-        PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5),
-        PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3),
-        PACKET(VC4_PACKET_DEPTH_OFFSET, 5),
-        PACKET(VC4_PACKET_CLIP_WINDOW, 9),
-        PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5),
-        PACKET(VC4_PACKET_Z_CLIPPING, 9),
-        PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9),
-        PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9),
+        PACKET(VC4_PACKET_CONFIGURATION_BITS),
+        PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS),
+        PACKET_DUMP(VC4_PACKET_POINT_SIZE),
+        PACKET_DUMP(VC4_PACKET_LINE_WIDTH),
+        PACKET(VC4_PACKET_RHT_X_BOUNDARY),
+        PACKET(VC4_PACKET_DEPTH_OFFSET),
+        PACKET(VC4_PACKET_CLIP_WINDOW),
+        PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET),
+        PACKET(VC4_PACKET_Z_CLIPPING),
+        PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING),
+        PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING),
 
-        PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
-        PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11),
-        PACKET(VC4_PACKET_CLEAR_COLORS, 14),
-        PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3),
+        PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG),
+        PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG),
+        PACKET(VC4_PACKET_CLEAR_COLORS),
+        PACKET_DUMP(VC4_PACKET_TILE_COORDINATES),
 
-        PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9),
+        PACKET_DUMP(VC4_PACKET_GEM_HANDLES),
 };
 
 void

From 8fbcabc41a4b2c7d7571585bde2e009e57982da4 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Jun 2015 13:14:57 -0700
Subject: [PATCH 0004/1208] vc4: Add an "args" temporary for RCL setup.

---
 .../drivers/vc4/kernel/vc4_render_cl.c        | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
index e2d907ad91f..deb2ccfa0d4 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec,
 		      struct vc4_rcl_setup *setup,
 		      uint8_t x, uint8_t y, bool first, bool last)
 {
-	bool has_bin = exec->args->bin_cl_size != 0;
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
 
 	/* Note that the load doesn't actually occur until the
 	 * tile coords packet is processed, and only one load
@@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec,
 	 */
 	if (setup->color_read) {
 		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->color_read.bits);
+		rcl_u16(setup, args->color_read.bits);
 		rcl_u32(setup,
-			setup->color_read->paddr +
-			exec->args->color_read.offset);
+			setup->color_read->paddr + args->color_read.offset);
 	}
 
 	if (setup->zs_read) {
@@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec,
 		}
 
 		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->zs_read.bits);
-		rcl_u32(setup,
-			setup->zs_read->paddr + exec->args->zs_read.offset);
+		rcl_u16(setup, args->zs_read.bits);
+		rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset);
 	}
 
 	/* Clipping depends on tile coordinates having been
@@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec,
 
 	if (setup->zs_write) {
 		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->zs_write.bits |
+		rcl_u16(setup, args->zs_write.bits |
 			(setup->color_ms_write ?
 			 VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0));
 		rcl_u32(setup,
-			(setup->zs_write->paddr + exec->args->zs_write.offset) |
+			(setup->zs_write->paddr + args->zs_write.offset) |
 			((last && !setup->color_ms_write) ?
 			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
 	}
@@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec,
 static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 			     struct vc4_rcl_setup *setup)
 {
-	bool has_bin = exec->args->bin_cl_size != 0;
-	uint8_t min_x_tile = exec->args->min_x_tile;
-	uint8_t min_y_tile = exec->args->min_y_tile;
-	uint8_t max_x_tile = exec->args->max_x_tile;
-	uint8_t max_y_tile = exec->args->max_y_tile;
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	uint8_t min_x_tile = args->min_x_tile;
+	uint8_t min_y_tile = args->min_y_tile;
+	uint8_t max_x_tile = args->max_x_tile;
+	uint8_t max_y_tile = args->max_y_tile;
 	uint8_t xtiles = max_x_tile - min_x_tile + 1;
 	uint8_t ytiles = max_y_tile - min_y_tile + 1;
 	uint8_t x, y;
@@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;
 	loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE;
 
-	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
 		size += VC4_PACKET_CLEAR_COLORS_SIZE +
 			VC4_PACKET_TILE_COORDINATES_SIZE +
 			VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
@@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	rcl_u32(setup,
 		(setup->color_ms_write ?
 		 (setup->color_ms_write->paddr +
-		  exec->args->color_ms_write.offset) :
+		  args->color_ms_write.offset) :
 		 0));
-	rcl_u16(setup, exec->args->width);
-	rcl_u16(setup, exec->args->height);
-	rcl_u16(setup, exec->args->color_ms_write.bits);
+	rcl_u16(setup, args->width);
+	rcl_u16(setup, args->height);
+	rcl_u16(setup, args->color_ms_write.bits);
 
 	/* The tile buffer gets cleared when the previous tile is stored.  If
 	 * the clear values changed between frames, then the tile buffer has
 	 * stale clear values in it, so we have to do a store in None mode (no
 	 * writes) so that we trigger the tile buffer clear.
 	 */
-	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
 		rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
-		rcl_u32(setup, exec->args->clear_color[0]);
-		rcl_u32(setup, exec->args->clear_color[1]);
-		rcl_u32(setup, exec->args->clear_z);
-		rcl_u8(setup, exec->args->clear_s);
+		rcl_u32(setup, args->clear_color[0]);
+		rcl_u32(setup, args->clear_color[1]);
+		rcl_u32(setup, args->clear_z);
+		rcl_u8(setup, args->clear_s);
 
 		vc4_tile_coordinates(setup, 0, 0);
 

From 76851f49a5beac01b4eee7892ca95f44b5e18e29 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Jun 2015 11:45:27 -0700
Subject: [PATCH 0005/1208] vc4: Clarify size calculation for Z/S writes.

It's the same value for loads and stores, because they're basically the
same packet.
---
 src/gallium/drivers/vc4/kernel/vc4_render_cl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
index deb2ccfa0d4..f55ffe5a8db 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	}
 
 	if (setup->zs_write)
-		loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
 	if (setup->color_ms_write) {
 		if (setup->zs_write)
 			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;

From af83eb25812fbda89de62b58f9e59a5408ad4654 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 17:53:07 -0700
Subject: [PATCH 0006/1208] vc4: Pull the blending operation out to a separate
 function.

It's fairly separate from the rest of the TLB operations at frag end time,
and we'll need to run it multiple times to support MSAA blending.
---
 src/gallium/drivers/vc4/vc4_program.c | 88 +++++++++++++++------------
 1 file changed, 50 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ba47c51d9bd..c620a4a351f 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1371,12 +1371,13 @@ vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst)
         }
 }
 
-static void
-emit_frag_end(struct vc4_compile *c)
+/**
+ * Applies the GL blending pipeline and returns the packed (8888) output
+ * color.
+ */
+static struct qreg
+blend_pipeline(struct vc4_compile *c)
 {
-        clip_distance_discard(c);
-        alpha_test_discard(c);
-
         enum pipe_format color_format = c->fs_key->color_format;
         const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
         struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef };
@@ -1408,14 +1409,16 @@ emit_frag_end(struct vc4_compile *c)
                 packed_dst_color = qir_MOV(c, r4);
         }
 
+        struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
+        const struct qreg *output_colors = (c->output_color_index != -1 ?
+                                            c->outputs + c->output_color_index :
+                                            undef_array);
+        struct qreg blend_src_color[4];
+        for (int i = 0; i < 4; i++)
+                blend_src_color[i] = output_colors[i];
+
         struct qreg blend_color[4];
-        struct qreg undef_array[4] = {
-                c->undef, c->undef, c->undef, c->undef
-        };
-        vc4_blend(c, blend_color, linear_dst_color,
-                  (c->output_color_index != -1 ?
-                   c->outputs + c->output_color_index :
-                   undef_array));
+        vc4_blend(c, blend_color, linear_dst_color, blend_src_color);
 
         if (util_format_is_srgb(color_format)) {
                 for (int i = 0; i < 3; i++)
@@ -1439,30 +1442,6 @@ emit_frag_end(struct vc4_compile *c)
                                                            format_swiz[i]);
         }
 
-        if (c->discard.file != QFILE_NULL)
-                qir_TLB_DISCARD_SETUP(c, c->discard);
-
-        if (c->fs_key->stencil_enabled) {
-                qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 0));
-                if (c->fs_key->stencil_twoside) {
-                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 1));
-                }
-                if (c->fs_key->stencil_full_writemasks) {
-                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 2));
-                }
-        }
-
-        if (c->fs_key->depth_enabled) {
-                struct qreg z;
-                if (c->output_position_index != -1) {
-                        z = qir_FTOI(c, qir_FMUL(c, c->outputs[c->output_position_index + 2],
-                                                 qir_uniform_f(c, 0xffffff)));
-                } else {
-                        z = qir_FRAG_Z(c);
-                }
-                qir_TLB_Z_WRITE(c, z);
-        }
-
         struct qreg packed_color = c->undef;
         for (int i = 0; i < 4; i++) {
                 if (swizzled_outputs[i].file == QFILE_NULL)
@@ -1502,8 +1481,41 @@ emit_frag_end(struct vc4_compile *c)
                                               qir_uniform_ui(c, ~colormask)));
         }
 
-        qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef,
-                             packed_color, c->undef));
+        return packed_color;
+}
+
+static void
+emit_frag_end(struct vc4_compile *c)
+{
+        clip_distance_discard(c);
+        alpha_test_discard(c);
+        struct qreg color = blend_pipeline(c);
+
+        if (c->discard.file != QFILE_NULL)
+                qir_TLB_DISCARD_SETUP(c, c->discard);
+
+        if (c->fs_key->stencil_enabled) {
+                qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 0));
+                if (c->fs_key->stencil_twoside) {
+                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 1));
+                }
+                if (c->fs_key->stencil_full_writemasks) {
+                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 2));
+                }
+        }
+
+        if (c->fs_key->depth_enabled) {
+                struct qreg z;
+                if (c->output_position_index != -1) {
+                        z = qir_FTOI(c, qir_FMUL(c, c->outputs[c->output_position_index + 2],
+                                                 qir_uniform_f(c, 0xffffff)));
+                } else {
+                        z = qir_FRAG_Z(c);
+                }
+                qir_TLB_Z_WRITE(c, z);
+        }
+
+        qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, color, c->undef));
 }
 
 static void

From 0f69d59b1c8f5314c1abe18659b96adcfc51a0e5 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 18:04:00 -0700
Subject: [PATCH 0007/1208] vc4: Make a helper for TLB color writes, too.

We've done so for all the other QIR instruction generation in this file.
---
 src/gallium/drivers/vc4/vc4_program.c | 2 +-
 src/gallium/drivers/vc4/vc4_qir.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index c620a4a351f..2061631dc9e 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1515,7 +1515,7 @@ emit_frag_end(struct vc4_compile *c)
                 qir_TLB_Z_WRITE(c, z);
         }
 
-        qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, color, c->undef));
+        qir_TLB_COLOR_WRITE(c, color);
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 732cfd0b306..2f1e261f880 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -523,6 +523,7 @@ QIR_ALU0(FRAG_W)
 QIR_ALU0(FRAG_REV_FLAG)
 QIR_ALU0(TEX_RESULT)
 QIR_ALU0(TLB_COLOR_READ)
+QIR_NODST_1(TLB_COLOR_WRITE)
 QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)

From 997f6778414a352457162b73ff5295e51e09ad63 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 18:08:49 -0700
Subject: [PATCH 0008/1208] vc4: Don't try to CSE color reads.

It returns a new value for each sample in the TLB.  We've already avoided
trying to get the same index's color multiple times at the vc4_program.c
level, so we're not losing anything by doing this.
---
 src/gallium/drivers/vc4/vc4_opt_cse.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 92c8260eb59..51a56504e5e 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -130,7 +130,8 @@ qir_opt_cse(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (qir_has_side_effects(c, inst) ||
-                    qir_has_side_effect_reads(c, inst)) {
+                    qir_has_side_effect_reads(c, inst) ||
+                    inst->op == QOP_TLB_COLOR_READ) {
                         continue;
                 }
 

From 5458ac01ae046010f3f7e4ddbf8ef18cca04d96c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Jun 2015 17:34:24 -0700
Subject: [PATCH 0009/1208] vc4: Add dumping for
 VC4_PACKET_LOAD/STORE_FULL_RES_TILE_BUFFER.

---
 src/gallium/drivers/vc4/kernel/vc4_packet.h | 10 +++++++
 src/gallium/drivers/vc4/vc4_cl_dump.c       | 30 +++++++++++++++++++--
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h
index 8e6f2a1ac2c..771e2b78761 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -141,6 +141,16 @@ enum vc4_packet {
 #define VC4_TILING_FORMAT_LT        2
 /** @} */
 
+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     (1 << 3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       (1 << 2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)
+
 /** @{
  *
  * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 4cc197acd77..289d4d6c521 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -46,6 +46,32 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset
                 offset, hw_offset, *addr);
 }
 
+static void
+dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        uint32_t bits = *(uint32_t *)(cl + offset);
+
+        fprintf(stderr, "0x%08x 0x%08x:      addr 0x%08x%s%s%s%s\n",
+                offset, hw_offset,
+                bits & ~0xf,
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear",
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs",
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color",
+                (bits & VC4_LOADSTORE_FULL_RES_EOF) ? " eof" : "");
+}
+
+static void
+dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_full(cl, offset, hw_offset);
+}
+
+static void
+dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_full(cl, offset, hw_offset);
+}
+
 static void
 dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
 {
@@ -313,8 +339,8 @@ static const struct packet_info {
 
         PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER),
         PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF),
-        PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER),
-        PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER),
+        PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER),
+        PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER),
         PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL),
         PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 

From 3fd4c80b32e3080d761e176d129a1e46c618584a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Jun 2015 17:38:14 -0700
Subject: [PATCH 0010/1208] vc4: Also dump VC4_PACKET_LOAD_TILE_BUFFER_GENERAL.

---
 src/gallium/drivers/vc4/vc4_cl_dump.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 289d4d6c521..64de79cc830 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -73,7 +73,7 @@ dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t h
 }
 
 static void
-dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint8_t *bytes = cl + offset;
         uint32_t *addr = cl + offset + 2;
@@ -150,6 +150,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw
                 (*addr & (1 << 3)) ? " EOF" : "");
 }
 
+static void
+dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_general(cl, offset, hw_offset);
+}
+
+static void
+dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_general(cl, offset, hw_offset);
+}
+
 static void
 dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset)
 {
@@ -342,7 +354,7 @@ static const struct packet_info {
         PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER),
         PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER),
         PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL),
-        PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
+        PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
         PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE),
         PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),

From 7796e8889a9a2cc1b454dc32d8da3d756404339a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Thu, 21 May 2015 10:49:05 +0900
Subject: [PATCH 0011/1208] winsys/radeon: Unmap GPU VM address range when
 destroying BO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

But only when doing so is safe according to the
RADEON_INFO_VA_UNMAP_WORKING kernel query.

This avoids kernel GPU VM address range conflicts when the BO has other
references than the GEM handle being closed, e.g. when the BO is shared.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90537
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90873

Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 28 ++++++++++++++++---
 .../winsys/radeon/drm/radeon_drm_winsys.c     |  4 +++
 .../winsys/radeon/drm/radeon_drm_winsys.h     |  1 +
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index fe98870967a..78c95b15eb2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -305,14 +305,34 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
     if (bo->ptr)
         os_munmap(bo->ptr, bo->base.size);
 
+    if (mgr->va) {
+        if (bo->rws->va_unmap_working) {
+            struct drm_radeon_gem_va va;
+
+            va.handle = bo->handle;
+            va.vm_id = 0;
+            va.operation = RADEON_VA_UNMAP;
+            va.flags = RADEON_VM_PAGE_READABLE |
+                       RADEON_VM_PAGE_WRITEABLE |
+                       RADEON_VM_PAGE_SNOOPED;
+            va.offset = bo->va;
+
+            if (drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_VA, &va,
+				    sizeof(va)) != 0 &&
+		va.operation == RADEON_VA_RESULT_ERROR) {
+                fprintf(stderr, "radeon: Failed to deallocate virtual address for buffer:\n");
+                fprintf(stderr, "radeon:    size      : %d bytes\n", bo->base.size);
+                fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
+            }
+	}
+
+	radeon_bomgr_free_va(mgr, bo->va, bo->base.size);
+    }
+
     /* Close object. */
     args.handle = bo->handle;
     drmIoctl(bo->rws->fd, DRM_IOCTL_GEM_CLOSE, &args);
 
-    if (mgr->va) {
-        radeon_bomgr_free_va(mgr, bo->va, bo->base.size);
-    }
-
     pipe_mutex_destroy(bo->map_mutex);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index ba8d1437b6f..d457f8a5ad1 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -57,6 +57,8 @@
 #define RADEON_INFO_READ_REG		0x24
 #endif
 
+#define RADEON_INFO_VA_UNMAP_WORKING	0x25
+
 static struct util_hash_table *fd_tab = NULL;
 pipe_static_mutex(fd_tab_mutex);
 
@@ -399,6 +401,8 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
             if (!radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
                                       &ib_vm_max_size))
                 ws->info.r600_virtual_address = FALSE;
+            radeon_get_drm_value(ws->fd, RADEON_INFO_VA_UNMAP_WORKING, NULL,
+                                 &ws->va_unmap_working);
         }
 	if (ws->gen == DRV_R600 && !debug_get_bool_option("RADEON_VA", FALSE))
 		ws->info.r600_virtual_address = FALSE;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 166b6b93d28..99c8b8a8a1d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -74,6 +74,7 @@ struct radeon_drm_winsys {
     enum radeon_generation gen;
     struct radeon_info info;
     uint32_t va_start;
+    uint32_t va_unmap_working;
     uint32_t accel_working2;
 
     struct pb_manager *kman;

From c8b8e8b29b755cd3d80fc5e470f441cb3716152a Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 22 Jun 2015 14:20:20 -0700
Subject: [PATCH 0012/1208] i965: Don't count NIR instructions for shader-db.

Matt, Jason, and I haven't found this useful in a long time.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 31 -----------------------------
 1 file changed, 31 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index c13708a2f8a..dffb8ab1ca7 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -57,28 +57,6 @@ nir_optimize(nir_shader *nir)
    } while (progress);
 }
 
-static bool
-count_nir_instrs_in_block(nir_block *block, void *state)
-{
-   int *count = (int *) state;
-   nir_foreach_instr(block, instr) {
-      *count = *count + 1;
-   }
-   return true;
-}
-
-static int
-count_nir_instrs(nir_shader *nir)
-{
-   int count = 0;
-   nir_foreach_overload(nir, overload) {
-      if (!overload->impl)
-         continue;
-      nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
-   }
-   return count;
-}
-
 nir_shader *
 brw_create_nir(struct brw_context *brw,
                const struct gl_shader_program *shader_prog,
@@ -178,15 +156,6 @@ brw_create_nir(struct brw_context *brw,
       nir_print_shader(nir, stderr);
    }
 
-   static GLuint msg_id = 0;
-   _mesa_gl_debug(&brw->ctx, &msg_id,
-                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                  MESA_DEBUG_TYPE_OTHER,
-                  MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s NIR shader: %d inst\n",
-                  _mesa_shader_stage_to_abbrev(stage),
-                  count_nir_instrs(nir));
-
    nir_convert_from_ssa(nir);
    nir_validate_shader(nir);
 

From 23132cd13baa7b3e9688a118466261a282594b8e Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 23 Jun 2015 23:15:22 -0700
Subject: [PATCH 0013/1208] i965: Fix whitespace error in gen8_depth_state.c

Trivial.
---
 src/mesa/drivers/dri/i965/gen8_depth_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 12ac97a5d14..7c4ec06e84d 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -100,7 +100,7 @@ emit_depth_packets(struct brw_context *brw,
    }
 
    if (stencil_mt == NULL) {
-     BEGIN_BATCH(5);
+      BEGIN_BATCH(5);
       OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
       OUT_BATCH(0);
       OUT_BATCH(0);

From 32a220f1f60980de50ecefb3b9ab1f754ade8c83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 9 Jun 2015 11:06:56 +0300
Subject: [PATCH 0014/1208] glsl: remove cross validation of interpolation
 qualifier with GLSL 4.40
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/link_varyings.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 278a778797b..020842a54a3 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -128,7 +128,17 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
       return;
    }
 
-   if (input->data.interpolation != output->data.interpolation) {
+   /* GLSL >= 4.40 removes text requiring interpolation qualifiers
+    * to match cross stage, they must only match within the same stage.
+    *
+    * From page 84 (page 90 of the PDF) of the GLSL 4.40 spec:
+    *
+    *     "It is a link-time error if, within the same stage, the interpolation
+    *     qualifiers of variables of the same name do not match.
+    *
+    */
+   if (input->data.interpolation != output->data.interpolation &&
+       prog->Version < 440) {
       linker_error(prog,
                    "%s shader output `%s' specifies %s "
                    "interpolation qualifier, "

From 29aaab2b5f55cc6d9a84f58ce2bb8607e76a9dde Mon Sep 17 00:00:00 2001
From: Grigori Goronzy <greg@chown.ath.cx>
Date: Wed, 24 Jun 2015 03:38:02 +0200
Subject: [PATCH 0015/1208] winsys/radeon: align BO size to page size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the basic granularity for BO allocations. The alignment also
helps with BO reuse by the cached bufmgr.

This results in a huge 45% speedup in Metro 2033 Redux on my test
system. The game relies on buffer orphaning with very small buffers
(hundreds of bytes in size) and that did not work efficiently
before. This change may also affect other applications and games.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 78c95b15eb2..1f0caf60197 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -840,6 +840,12 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
     memset(&desc, 0, sizeof(desc));
     desc.base.alignment = alignment;
 
+    /* Align size to page size. This is the minimum alignment for normal
+     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
+     * like constant/uniform buffers, can benefit from better and more reuse.
+     */
+    size = align(size, 4096);
+
     /* Only set one usage bit each for domains and flags, or the cache manager
      * might consider different sets of domains / flags compatible
      */

From 390f94e3581384838595185a06d5943089d3f9ab Mon Sep 17 00:00:00 2001
From: Grigori Goronzy <greg@chown.ath.cx>
Date: Wed, 24 Jun 2015 03:40:38 +0200
Subject: [PATCH 0016/1208] winsys/radeon: reduce BO cache timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1000 ms is an extreme value for typical interactive loads. A large
cache has some disadvantages. Search for reusable BOs can take a long
time and memory might get exhausted.

Let's be rather conservative and use half of the old value,
500ms. This is beneficial to some loads on my test system and there
are no regressions.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index d457f8a5ad1..d8bb353df9d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -710,7 +710,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     if (!ws->kman)
         goto fail;
 
-    ws->cman = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0,
+    ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0,
                                        MIN2(ws->info.vram_size, ws->info.gart_size));
     if (!ws->cman)
         goto fail;

From 30d67d38246410274713380664be87cd1df9486a Mon Sep 17 00:00:00 2001
From: Julien Isorce <julien.isorce@gmail.com>
Date: Tue, 23 Jun 2015 22:47:05 +0100
Subject: [PATCH 0017/1208] loader: move loader_open_device out of HAVE_LIBUDEV
 block

Fixes the following build issue, when building without libudev.

CCLD   libGL.la
./.libs/libglx.a(dri2_glx.o): In function `dri2CreateScreen':
src/glx/dri2_glx.c:1186: undefined reference to `loader_open_device'
collect2: ld returned 1 exit status

CCLD     libEGL.la
Undefined symbols for architecture x86_64:
"_loader_open_device", referenced from:
  _dri2_initialize_x11_dri2 in libegl_dri2.a(platform_x11.o)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91077
Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/loader/loader.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/loader/loader.c b/src/loader/loader.c
index fc468153425..8452cd3560e 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -64,6 +64,8 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include <errno.h>
+#include <fcntl.h>
 #include <sys/stat.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -71,10 +73,8 @@
 #ifdef HAVE_LIBUDEV
 #include <assert.h>
 #include <dlfcn.h>
-#include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
-#include <errno.h>
 #ifdef USE_DRICONF
 #include "xmlconfig.h"
 #include "xmlpool.h"
@@ -104,6 +104,22 @@ static void default_logger(int level, const char *fmt, ...)
 
 static void (*log_)(int level, const char *fmt, ...) = default_logger;
 
+int
+loader_open_device(const char *device_name)
+{
+   int fd;
+#ifdef O_CLOEXEC
+   fd = open(device_name, O_RDWR | O_CLOEXEC);
+   if (fd == -1 && errno == EINVAL)
+#endif
+   {
+      fd = open(device_name, O_RDWR);
+      if (fd != -1)
+         fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+   }
+   return fd;
+}
+
 #ifdef HAVE_LIBUDEV
 #include <libudev.h>
 
@@ -314,22 +330,6 @@ get_id_path_tag_from_fd(struct udev *udev, int fd)
    return id_path_tag;
 }
 
-int
-loader_open_device(const char *device_name)
-{
-   int fd;
-#ifdef O_CLOEXEC
-   fd = open(device_name, O_RDWR | O_CLOEXEC);
-   if (fd == -1 && errno == EINVAL)
-#endif
-   {
-      fd = open(device_name, O_RDWR);
-      if (fd != -1)
-         fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
-   }
-   return fd;
-}
-
 #ifdef USE_DRICONF
 const char __driConfigOptionsLoader[] =
 DRI_CONF_BEGIN

From a552c897caea31bbff3f16d2af8f5028a58bd344 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 24 Jun 2015 12:59:55 +0100
Subject: [PATCH 0018/1208] st/wgl: add stw_nopfuncs.h to the sources lists

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/state_trackers/wgl/Makefile.sources | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/state_trackers/wgl/Makefile.sources b/src/gallium/state_trackers/wgl/Makefile.sources
index 8c463d5f18e..1e00caf97b7 100644
--- a/src/gallium/state_trackers/wgl/Makefile.sources
+++ b/src/gallium/state_trackers/wgl/Makefile.sources
@@ -9,6 +9,7 @@ C_SOURCES := \
 	stw_framebuffer.c \
 	stw_getprocaddress.c \
 	stw_nopfuncs.c \
+	stw_nopfuncs.h \
 	stw_pixelformat.c \
 	stw_st.c \
 	stw_tls.c \

From c1de7df6d4086070e63369ab0af3950f53a03592 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 22 Jun 2015 14:04:09 -0600
Subject: [PATCH 0019/1208] st/mesa: remove unneeded pipe_surface_release() in
 st_render_texture()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This caused us to always free the pipe_surface for the renderbuffer.
The subsequent call to st_update_renderbuffer_surface() would typically
just recreate it.  Remove the call to pipe_surface_release() and let
st_update_renderbuffer_surface() take care of freeing the old surface
if it needs to be replaced (because of change to mipmap level, etc).

This can save quite a few calls to pipe_context::create_surface() and
surface_destroy().

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_cb_fbo.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 0399eef7204..57075904450 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -511,8 +511,6 @@ st_render_texture(struct gl_context *ctx,
    strb->rtt_layered = att->Layered;
    pipe_resource_reference(&strb->texture, pt);
 
-   pipe_surface_release(pipe, &strb->surface);
-
    st_update_renderbuffer_surface(st, strb);
 
    strb->Base.Format = st_pipe_format_to_mesa_format(pt->format);

From e31bce4041122cd00712b60b4dc1eae6486f6579 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 24 Jun 2015 10:41:52 -0600
Subject: [PATCH 0020/1208] svga: silence warnings about unexpected shader type

Trivial.
---
 src/gallium/drivers/svga/svga_screen.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 56e486786df..770f4933b4c 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -443,7 +443,9 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
       return 0;
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_COMPUTE:
-      /* no support for geometry or compute shaders at this time */
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+      /* no support for geometry, tess or compute shaders at this time */
       return 0;
    default:
       debug_printf("Unexpected shader type (%u) query\n", shader);

From 147cdb53ecd225ea21d8d552607d384217346ecb Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 23 Jun 2015 23:17:53 -0700
Subject: [PATCH 0021/1208] nir: Use a switch statement for detecting move-like
 operations.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Suggested by Jason Ekstrand.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/nir/nir_opt_peephole_select.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/glsl/nir/nir_opt_peephole_select.c b/src/glsl/nir/nir_opt_peephole_select.c
index ef7c9775aa3..6620e5dc81f 100644
--- a/src/glsl/nir/nir_opt_peephole_select.c
+++ b/src/glsl/nir/nir_opt_peephole_select.c
@@ -82,14 +82,22 @@ block_check_for_allowed_instrs(nir_block *block)
          break;
 
       case nir_instr_type_alu: {
-         /* It must be a move operation */
          nir_alu_instr *mov = nir_instr_as_alu(instr);
-         if (mov->op != nir_op_fmov && mov->op != nir_op_imov &&
-             mov->op != nir_op_fneg && mov->op != nir_op_ineg &&
-             mov->op != nir_op_fabs && mov->op != nir_op_iabs &&
-             mov->op != nir_op_vec2 && mov->op != nir_op_vec3 &&
-             mov->op != nir_op_vec4)
+         switch (mov->op) {
+         case nir_op_fmov:
+         case nir_op_imov:
+         case nir_op_fneg:
+         case nir_op_ineg:
+         case nir_op_fabs:
+         case nir_op_iabs:
+         case nir_op_vec2:
+         case nir_op_vec3:
+         case nir_op_vec4:
+            /* It must be a move-like operation. */
+            break;
+         default:
             return false;
+         }
 
          /* Can't handle saturate */
          if (mov->dest.saturate)

From 9d4b9f1e0c661e5ed8ce2e71c76ce8cc1adf90dd Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 30 Apr 2015 16:53:12 +0100
Subject: [PATCH 0022/1208] i965: Transplant PIPE_CONTROL routines to
 brw_pipe_control

Start trimming the fat from intel_batchbuffer.c. First by moving the set
of routines for emitting PIPE_CONTROLS (along with the lore concerning
hardware workarounds) to a separate brw_pipe_control.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/Makefile.sources    |   1 +
 src/mesa/drivers/dri/i965/brw_context.h       |  11 +
 src/mesa/drivers/dri/i965/brw_pipe_control.c  | 331 ++++++++++++++++++
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 304 ----------------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h |  10 -
 5 files changed, 343 insertions(+), 314 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_pipe_control.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 981fe79b132..5a33aacbc23 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -86,6 +86,7 @@ i965_FILES = \
 	brw_object_purgeable.c \
 	brw_packed_float.c \
 	brw_performance_monitor.c \
+	brw_pipe_control.c \
 	brw_primitive_restart.c \
 	brw_program.c \
 	brw_program.h \
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a7d83f8d7b4..761110beef3 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1998,6 +1998,17 @@ bool
 gen9_use_linear_1d_layout(const struct brw_context *brw,
                           const struct intel_mipmap_tree *mt);
 
+/* brw_pipe_control.c */
+void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
+void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
+                                 drm_intel_bo *bo, uint32_t offset,
+                                 uint32_t imm_lower, uint32_t imm_upper);
+void intel_batchbuffer_emit_mi_flush(struct brw_context *brw);
+void intel_emit_post_sync_nonzero_flush(struct brw_context *brw);
+void intel_emit_depth_stall_flushes(struct brw_context *brw);
+void gen7_emit_vs_workaround_flush(struct brw_context *brw);
+void gen7_emit_cs_stall_flush(struct brw_context *brw);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
new file mode 100644
index 00000000000..bd45a114f2f
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_fbo.h"
+#include "intel_reg.h"
+
+/**
+ * According to the latest documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ *  - Render Target Cache Flush
+ *  - Depth Cache Flush
+ *  - Stall at Pixel Scoreboard
+ *  - Post-Sync Operation
+ *  - Depth Stall
+ *
+ * I chose "Stall at Pixel Scoreboard" since we've used it effectively
+ * in the past, but the choice is fairly arbitrary.
+ */
+static void
+gen8_add_cs_stall_workaround_bits(uint32_t *flags)
+{
+   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                      PIPE_CONTROL_WRITE_IMMEDIATE |
+                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                      PIPE_CONTROL_WRITE_TIMESTAMP |
+                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                      PIPE_CONTROL_DEPTH_STALL;
+
+   /* If we're doing a CS stall, and don't already have one of the
+    * workaround bits set, add "Stall at Pixel Scoreboard."
+    */
+   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
+      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+}
+
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen == 7 && !brw->is_haswell) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return 0;
+      }
+
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return PIPE_CONTROL_CS_STALL;
+      }
+   }
+   return 0;
+}
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+}
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ *  - PIPE_CONTROL_WRITE_IMMEDIATE
+ *  - PIPE_CONTROL_WRITE_TIMESTAMP
+ *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
+                            drm_intel_bo *bo, uint32_t offset,
+                            uint32_t imm_lower, uint32_t imm_upper)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
+       * on later platforms.  We always use PPGTT on Gen7+.
+       */
+      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                gen6_gtt | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   }
+}
+
+/**
+ * Restriction [DevSNB, DevIVB]:
+ *
+ * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
+ * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
+ * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
+ * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
+ * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
+ * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
+ * unless SW can otherwise guarantee that the pipeline from WM onwards is
+ * already flushed (e.g., via a preceding MI_FLUSH).
+ */
+void
+intel_emit_depth_stall_flushes(struct brw_context *brw)
+{
+   assert(brw->gen >= 6 && brw->gen <= 9);
+
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+}
+
+/**
+ * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
+ * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
+ *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
+ *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
+ *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
+ *  to be sent before any combination of VS associated 3DSTATE."
+ */
+void
+gen7_emit_vs_workaround_flush(struct brw_context *brw)
+{
+   assert(brw->gen == 7);
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_WRITE_IMMEDIATE
+                               | PIPE_CONTROL_DEPTH_STALL,
+                               brw->batch.workaround_bo, 0,
+                               0, 0);
+}
+
+
+/**
+ * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
+ */
+void
+gen7_emit_cs_stall_flush(struct brw_context *brw)
+{
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_CS_STALL
+                               | PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->batch.workaround_bo, 0,
+                               0, 0);
+}
+
+
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6.  From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
+ */
+void
+intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
+{
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_CS_STALL |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
+
+   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->batch.workaround_bo, 0, 0, 0);
+}
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from a FBO-drawn texture, or flush so that frontbuffer
+ * render appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
+{
+   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
+      BEGIN_BATCH_BLT(4);
+      OUT_BATCH(MI_FLUSH_DW);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
+      if (brw->gen >= 6) {
+         if (brw->gen == 9) {
+            /* Hardware workaround: SKL
+             *
+             * Emit Pipe Control with all bits set to zero before emitting
+             * a Pipe Control with VF Cache Invalidate set.
+             */
+            brw_emit_pipe_control_flush(brw, 0);
+         }
+
+         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
+                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                  PIPE_CONTROL_CS_STALL;
+
+         if (brw->gen == 6) {
+            /* Hardware workaround: SNB B-Spec says:
+             *
+             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+             * Flush Enable =1, a PIPE_CONTROL with any non-zero
+             * post-sync-op is required.
+             */
+            intel_emit_post_sync_nonzero_flush(brw);
+         }
+      }
+      brw_emit_pipe_control_flush(brw, flags);
+   }
+
+   brw_render_cache_set_clear(brw);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ed659ed625e..54081a1412f 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -439,310 +439,6 @@ intel_batchbuffer_data(struct brw_context *brw,
    brw->batch.used += bytes >> 2;
 }
 
-/**
- * According to the latest documentation, any PIPE_CONTROL with the
- * "Command Streamer Stall" bit set must also have another bit set,
- * with five different options:
- *
- *  - Render Target Cache Flush
- *  - Depth Cache Flush
- *  - Stall at Pixel Scoreboard
- *  - Post-Sync Operation
- *  - Depth Stall
- *
- * I chose "Stall at Pixel Scoreboard" since we've used it effectively
- * in the past, but the choice is fairly arbitrary.
- */
-static void
-gen8_add_cs_stall_workaround_bits(uint32_t *flags)
-{
-   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                      PIPE_CONTROL_WRITE_IMMEDIATE |
-                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
-                      PIPE_CONTROL_WRITE_TIMESTAMP |
-                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
-                      PIPE_CONTROL_DEPTH_STALL;
-
-   /* If we're doing a CS stall, and don't already have one of the
-    * workaround bits set, add "Stall at Pixel Scoreboard."
-    */
-   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
-      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
-}
-
-/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
- *
- * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
- *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
- *
- * Note that the kernel does CS stalls between batches, so we only need
- * to count them within a batch.
- */
-static uint32_t
-gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
-{
-   if (brw->gen == 7 && !brw->is_haswell) {
-      if (flags & PIPE_CONTROL_CS_STALL) {
-         /* If we're doing a CS stall, reset the counter and carry on. */
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
-         return 0;
-      }
-
-      /* If this is the fourth pipe control without a CS stall, do one now. */
-      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
-         return PIPE_CONTROL_CS_STALL;
-      }
-   }
-   return 0;
-}
-
-/**
- * Emit a PIPE_CONTROL with various flushing flags.
- *
- * The caller is responsible for deciding what flags are appropriate for the
- * given generation.
- */
-void
-brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
-{
-   if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Emit a PIPE_CONTROL that writes to a buffer object.
- *
- * \p flags should contain one of the following items:
- *  - PIPE_CONTROL_WRITE_IMMEDIATE
- *  - PIPE_CONTROL_WRITE_TIMESTAMP
- *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
- */
-void
-brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                            drm_intel_bo *bo, uint32_t offset,
-                            uint32_t imm_lower, uint32_t imm_upper)
-{
-   if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                  offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
-       * on later platforms.  We always use PPGTT on Gen7+.
-       */
-      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                gen6_gtt | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Restriction [DevSNB, DevIVB]:
- *
- * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
- * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
- * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
- * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
- * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
- * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
- * unless SW can otherwise guarantee that the pipeline from WM onwards is
- * already flushed (e.g., via a preceding MI_FLUSH).
- */
-void
-intel_emit_depth_stall_flushes(struct brw_context *brw)
-{
-   assert(brw->gen >= 6 && brw->gen <= 9);
-
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-}
-
-/**
- * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
- *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
- *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
- *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
- *  to be sent before any combination of VS associated 3DSTATE."
- */
-void
-gen7_emit_vs_workaround_flush(struct brw_context *brw)
-{
-   assert(brw->gen == 7);
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_WRITE_IMMEDIATE
-                               | PIPE_CONTROL_DEPTH_STALL,
-                               brw->batch.workaround_bo, 0,
-                               0, 0);
-}
-
-
-/**
- * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
- */
-void
-gen7_emit_cs_stall_flush(struct brw_context *brw)
-{
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_CS_STALL
-                               | PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0,
-                               0, 0);
-}
-
-
-/**
- * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
- * implementing two workarounds on gen6.  From section 1.4.7.1
- * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
- *
- * [DevSNB-C+{W/A}] Before any depth stall flush (including those
- * produced by non-pipelined state commands), software needs to first
- * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
- * 0.
- *
- * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
- * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
- *
- * And the workaround for these two requires this workaround first:
- *
- * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
- * BEFORE the pipe-control with a post-sync op and no write-cache
- * flushes.
- *
- * And this last workaround is tricky because of the requirements on
- * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
- * volume 2 part 1:
- *
- *     "1 of the following must also be set:
- *      - Render Target Cache Flush Enable ([12] of DW1)
- *      - Depth Cache Flush Enable ([0] of DW1)
- *      - Stall at Pixel Scoreboard ([1] of DW1)
- *      - Depth Stall ([13] of DW1)
- *      - Post-Sync Operation ([13] of DW1)
- *      - Notify Enable ([8] of DW1)"
- *
- * The cache flushes require the workaround flush that triggered this
- * one, so we can't use it.  Depth stall would trigger the same.
- * Post-sync nonzero is what triggered this second workaround, so we
- * can't use that one either.  Notify enable is IRQs, which aren't
- * really our business.  That leaves only stall at scoreboard.
- */
-void
-intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
-{
-   brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_CS_STALL |
-                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
-
-   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0, 0, 0);
-}
-
-/* Emit a pipelined flush to either flush render and texture cache for
- * reading from a FBO-drawn texture, or flush so that frontbuffer
- * render appears on the screen in DRI1.
- *
- * This is also used for the always_flush_cache driconf debug option.
- */
-void
-intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
-{
-   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
-      BEGIN_BATCH_BLT(4);
-      OUT_BATCH(MI_FLUSH_DW);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
-      if (brw->gen >= 6) {
-         if (brw->gen == 9) {
-            /* Hardware workaround: SKL
-             *
-             * Emit Pipe Control with all bits set to zero before emitting
-             * a Pipe Control with VF Cache Invalidate set.
-             */
-            brw_emit_pipe_control_flush(brw, 0);
-         }
-
-         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                  PIPE_CONTROL_CS_STALL;
-
-         if (brw->gen == 6) {
-            /* Hardware workaround: SNB B-Spec says:
-             *
-             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
-             * Flush Enable =1, a PIPE_CONTROL with any non-zero
-             * post-sync-op is required.
-             */
-            intel_emit_post_sync_nonzero_flush(brw);
-         }
-      }
-      brw_emit_pipe_control_flush(brw, flags);
-   }
-
-   brw_render_cache_set_clear(brw);
-}
-
 static void
 load_sized_register_mem(struct brw_context *brw,
                         uint32_t reg,
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 7bdd8364346..ef8a6ffcca8 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -63,16 +63,6 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                     uint32_t read_domains,
                                     uint32_t write_domain,
                                     uint32_t offset);
-void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
-void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                                 drm_intel_bo *bo, uint32_t offset,
-                                 uint32_t imm_lower, uint32_t imm_upper);
-void intel_batchbuffer_emit_mi_flush(struct brw_context *brw);
-void intel_emit_post_sync_nonzero_flush(struct brw_context *brw);
-void intel_emit_depth_stall_flushes(struct brw_context *brw);
-void gen7_emit_vs_workaround_flush(struct brw_context *brw);
-void gen7_emit_cs_stall_flush(struct brw_context *brw);
-
 static inline uint32_t float_as_int(float f)
 {
    union {

From 4b35ab9bdb4e663f41ff5c9ae5bbcc650b6093f9 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 30 Apr 2015 17:04:51 +0100
Subject: [PATCH 0023/1208] i965: Rename intel_emit* to reflect their new
 location in brw_pipe_control

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_blorp.cpp             | 4 ++--
 src/mesa/drivers/dri/i965/brw_clear.c               | 4 ++--
 src/mesa/drivers/dri/i965/brw_context.h             | 6 +++---
 src/mesa/drivers/dri/i965/brw_draw.c                | 4 ++--
 src/mesa/drivers/dri/i965/brw_meta_fast_clear.c     | 4 ++--
 src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c   | 8 ++++----
 src/mesa/drivers/dri/i965/brw_meta_updownsample.c   | 4 ++--
 src/mesa/drivers/dri/i965/brw_misc_state.c          | 2 +-
 src/mesa/drivers/dri/i965/brw_performance_monitor.c | 8 ++++----
 src/mesa/drivers/dri/i965/brw_pipe_control.c        | 8 ++++----
 src/mesa/drivers/dri/i965/brw_state_upload.c        | 4 ++--
 src/mesa/drivers/dri/i965/gen6_blorp.cpp            | 6 +++---
 src/mesa/drivers/dri/i965/gen6_depth_state.c        | 2 +-
 src/mesa/drivers/dri/i965/gen6_queryobj.c           | 6 +++---
 src/mesa/drivers/dri/i965/gen6_sol.c                | 2 +-
 src/mesa/drivers/dri/i965/gen6_urb.c                | 2 +-
 src/mesa/drivers/dri/i965/gen7_blorp.cpp            | 4 ++--
 src/mesa/drivers/dri/i965/gen7_misc_state.c         | 2 +-
 src/mesa/drivers/dri/i965/gen7_sol_state.c          | 4 ++--
 src/mesa/drivers/dri/i965/gen8_depth_state.c        | 2 +-
 src/mesa/drivers/dri/i965/intel_blit.c              | 6 +++---
 src/mesa/drivers/dri/i965/intel_buffer_objects.c    | 4 ++--
 src/mesa/drivers/dri/i965/intel_extensions.c        | 6 +++---
 src/mesa/drivers/dri/i965/intel_fbo.c               | 2 +-
 src/mesa/drivers/dri/i965/intel_pixel_read.c        | 2 +-
 src/mesa/drivers/dri/i965/intel_syncobj.c           | 2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c         | 2 +-
 27 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index b404869f0c7..2ccfae1d77f 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -220,7 +220,7 @@ brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
     * data with different formats, which blorp does for stencil and depth
     * data.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
 retry:
    intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
@@ -283,7 +283,7 @@ retry:
    /* Flush the sampler cache so any texturing from the destination is
     * coherent.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt,
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 1d4ba3cac7e..f981388ef1a 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -184,7 +184,7 @@ brw_fast_clear_depth(struct gl_context *ctx)
     *      must be issued before the rectangle primitive used for the depth
     *      buffer clear operation.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (fb->MaxNumLayers > 0) {
       for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
@@ -204,7 +204,7 @@ brw_fast_clear_depth(struct gl_context *ctx)
        *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
        *      followed by Depth FLUSH'
       */
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    }
 
    /* Now, the HiZ buffer contains data that needs to be resolved to the depth
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 761110beef3..85d8f14a006 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -2003,9 +2003,9 @@ void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
 void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                                  drm_intel_bo *bo, uint32_t offset,
                                  uint32_t imm_lower, uint32_t imm_upper);
-void intel_batchbuffer_emit_mi_flush(struct brw_context *brw);
-void intel_emit_post_sync_nonzero_flush(struct brw_context *brw);
-void intel_emit_depth_stall_flushes(struct brw_context *brw);
+void brw_emit_mi_flush(struct brw_context *brw);
+void brw_emit_post_sync_nonzero_flush(struct brw_context *brw);
+void brw_emit_depth_stall_flushes(struct brw_context *brw);
 void gen7_emit_vs_workaround_flush(struct brw_context *brw);
 void gen7_emit_cs_stall_flush(struct brw_context *brw);
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index b91597a9f5d..69ad4d444da 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -217,7 +217,7 @@ static void brw_emit_prim(struct brw_context *brw,
     * the besides the draw code.
     */
    if (brw->always_flush_cache) {
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    }
 
    /* If indirect, emit a bunch of loads from the indirect BO. */
@@ -284,7 +284,7 @@ static void brw_emit_prim(struct brw_context *brw,
    ADVANCE_BATCH();
 
    if (brw->always_flush_cache) {
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 49f2e3e498c..5b8191c093b 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -623,7 +623,7 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
     *     write-flush must be issued before sending any DRAW commands on that
     *     render target.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* If we had to fall back to plain clear for any buffers, clear those now
     * by calling into meta.
@@ -677,7 +677,7 @@ brw_meta_resolve_color(struct brw_context *brw,
    GLuint fbo, rbo;
    struct rect rect;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_meta_begin(ctx, MESA_META_ALL);
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index d079197a2a9..d4abfe63de7 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -500,11 +500,11 @@ brw_meta_fbo_stencil_blit(struct brw_context *brw,
                              .mirror_x = mirror_x, .mirror_y = mirror_y };
    adjust_mip_level(dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    _mesa_meta_begin(ctx, MESA_META_ALL);
    brw_meta_stencil_blit(brw,
                          dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 void
@@ -524,7 +524,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw,
    if (dst->stencil_mt)
       dst = dst->stencil_mt;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    _mesa_meta_begin(ctx, MESA_META_ALL);
 
    _mesa_GenFramebuffers(1, &fbo);
@@ -535,7 +535,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw,
                                  GL_RENDERBUFFER, rbo);
 
    brw_meta_stencil_blit(brw, dst, 0, 0, &dims);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_DeleteRenderbuffers(1, &rbo);
    _mesa_DeleteFramebuffers(1, &fbo);
diff --git a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
index 21507b1ad2a..f39d50a69e6 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
@@ -116,7 +116,7 @@ brw_meta_updownsample(struct brw_context *brw,
       blit_bit = GL_COLOR_BUFFER_BIT;
    }
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_meta_begin(ctx, MESA_META_ALL);
    _mesa_GenFramebuffers(2, fbos);
@@ -147,5 +147,5 @@ brw_meta_updownsample(struct brw_context *brw,
 
    _mesa_meta_end(ctx);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 5a4515b582d..1bbb16cf697 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -580,7 +580,7 @@ brw_emit_depth_stencil_hiz(struct brw_context *brw,
     * non-pipelined state that will need the PIPE_CONTROL workaround.
     */
    if (brw->gen == 6) {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
    }
 
    unsigned int len;
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 2c8cd491a8e..0a123754257 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -581,7 +581,7 @@ snapshot_statistics_registers(struct brw_context *brw,
    const int group = PIPELINE_STATS_COUNTERS;
    const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    for (int i = 0; i < num_counters; i++) {
       if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
@@ -687,7 +687,7 @@ stop_oa_counters(struct brw_context *brw)
  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
  * including the required PIPE_CONTROL flushes.
  *
- * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
+ * Sandybridge is the worst case scenario: brw_emit_mi_flush
  * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
  * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
  * the 3 DWords for MI_REPORT_PERF_COUNT itself.
@@ -713,7 +713,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    int batch_used = brw->batch.used;
 
    /* Reports apparently don't always get written unless we flush first. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen == 5) {
       /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
@@ -751,7 +751,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    }
 
    /* Reports apparently don't always get written unless we flush after. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    (void) batch_used;
    assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index bd45a114f2f..b4c86b9dff9 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -189,7 +189,7 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
  * already flushed (e.g., via a preceding MI_FLUSH).
  */
 void
-intel_emit_depth_stall_flushes(struct brw_context *brw)
+brw_emit_depth_stall_flushes(struct brw_context *brw)
 {
    assert(brw->gen >= 6 && brw->gen <= 9);
 
@@ -270,7 +270,7 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
  * really our business.  That leaves only stall at scoreboard.
  */
 void
-intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
+brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
 {
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_CS_STALL |
@@ -287,7 +287,7 @@ intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
  * This is also used for the always_flush_cache driconf debug option.
  */
 void
-intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
+brw_emit_mi_flush(struct brw_context *brw)
 {
    if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
       BEGIN_BATCH_BLT(4);
@@ -321,7 +321,7 @@ intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
              * Flush Enable =1, a PIPE_CONTROL with any non-zero
              * post-sync-op is required.
              */
-            intel_emit_post_sync_nonzero_flush(brw);
+            brw_emit_post_sync_nonzero_flush(brw);
          }
       }
       brw_emit_pipe_control_flush(brw, flags);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 08d1ac28885..7662c3b580c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -349,7 +349,7 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
       return;
 
    if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
+      brw_emit_post_sync_nonzero_flush(brw);
 
    brw_upload_invariant_state(brw);
 
@@ -710,7 +710,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
 
    /* Emit Sandybridge workaround flushes on every primitive, for safety. */
    if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
+      brw_emit_post_sync_nonzero_flush(brw);
 
    brw_upload_programs(brw, pipeline);
    merge_ctx_state(brw, &state);
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index b6a3d78d849..54c4a6dfdd8 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -821,7 +821,7 @@ gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
 
    /* 3DSTATE_DEPTH_BUFFER */
    {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
 
       BEGIN_BATCH(7);
       /* 3DSTATE_DEPTH_BUFFER dw0 */
@@ -896,7 +896,7 @@ static void
 gen6_blorp_emit_depth_disable(struct brw_context *brw,
                               const brw_blorp_params *params)
 {
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    BEGIN_BATCH(7);
    OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
@@ -1021,7 +1021,7 @@ gen6_blorp_exec(struct brw_context *brw,
    uint32_t prog_offset = params->get_wm_prog(brw, &prog_data);
 
    /* Emit workaround flushes when we switch from drawing to blorping. */
-   intel_emit_post_sync_nonzero_flush(brw);
+   brw_emit_post_sync_nonzero_flush(brw);
 
    gen6_emit_3dstate_multisample(brw, params->dst.num_samples);
    gen6_emit_3dstate_sample_mask(brw,
diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
index 1df0bd47571..8f0d7dc5431 100644
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
@@ -65,7 +65,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
     */
    bool enable_hiz_ss = hiz || separate_stencil;
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    if (!irb)
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index ba5c944fb3d..9f4a5db3592 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -86,7 +86,7 @@ static void
 write_primitives_generated(struct brw_context *brw,
                            drm_intel_bo *query_bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen >= 7 && stream > 0) {
       brw_store_register_mem64(brw, query_bo,
@@ -100,7 +100,7 @@ static void
 write_xfb_primitives_written(struct brw_context *brw,
                              drm_intel_bo *bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen >= 7) {
       brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(stream), idx);
@@ -157,7 +157,7 @@ emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
    /* Emit a flush to make sure various parts of the pipeline are complete and
     * we get an accurate value
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    brw_store_register_mem64(brw, bo, reg, idx);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index be80d7bdfc5..3899ce9451f 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -292,5 +292,5 @@ brw_end_transform_feedback(struct gl_context *ctx,
     * simplicity, just do a full flush.
     */
    struct brw_context *brw = brw_context(ctx);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index 107a4f24fa6..c7311fd0b03 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -120,7 +120,7 @@ gen6_upload_urb( struct brw_context *brw )
     * a workaround.
     */
    if (brw->urb.gs_present && !gs_present)
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    brw->urb.gs_present = gs_present;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 2bdc82bc895..abace6df37e 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -645,7 +645,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
 
    /* 3DSTATE_DEPTH_BUFFER */
    {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
 
       BEGIN_BATCH(7);
       OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
@@ -696,7 +696,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
 static void
 gen7_blorp_emit_depth_disable(struct brw_context *brw)
 {
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    BEGIN_BATCH(7);
    OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
index f4f665219d6..a14d4a0c50d 100644
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
@@ -57,7 +57,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
       return;
    }
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    if (!irb)
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index aec4f44bb73..41573a80a52 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -365,7 +365,7 @@ gen7_save_primitives_written_counters(struct brw_context *brw,
    }
 
    /* Flush any drawing so that the counters have the right values. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
    for (int i = 0; i < streams; i++) {
@@ -502,7 +502,7 @@ gen7_pause_transform_feedback(struct gl_context *ctx,
       (struct brw_transform_feedback_object *) obj;
 
    /* Flush any drawing so that the counters have the right values. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the SOL buffer offset register values. */
    if (brw->gen < 8) {
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 7c4ec06e84d..81447f8d0b5 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -57,7 +57,7 @@ emit_depth_packets(struct brw_context *brw,
       return;
    }
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */
    BEGIN_BATCH(8);
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index d3ab769356c..9fac63d56a1 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -460,7 +460,7 @@ intelEmitCopyBlit(struct brw_context *brw,
 
    ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    return true;
 }
@@ -544,7 +544,7 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
 
    intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    return true;
 }
@@ -667,5 +667,5 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
    OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
    ADVANCE_BATCH_TILED(dst_y_tiled, false);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 627c487f0e7..ff05b5cd0e7 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -560,7 +560,7 @@ brw_unmap_buffer(struct gl_context *ctx,
        * flush.  Once again, we wish for a domain tracker in libdrm to cover
        * usage inside of a batchbuffer.
        */
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
 
       drm_intel_bo_unreference(intel_obj->range_map_bo[index]);
       intel_obj->range_map_bo[index] = NULL;
@@ -632,7 +632,7 @@ brw_copy_buffer_subdata(struct gl_context *ctx,
     * flush.  Once again, we wish for a domain tracker in libdrm to cover
     * usage inside of a batchbuffer.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 365b4b8f718..3423190c485 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -76,7 +76,7 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    OUT_BATCH(expected_value);
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the register's value back to the buffer. */
    BEGIN_BATCH(3);
@@ -132,7 +132,7 @@ can_write_oacontrol(struct brw_context *brw)
    OUT_BATCH(expected_value);
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the register's value back to the buffer. */
    BEGIN_BATCH(3);
@@ -143,7 +143,7 @@ can_write_oacontrol(struct brw_context *brw)
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Set OACONTROL back to zero (everything off). */
    BEGIN_BATCH(3);
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 1b3a72f3ec2..9e6a7116630 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1076,7 +1076,7 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
    if (!_mesa_set_search(brw->render_cache, bo))
       return;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 30380570d62..3fe506e3cf1 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -247,7 +247,7 @@ intelReadPixels(struct gl_context * ctx,
           * rendered to via a PBO at any point, so it seems better to just
           * flush here unconditionally.
           */
-         intel_batchbuffer_emit_mi_flush(brw);
+         brw_emit_mi_flush(brw);
          return;
       }
 
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 3cfa7e593ab..c44c4beceef 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -69,7 +69,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
    assert(!fence->batch_bo);
    assert(!fence->signalled);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    fence->batch_bo = brw->batch.bo;
    drm_intel_bo_reference(fence->batch_bo);
    intel_batchbuffer_flush(brw);
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index ebe84b664d4..e077d5e4743 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -490,7 +490,7 @@ intel_get_tex_image(struct gl_context *ctx,
           * See the related comment in intelReadPixels() for a more detailed
           * explanation.
           */
-         intel_batchbuffer_emit_mi_flush(brw);
+         brw_emit_mi_flush(brw);
          return;
       }
 

From c2ff3485b3d48749ea9dcad07bc1a691627dc3e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Thu, 11 Jun 2015 10:41:52 +0300
Subject: [PATCH 0024/1208] glsl: clone inputs and outputs during linking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This increases memory pressure during linking but makes it easier
for backend to free IR after it is not needed anymore.

v2: use resource list as ralloc context in case of relink (Kenneth)

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: mesa-stable@lists.freedesktop.org
---
 src/glsl/linker.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 4a726d4e2e7..5da9cadcb08 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2637,7 +2637,9 @@ add_interface_variables(struct gl_shader_program *shProg,
          continue;
       };
 
-      if (!add_program_resource(shProg, programInterface, var,
+      /* Clone ir_variable data so that backend is able to free memory. */
+      if (!add_program_resource(shProg, programInterface,
+                                var->clone(shProg->ProgramResourceList, NULL),
                                 build_stageref(shProg, var->name) | mask))
          return false;
    }

From 104c8fc2c2aa5621261f80aa6b4f76c3163078f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Thu, 11 Jun 2015 10:41:53 +0300
Subject: [PATCH 0025/1208] i965: Delete linked GLSL IR when using NIR.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is based on Kenneth's patch to delete 'most of the IR'. Due to
linker changes to clone variables, we can now free all of IR.

Saves 58MB of memory when replaying a Dota 2 trace on Broadwell.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: mesa-stable@lists.freedesktop.org
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 32c40131434..5653d6ba1e4 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -387,8 +387,11 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       brw_add_texrect_params(prog);
 
-      if (options->NirOptions)
+      if (options->NirOptions) {
          prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage);
+         ralloc_free(shader->ir);
+         shader->ir = NULL;
+      }
 
       _mesa_reference_program(ctx, &prog, NULL);
    }

From b2c6ba0c4b21391dc35018e1c8c4f7f7d8952bea Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 8 Jun 2015 16:03:19 -0700
Subject: [PATCH 0026/1208] i965/fs_live_variables: Do liveness analysis
 bottom-to-top

From Muchnick's Advanced Compiler Design and Implementation:

"To determine which variables are live at each point in a flowgraph, we
perform a backward data-flow analysis"

Previously, we were walking the blocks forwards and updating the livein and
then the liveout.  However, the livein calculation depends on the liveout
and the liveout depends on the successor blocks.  The net result is that it
takes one full iteration to go from liveout to livein and then another
full iteration to propagate to the predecessors.  This works out to an
O(n^2) computation where n is the number of blocks.  If we run things in
the other order, it's O(nl) where l is the maximum loop depth which is
practically bounded by 3.

On my HSW desktop, one particular shadertoy test gets a 20% improvement in
compile times:

N           Min           Max        Median           Avg        Stddev
x  10        15.965        16.884        16.026       16.1822    0.34736846
+  10        12.813        13.052        12.876       12.8891    0.06913666
Difference at 95.0% confidence
        -3.2931 +/- 0.235316
        -20.3501% +/- 1.45417%
        (Student's t, pooled s = 0.250444)

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../dri/i965/brw_fs_live_variables.cpp        | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index 502161d5128..19aec92fad1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -204,27 +204,9 @@ fs_live_variables::compute_live_variables()
    while (cont) {
       cont = false;
 
-      foreach_block (block, cfg) {
+      foreach_block_reverse (block, cfg) {
          struct block_data *bd = &block_data[block->num];
 
-	 /* Update livein */
-	 for (int i = 0; i < bitset_words; i++) {
-            BITSET_WORD new_livein = (bd->use[i] |
-                                      (bd->liveout[i] &
-                                       ~bd->def[i]));
-	    if (new_livein & ~bd->livein[i]) {
-               bd->livein[i] |= new_livein;
-               cont = true;
-	    }
-	 }
-         BITSET_WORD new_livein = (bd->flag_use[0] |
-                                   (bd->flag_liveout[0] &
-                                    ~bd->flag_def[0]));
-         if (new_livein & ~bd->flag_livein[0]) {
-            bd->flag_livein[0] |= new_livein;
-            cont = true;
-         }
-
 	 /* Update liveout */
 	 foreach_list_typed(bblock_link, child_link, link, &block->children) {
             struct block_data *child_bd = &block_data[child_link->block->num];
@@ -244,6 +226,24 @@ fs_live_variables::compute_live_variables()
                cont = true;
             }
 	 }
+
+         /* Update livein */
+         for (int i = 0; i < bitset_words; i++) {
+            BITSET_WORD new_livein = (bd->use[i] |
+                                      (bd->liveout[i] &
+                                       ~bd->def[i]));
+            if (new_livein & ~bd->livein[i]) {
+               bd->livein[i] |= new_livein;
+               cont = true;
+            }
+         }
+         BITSET_WORD new_livein = (bd->flag_use[0] |
+                                   (bd->flag_liveout[0] &
+                                    ~bd->flag_def[0]));
+         if (new_livein & ~bd->flag_livein[0]) {
+            bd->flag_livein[0] |= new_livein;
+            cont = true;
+         }
       }
    }
 }

From 9f261dc18dba0aa4dc43fc560d343ba9ffd486e9 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 22 Jun 2015 11:09:27 -0700
Subject: [PATCH 0027/1208] radeon: Advertise correct
 GL_QUERY_COUNTER_BITS/GL_SAMPLES_PASSED value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit b765119c changed the default value of all the counter bits to
64.  However, older hardware only has 32 counter bits.

This has only been build-tested.  We don't have any tests that verify
the advertised value against implementation behavior, so I don't know
what additional testing could be done.

NOTE: It appears that many Gallium drivers (at least r300 and i915g)
have the same problem, but I don't see a way for the state-tracker to
determine the counter size.  Marek says, "For Gallium, a new PIPE_CAP or
new get_xxx_param function will be needed."

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
---
 .../dri/radeon/radeon_common_context.c        | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 9699dcbfcdc..3d0cedaf33a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -194,6 +194,29 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 
 	radeon_init_dma(radeon);
 
+        /* _mesa_initialize_context calls _mesa_init_queryobj which
+         * initializes all of the counter sizes to 64.  The counters on r100
+         * and r200 are only 32-bits for occlusion queries.  Those are the
+         * only counters, so set the other sizes to zero.
+         */
+        radeon->glCtx.Const.QueryCounterBits.SamplesPassed = 32;
+
+        radeon->glCtx.Const.QueryCounterBits.TimeElapsed = 0;
+        radeon->glCtx.Const.QueryCounterBits.Timestamp = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesGenerated = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesWritten = 0;
+        radeon->glCtx.Const.QueryCounterBits.VerticesSubmitted = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesSubmitted = 0;
+        radeon->glCtx.Const.QueryCounterBits.VsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.TessPatches = 0;
+        radeon->glCtx.Const.QueryCounterBits.TessInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.GsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.GsPrimitives = 0;
+        radeon->glCtx.Const.QueryCounterBits.FsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.ComputeInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.ClInPrimitives = 0;
+        radeon->glCtx.Const.QueryCounterBits.ClOutPrimitives = 0;
+
 	return GL_TRUE;
 }
 

From d1663ccb4c664b0f544ed5d6f0761f3ae2435199 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Wed, 17 Jun 2015 15:50:11 -0700
Subject: [PATCH 0028/1208] i965/bxt: Add basic Broxton infrastructure

The thread counts and URB information are all speculative numbers that were
based on some CHV numbers at the time.

v2:
Originally this patch had PCI IDs. I've moved that to a new patch at the end of
the series.
Remove is_cherryview hack.
Add PCI ids. These match the ones defined in the kernel. The only one tested by
us is 0x0a84.
Capitalize the hex string (Mark)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Tested-by: "Lecluse, Philippe" <Philippe.Lecluse@intel.com>
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
---
 include/pci_ids/i965_pci_ids.h              |  3 +++
 src/mesa/drivers/dri/i965/brw_context.c     |  1 +
 src/mesa/drivers/dri/i965/brw_context.h     |  1 +
 src/mesa/drivers/dri/i965/brw_device_info.c | 16 ++++++++++++++++
 src/mesa/drivers/dri/i965/brw_device_info.h |  1 +
 5 files changed, 22 insertions(+)

diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 8d757aaa767..8a425999429 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -128,3 +128,6 @@ CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B3, chv,     "Intel(R) HD Graphics (Cherryview)")
+CHIPSET(0x0A84, bxt,     "Intel(R) HD Graphics (Broxton)")
+CHIPSET(0x1A84, bxt,     "Intel(R) HD Graphics (Broxton)")
+CHIPSET(0x5A84, bxt,     "Intel(R) HD Graphics (Broxton)")
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index cf408830620..4b51fe5da56 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -715,6 +715,7 @@ brwCreateContext(gl_api api,
    brw->is_baytrail = devinfo->is_baytrail;
    brw->is_haswell = devinfo->is_haswell;
    brw->is_cherryview = devinfo->is_cherryview;
+   brw->is_broxton = devinfo->is_broxton;
    brw->has_llc = devinfo->has_llc;
    brw->has_hiz = devinfo->has_hiz_and_separate_stencil;
    brw->has_separate_stencil = devinfo->has_hiz_and_separate_stencil;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 85d8f14a006..3553f6ec48c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1117,6 +1117,7 @@ struct brw_context
    bool is_baytrail;
    bool is_haswell;
    bool is_cherryview;
+   bool is_broxton;
 
    bool has_hiz;
    bool has_separate_stencil;
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 97243a47293..342e56622b7 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -334,6 +334,22 @@ static const struct brw_device_info brw_device_info_skl_gt3 = {
    .supports_simd16_3src = true,
 };
 
+static const struct brw_device_info brw_device_info_bxt = {
+   GEN9_FEATURES,
+   .is_broxton = 1,
+   .gt = 1,
+   .has_llc = false,
+   .max_vs_threads = 112,
+   .max_gs_threads = 112,
+   .max_wm_threads = 32,
+   .urb = {
+      .size = 64,
+      .min_vs_entries = 34,
+      .max_vs_entries = 640,
+      .max_gs_entries = 256,
+   }
+};
+
 const struct brw_device_info *
 brw_get_device_info(int devid, int revision)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index 65c024ceeed..7b7a1fc046a 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -35,6 +35,7 @@ struct brw_device_info
    bool is_baytrail;
    bool is_haswell;
    bool is_cherryview;
+   bool is_broxton;
 
    bool has_hiz_and_separate_stencil;
    bool must_use_separate_stencil;

From 77a78c65f80323059d892c501ca551ccf324b17d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 25 Jun 2015 00:56:32 +0200
Subject: [PATCH 0029/1208] softpipe,llvmpipe: fix PIPE_SHADER_CAP_MAX_INPUTS
 value

PIPE_MAX_SHADER_INPUTS was recently bumped to 80 because of tessellation.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91099
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91101

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_limits.h | 2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index db503514881..2851fd10b04 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -100,7 +100,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
       return LP_MAX_TGSI_NESTING;
    case PIPE_SHADER_CAP_MAX_INPUTS:
-      return PIPE_MAX_SHADER_INPUTS;
+      return 32;
    case PIPE_SHADER_CAP_MAX_OUTPUTS:
       return 32;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 208640cfd46..e8ee2565831 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -213,7 +213,7 @@ struct tgsi_sampler
  * input register files, this is the stride between two 1D
  * arrays.
  */
-#define TGSI_EXEC_MAX_INPUT_ATTRIBS PIPE_MAX_SHADER_INPUTS
+#define TGSI_EXEC_MAX_INPUT_ATTRIBS 32
 
 /* The maximum number of bytes per constant buffer.
  */

From 6026f7e8fb993a34f3e2ad1638d7a842a5cefd80 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 11 Jun 2015 01:59:44 -0700
Subject: [PATCH 0030/1208] nir: Recognize max(min(a, 1.0), 0.0) as fsat(a).

We already recognize min(max(a, 0.0), 1.0) as a saturate, but neglected
this variant (which is also handled by the GLSL IR pass).

shader-db results on Broadwell:
total instructions in shared programs: 7363046 -> 7362788 (-0.00%)
instructions in affected programs:     11928 -> 11670 (-2.16%)
helped:                                64
HURT:                                  0

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/nir/nir_opt_algebraic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index eace791f5b0..3068445cbfe 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -101,6 +101,7 @@ optimizations = [
    (('umin', a, a), a),
    (('umax', a, a), a),
    (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+   (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),

From c97105ee12e54ab893351ebbda8c2348c899adde Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 24 Jun 2015 00:04:11 -0700
Subject: [PATCH 0031/1208] i965: Drop brw->depthstencil.stencil_offset from
 gen8_depth_state.c.

This is always 0 - only brw_workaround_depthstencil_alignment ever sets
it, and that doesn't run on Gen6+.  My initial Broadwell depth state
commit had this mistake.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/drivers/dri/i965/gen8_depth_state.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 81447f8d0b5..bc05a310544 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -41,7 +41,6 @@ emit_depth_packets(struct brw_context *brw,
                    bool depth_writable,
                    struct intel_mipmap_tree *stencil_mt,
                    bool stencil_writable,
-                   uint32_t stencil_offset,
                    bool hiz,
                    uint32_t width,
                    uint32_t height,
@@ -127,8 +126,7 @@ emit_depth_packets(struct brw_context *brw,
       OUT_BATCH(HSW_STENCIL_ENABLED | mocs_wb << 22 |
                 (2 * stencil_mt->pitch - 1));
       OUT_RELOC64(stencil_mt->bo,
-                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  stencil_offset);
+                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
       OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0);
       ADVANCE_BATCH();
    }
@@ -220,7 +218,6 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
    emit_depth_packets(brw, depth_mt, brw_depthbuffer_format(brw), surftype,
                       ctx->Depth.Mask != 0,
                       stencil_mt, ctx->Stencil._WriteEnabled,
-                      brw->depthstencil.stencil_offset,
                       hiz, width, height, depth, lod, min_array_element);
 }
 
@@ -439,7 +436,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
                       brw_depth_format(brw, mt->format),
                       BRW_SURFACE_2D,
                       true, /* depth writes */
-                      NULL, false, 0, /* no stencil for now */
+                      NULL, false, /* no stencil for now */
                       true, /* hiz */
                       surface_width,
                       surface_height,

From 101a73846b48ebac8e2386a25b24659f013c66a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 25 Jun 2015 14:58:37 +0200
Subject: [PATCH 0032/1208] radeonsi: don't fail in
 si_shader_io_get_unique_index

Trivial. Picked from my tessellation branch.
---
 src/gallium/drivers/radeonsi/si_shader.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 47e5f96cbed..a293ef36fbb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -133,8 +133,12 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 		return 4 + index;
 
 	default:
-		assert(0);
-		return 63;
+		/* Don't fail here. The result of this function is only used
+		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
+		 * occur, but this function is called for all vertex shaders
+		 * before it's known whether LS will be compiled or not.
+		 */
+		return 0;
 	}
 }
 

From c1151b18f2dce7c6f238f057e9c4fa8d912ce6b5 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Wed, 24 Jun 2015 20:07:54 -0700
Subject: [PATCH 0033/1208] i965/skl: Use more compact hiz dimensions

gen8 had some special restrictions which don't seem to carry over to gen9.
Quoting the spec for SKL:
"The Z_Height and Z_Width values must equal those present in
3DSTATE_DEPTH_BUFFER incremented by one."

This fixes nothing in piglit (and regresses nothing).

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 6aa969a4930..31386b99656 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -1456,21 +1456,23 @@ intel_gen7_hiz_buf_create(struct brw_context *brw,
    /* Gen7 PRM Volume 2, Part 1, 11.5.3 "Hierarchical Depth Buffer" documents
     * adjustments required for Z_Height and Z_Width based on multisampling.
     */
-   switch (mt->num_samples) {
-   case 0:
-   case 1:
-      break;
-   case 2:
-   case 4:
-      z_width *= 2;
-      z_height *= 2;
-      break;
-   case 8:
-      z_width *= 4;
-      z_height *= 2;
-      break;
-   default:
-      unreachable("unsupported sample count");
+   if (brw->gen < 9) {
+      switch (mt->num_samples) {
+      case 0:
+      case 1:
+         break;
+      case 2:
+      case 4:
+         z_width *= 2;
+         z_height *= 2;
+         break;
+      case 8:
+         z_width *= 4;
+         z_height *= 2;
+         break;
+      default:
+         unreachable("unsupported sample count");
+      }
    }
 
    const unsigned vertical_align = 8; /* 'j' in the docs */

From 316206ee9ea06419c9a2ea6fe48d66a0b805319d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 25 Jun 2015 08:08:27 -0700
Subject: [PATCH 0034/1208] i965/vec4_live_variables: Do liveness analysis
 bottom-to-top

From Muchnick's Advanced Compiler Design and Implementation:

"To determine which variables are live at each point in a flowgraph, we
perform a backward data-flow analysis"

Previously, we were walking the blocks forwards and updating the livein and
then the liveout.  However, the livein calculation depends on the liveout
and the liveout depends on the successor blocks.  The net result is that it
takes one full iteration to go from liveout to livein and then another
full iteration to propagate to the predecessors.  This works out to an
O(n^2) computation where n is the number of blocks.  If we run things in
the other order, it's O(nl) where l is the maximum loop depth which is
practically bounded by 3.

In b2c6ba0c4b21391dc35018e1c8c4f7f7d8952bea, we made this same change in
the FS backend to great effect.  Might as well keep it consistent and make
the same change for vec4.  Also, this took the time to run the test:

ES31-CTS.arrays_of_arrays.InteractionFunctionCalls1

from 6:49.62 to 3:31.40 on Timothy Arceri's machine.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../dri/i965/brw_vec4_live_variables.cpp      | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index 95b9d9017e2..29b4a53418a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -133,27 +133,9 @@ vec4_live_variables::compute_live_variables()
    while (cont) {
       cont = false;
 
-      foreach_block (block, cfg) {
+      foreach_block_reverse (block, cfg) {
          struct block_data *bd = &block_data[block->num];
 
-	 /* Update livein */
-	 for (int i = 0; i < bitset_words; i++) {
-            BITSET_WORD new_livein = (bd->use[i] |
-                                      (bd->liveout[i] &
-                                       ~bd->def[i]));
-            if (new_livein & ~bd->livein[i]) {
-               bd->livein[i] |= new_livein;
-               cont = true;
-	    }
-	 }
-         BITSET_WORD new_livein = (bd->flag_use[0] |
-                                   (bd->flag_liveout[0] &
-                                    ~bd->flag_def[0]));
-         if (new_livein & ~bd->flag_livein[0]) {
-            bd->flag_livein[0] |= new_livein;
-            cont = true;
-         }
-
 	 /* Update liveout */
 	 foreach_list_typed(bblock_link, child_link, link, &block->children) {
             struct block_data *child_bd = &block_data[child_link->block->num];
@@ -173,6 +155,24 @@ vec4_live_variables::compute_live_variables()
                cont = true;
             }
 	 }
+
+         /* Update livein */
+         for (int i = 0; i < bitset_words; i++) {
+            BITSET_WORD new_livein = (bd->use[i] |
+                                      (bd->liveout[i] &
+                                       ~bd->def[i]));
+            if (new_livein & ~bd->livein[i]) {
+               bd->livein[i] |= new_livein;
+               cont = true;
+            }
+         }
+         BITSET_WORD new_livein = (bd->flag_use[0] |
+                                   (bd->flag_liveout[0] &
+                                    ~bd->flag_def[0]));
+         if (new_livein & ~bd->flag_livein[0]) {
+            bd->flag_livein[0] |= new_livein;
+            cont = true;
+         }
       }
    }
 }

From fbba25bba017b3dde5f6613698004b0086bdea00 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 23 Jun 2015 08:42:14 +0200
Subject: [PATCH 0035/1208] mesa: remove unnecessary checks in
 _mesa_readpixels_needs_slow_path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

readpixels_can_use_memcpy will later call _mesa_format_matches_format_and_type
which does much tighter checks than these to decide if we can use
memcpy for readpixels.

Also, the checks do not seem to be extensive enough anyway, since we are
checking for signed/unsigned conversion only when the framebuffer has integers,
but the same checks could be done for other types anyway, since as long as
there is a signed/unsigned conversion we can't memcpy.

No regressions observed on i965/llvmpipe.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/readpix.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index a3357cd6419..e256695480a 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -128,7 +128,6 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 {
    struct gl_renderbuffer *rb =
          _mesa_get_read_renderbuffer_for_format(ctx, format);
-   GLenum srcType;
 
    assert(rb);
 
@@ -153,21 +152,6 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
          return GL_TRUE;
       }
 
-      /* Conversion between signed and unsigned integers needs masking
-       * (it isn't just memcpy). */
-      srcType = _mesa_get_format_datatype(rb->Format);
-
-      if ((srcType == GL_INT &&
-           (type == GL_UNSIGNED_INT ||
-            type == GL_UNSIGNED_SHORT ||
-            type == GL_UNSIGNED_BYTE)) ||
-          (srcType == GL_UNSIGNED_INT &&
-           (type == GL_INT ||
-            type == GL_SHORT ||
-            type == GL_BYTE))) {
-         return GL_TRUE;
-      }
-
       /* And finally, see if there are any transfer ops. */
       return get_readpixels_transfer_ops(ctx, rb->Format, format, type,
                                          uses_blit) != 0;

From 36d107e92cc4c1d2b60e0017dbe998af3a2e8b75 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Tue, 23 Jun 2015 23:59:31 -0600
Subject: [PATCH 0036/1208] ilo: introduce ilo_vma

This cleans up the code a bit and makes ilo_state_vector_resource_renamed()
simpler and more robust.  It also allows a single bo to back mulitple VMAs.
---
 src/gallium/drivers/ilo/Makefile.sources      |   5 +-
 src/gallium/drivers/ilo/core/ilo_buffer.h     |   5 -
 .../drivers/ilo/core/ilo_builder_3d_bottom.h  |  43 ++++---
 .../drivers/ilo/core/ilo_builder_3d_top.h     |  65 ++++++-----
 src/gallium/drivers/ilo/core/ilo_image.h      |   7 --
 src/gallium/drivers/ilo/core/ilo_state_sol.c  |  36 +++---
 src/gallium/drivers/ilo/core/ilo_state_sol.h  |  22 ++--
 .../drivers/ilo/core/ilo_state_surface.c      |  25 +++-
 .../drivers/ilo/core/ilo_state_surface.h      |  32 +++---
 src/gallium/drivers/ilo/core/ilo_state_vf.c   |  34 +++---
 src/gallium/drivers/ilo/core/ilo_state_vf.h   |  16 +--
 src/gallium/drivers/ilo/core/ilo_state_zs.c   |  52 ++++++---
 src/gallium/drivers/ilo/core/ilo_state_zs.h   |  25 ++--
 src/gallium/drivers/ilo/core/ilo_vma.h        |  73 ++++++++++++
 src/gallium/drivers/ilo/ilo_blitter_blt.c     |  41 +++----
 src/gallium/drivers/ilo/ilo_draw.c            |  11 +-
 src/gallium/drivers/ilo/ilo_render_surface.c  |  27 ++---
 src/gallium/drivers/ilo/ilo_resource.c        |  37 +++---
 src/gallium/drivers/ilo/ilo_resource.h        |  21 ++--
 src/gallium/drivers/ilo/ilo_state.c           | 108 +++++++-----------
 src/gallium/drivers/ilo/ilo_transfer.c        |  51 ++++++---
 21 files changed, 420 insertions(+), 316 deletions(-)
 create mode 100644 src/gallium/drivers/ilo/core/ilo_vma.h

diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources
index e1bbb9a0781..35d76bd4948 100644
--- a/src/gallium/drivers/ilo/Makefile.sources
+++ b/src/gallium/drivers/ilo/Makefile.sources
@@ -43,6 +43,7 @@ C_SOURCES := \
 	core/ilo_state_viewport.h \
 	core/ilo_state_zs.c \
 	core/ilo_state_zs.h \
+	core/ilo_vma.h \
 	core/intel_winsys.h \
 	ilo_blit.c \
 	ilo_blit.h \
@@ -65,8 +66,6 @@ C_SOURCES := \
 	ilo_public.h \
 	ilo_query.c \
 	ilo_query.h \
-	ilo_resource.c \
-	ilo_resource.h \
 	ilo_render.c \
 	ilo_render.h \
 	ilo_render_gen.h \
@@ -76,6 +75,8 @@ C_SOURCES := \
 	ilo_render_gen8.c \
 	ilo_render_media.c \
 	ilo_render_surface.c \
+	ilo_resource.c \
+	ilo_resource.h \
 	ilo_screen.c \
 	ilo_screen.h \
 	ilo_shader.c \
diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_buffer.h
index ca3c61ff890..f2fb63064c0 100644
--- a/src/gallium/drivers/ilo/core/ilo_buffer.h
+++ b/src/gallium/drivers/ilo/core/ilo_buffer.h
@@ -28,17 +28,12 @@
 #ifndef ILO_BUFFER_H
 #define ILO_BUFFER_H
 
-#include "intel_winsys.h"
-
 #include "ilo_core.h"
 #include "ilo_debug.h"
 #include "ilo_dev.h"
 
 struct ilo_buffer {
    unsigned bo_size;
-
-   /* managed by users */
-   struct intel_bo *bo;
 };
 
 static inline void
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
index 6d9e3699125..5efe9da2d22 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
@@ -39,6 +39,7 @@
 #include "ilo_state_shader.h"
 #include "ilo_state_viewport.h"
 #include "ilo_state_zs.h"
+#include "ilo_vma.h"
 #include "ilo_builder.h"
 #include "ilo_builder_3d_top.h"
 
@@ -674,9 +675,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT;
 
-      if (zs->depth_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo,
-               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->z_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->z_vma->bo,
+               zs->z_vma->bo_offset + zs->depth[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->depth[0];
@@ -691,9 +693,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
       else
          dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT;
 
-      if (zs->depth_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo,
-               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->z_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->z_vma->bo,
+               zs->z_vma->bo_offset + zs->depth[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
@@ -724,9 +727,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
 
-      if (zs->stencil_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo,
-               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->s_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->s_vma->bo,
+               zs->s_vma->bo_offset + zs->stencil[1],
+               (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->stencil[0];
@@ -734,9 +738,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT;
 
-      if (zs->stencil_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo,
-               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->s_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->s_vma->bo,
+               zs->s_vma->bo_offset + zs->stencil[1],
+               (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
@@ -767,9 +772,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
 
-      if (zs->hiz_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo,
-               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->hiz_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_vma->bo,
+               zs->hiz_vma->bo_offset + zs->hiz[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->hiz[0];
@@ -777,9 +783,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT;
 
-      if (zs->hiz_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo,
-               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->hiz_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_vma->bo,
+               zs->hiz_vma->bo_offset + zs->hiz[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
index 8d30095e6f6..6e94fb25f1f 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
@@ -39,6 +39,7 @@
 #include "ilo_state_surface.h"
 #include "ilo_state_urb.h"
 #include "ilo_state_vf.h"
+#include "ilo_vma.h"
 #include "ilo_builder.h"
 
 static inline void
@@ -318,8 +319,10 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
       dw[3] = 0;
 
       if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         if (b->need_bo)
-            ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0);
+         if (b->vma) {
+            ilo_builder_batch_reloc64(builder, pos + 1, b->vma->bo,
+                  b->vma->bo_offset + b->vb[1], 0);
+         }
 
          dw[3] |= b->vb[2];
       } else {
@@ -331,9 +334,11 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
             dw[3] |= vf->user_instancing[elem][1];
          }
 
-         if (b->need_bo) {
-            ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0);
-            ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0);
+         if (b->vma) {
+            ilo_builder_batch_reloc(builder, pos + 1, b->vma->bo,
+                  b->vma->bo_offset + b->vb[1], 0);
+            ilo_builder_batch_reloc(builder, pos + 2, b->vma->bo,
+                  b->vma->bo_offset + b->vb[2], 0);
          }
       }
 
@@ -429,9 +434,11 @@ gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = dw0;
-   if (ib->need_bo) {
-      ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0);
-      ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0);
+   if (ib->vma) {
+      ilo_builder_batch_reloc(builder, pos + 1, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[1], 0);
+      ilo_builder_batch_reloc(builder, pos + 2, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[2], 0);
    } else {
       dw[1] = 0;
       dw[2] = 0;
@@ -456,8 +463,9 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
    dw[1] = ib->ib[0] |
            builder->mocs << GEN8_IB_DW1_MOCS__SHIFT;
 
-   if (ib->need_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0);
+   if (ib->vma) {
+      ilo_builder_batch_reloc64(builder, pos + 2, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[1], 0);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -801,11 +809,11 @@ gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
            builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT |
            sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT;
 
-   if (sb->need_bo) {
-      ilo_builder_batch_reloc(builder, pos + 2, sb->bo,
-            sb->so_buf[0], INTEL_RELOC_WRITE);
-      ilo_builder_batch_reloc(builder, pos + 3, sb->bo,
-            sb->so_buf[1], INTEL_RELOC_WRITE);
+   if (sb->vma) {
+      ilo_builder_batch_reloc(builder, pos + 2, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[0], INTEL_RELOC_WRITE);
+      ilo_builder_batch_reloc(builder, pos + 3, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -832,9 +840,9 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
            buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
            builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
 
-   if (sb->need_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 2, sb->bo,
-            sb->so_buf[1], INTEL_RELOC_WRITE);
+   if (sb->vma) {
+      ilo_builder_batch_reloc64(builder, pos + 2, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -842,9 +850,10 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
 
    dw[4] = sb->so_buf[2];
 
-   if (sb->need_write_offset_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo,
-            sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE);
+   if (sb->write_offset_vma) {
+      ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_vma->bo,
+            sb->write_offset_vma->bo_offset + sizeof(uint32_t) * buffer,
+            INTEL_RELOC_WRITE);
    } else {
       dw[5] = 0;
       dw[6] = 0;
@@ -1254,14 +1263,15 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
       memcpy(dw, surf->surface, state_len << 2);
 
-      if (surf->bo) {
+      if (surf->vma) {
          const uint32_t mocs = (surf->scanout) ?
             (GEN8_MOCS_MT_PTE | GEN8_MOCS_CT_L3) : builder->mocs;
 
          dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT;
 
-         ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo,
-               surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
+         ilo_builder_surface_reloc64(builder, state_offset, 8, surf->vma->bo,
+               surf->vma->bo_offset + surf->surface[8],
+               (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       state_align = 32;
@@ -1271,15 +1281,16 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
       memcpy(dw, surf->surface, state_len << 2);
 
-      if (surf->bo) {
+      if (surf->vma) {
          /*
           * For scanouts, we should not enable caching in LLC.  Since we only
           * enable that on Gen8+, we are fine here.
           */
          dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT;
 
-         ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo,
-               surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
+         ilo_builder_surface_reloc(builder, state_offset, 1, surf->vma->bo,
+               surf->vma->bo_offset + surf->surface[1],
+               (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index af15e856028..77747ed7492 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -29,7 +29,6 @@
 #define ILO_IMAGE_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
@@ -141,13 +140,7 @@ struct ilo_image {
       unsigned walk_layer_height;
       unsigned bo_stride;
       unsigned bo_height;
-
-      /* managed by users */
-      struct intel_bo *bo;
    } aux;
-
-   /* managed by users */
-   struct intel_bo *bo;
 };
 
 struct pipe_resource;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c
index 38c0b719ab3..dd1ef5e7887 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c
@@ -26,7 +26,7 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
+#include "ilo_vma.h"
 #include "ilo_state_sol.h"
 
 static bool
@@ -270,9 +270,6 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 7, 8);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
-
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 208:
     *
@@ -281,9 +278,17 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev,
     */
    assert(info->offset % 4 == 0);
 
+   if (info->vma) {
+      assert(info->vma->vm_alignment % 4 == 0);
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
+   }
+
    /* Gen8+ only */
-   if (info->write_offset_load || info->write_offset_save)
-      assert(ilo_dev_gen(dev) >= ILO_GEN(8));
+   if (info->write_offset_load || info->write_offset_save) {
+      assert(ilo_dev_gen(dev) >= ILO_GEN(8) && info->write_offset_vma);
+      assert(info->write_offset_offset + sizeof(uint32_t) <=
+            info->write_offset_vma->vm_size);
+   }
 
    /*
     * From the Broadwell PRM, volume 2b, page 206:
@@ -304,25 +309,15 @@ static uint32_t
 sol_buffer_get_gen6_size(const struct ilo_dev *dev,
                          const struct ilo_state_sol_buffer_info *info)
 {
-   uint32_t size;
-
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!info->buf)
-      return 0;
-
-   size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
-
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 208:
     *
     *     "(Surface End Address) This field specifies the ending DWord
     *      address..."
     */
-   size &= ~3;
-
-   return size;
+   return (info->vma) ? info->size & ~3 : 0;
 }
 
 static bool
@@ -359,7 +354,7 @@ sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
 
    dw1 = 0;
 
-   if (info->buf)
+   if (info->vma)
       dw1 |= GEN8_SO_BUF_DW1_ENABLE;
    if (info->write_offset_load)
       dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE;
@@ -443,9 +438,8 @@ ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
    else
       ret &= sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info);
 
-   sb->need_bo = (info->size > 0);
-   sb->need_write_offset_bo = (info->write_offset_save ||
-         (info->write_offset_load && !info->write_offset_imm_enable));
+   sb->vma = info->vma;
+   sb->write_offset_vma = info->write_offset_vma;
 
    assert(ret);
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h
index 2513fcb4979..f0968b39e27 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h
@@ -107,17 +107,17 @@ struct ilo_state_sol {
    uint8_t decl_count;
 };
 
-struct ilo_buffer;
+struct ilo_vma;
 
 struct ilo_state_sol_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
-   /*
-    * Gen8+ only.  When enabled, require a write offset bo of at least
-    * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes
-    */
+   /* Gen8+ only; at least sizeof(uint32_t) bytes */
+   const struct ilo_vma *write_offset_vma;
+   uint32_t write_offset_offset;
+
    bool write_offset_load;
    bool write_offset_save;
 
@@ -126,14 +126,10 @@ struct ilo_state_sol_buffer_info {
 };
 
 struct ilo_state_sol_buffer {
-   uint32_t so_buf[4];
+   uint32_t so_buf[5];
 
-   bool need_bo;
-   bool need_write_offset_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
-   struct intel_bo *write_offset_bo;
+   const struct ilo_vma *vma;
+   const struct ilo_vma *write_offset_vma;
 };
 
 static inline size_t
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index 5be9f8f6270..402bbf4b52a 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -26,8 +26,8 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
 #include "ilo_image.h"
+#include "ilo_vma.h"
 #include "ilo_state_surface.h"
 
 static bool
@@ -104,7 +104,7 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev,
    if (ilo_dev_gen(dev) >= ILO_GEN(7))
       assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB);
 
-   if (info->offset + info->size > info->buf->bo_size) {
+   if (info->offset + info->size > info->vma->vm_size) {
       ilo_warn("invalid buffer range\n");
       return false;
    }
@@ -155,7 +155,8 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev,
    if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) {
       assert(info->struct_size % info->format_size == 0);
 
-      if (info->offset % info->struct_size) {
+      if (info->offset % info->struct_size ||
+          info->vma->vm_alignment % info->struct_size) {
          ilo_warn("bad buffer offset\n");
          return false;
       }
@@ -177,7 +178,7 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev,
        * Nothing is said about Untyped* messages, but I guess they require the
        * base address to be DWord aligned.
        */
-      if (info->offset % 4) {
+      if (info->offset % 4 || info->vma->vm_alignment % 4) {
          ilo_warn("bad RAW buffer offset\n");
          return false;
       }
@@ -408,6 +409,17 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
       break;
    }
 
+   assert(info->img && info->vma);
+
+   if (info->img->tiling != GEN6_TILING_NONE)
+      assert(info->vma->vm_alignment % 4096 == 0);
+
+   if (info->aux_vma) {
+      assert(ilo_image_can_enable_aux(info->img, info->level_base));
+      /* always tiled */
+      assert(info->aux_vma->vm_alignment % 4096 == 0);
+   }
+
    /*
     * From the Sandy Bridge PRM, volume 4 part 1, page 78:
     *
@@ -1107,6 +1119,7 @@ ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev);
 
+   surf->vma = NULL;
    surf->type = GEN6_SURFTYPE_NULL;
    surf->readonly = true;
 
@@ -1129,6 +1142,7 @@ ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info);
 
+   surf->vma = info->vma;
    surf->readonly = info->readonly;
 
    assert(ret);
@@ -1150,6 +1164,9 @@ ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info);
 
+   surf->vma = info->vma;
+   surf->aux_vma = info->aux_vma;
+
    surf->is_integer = info->is_integer;
    surf->readonly = info->readonly;
    surf->scanout = info->img->scanout;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
index 9c025428d50..b9921134a1e 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -29,14 +29,10 @@
 #define ILO_STATE_SURFACE_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
-struct ilo_buffer;
-struct ilo_image;
-
 enum ilo_state_surface_access {
    ILO_STATE_SURFACE_ACCESS_SAMPLER,      /* sampling engine surfaces */
    ILO_STATE_SURFACE_ACCESS_DP_RENDER,    /* render target surfaces */
@@ -46,8 +42,13 @@ enum ilo_state_surface_access {
    ILO_STATE_SURFACE_ACCESS_DP_SVB,
 };
 
+struct ilo_vma;
+struct ilo_image;
+
 struct ilo_state_surface_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
+   uint32_t offset;
+   uint32_t size;
 
    enum ilo_state_surface_access access;
 
@@ -56,13 +57,17 @@ struct ilo_state_surface_buffer_info {
 
    bool readonly;
    uint16_t struct_size;
-
-   uint32_t offset;
-   uint32_t size;
 };
 
 struct ilo_state_surface_image_info {
    const struct ilo_image *img;
+   uint8_t level_base;
+   uint8_t level_count;
+   uint16_t slice_base;
+   uint16_t slice_count;
+
+   const struct ilo_vma *vma;
+   const struct ilo_vma *aux_vma;
 
    enum ilo_state_surface_access access;
 
@@ -72,16 +77,14 @@ struct ilo_state_surface_image_info {
    bool readonly;
    bool is_cube_map;
    bool is_array;
-
-   uint8_t level_base;
-   uint8_t level_count;
-   uint16_t slice_base;
-   uint16_t slice_count;
 };
 
 struct ilo_state_surface {
    uint32_t surface[13];
 
+   const struct ilo_vma *vma;
+   const struct ilo_vma *aux_vma;
+
    enum gen_surface_type type;
    uint8_t min_lod;
    uint8_t mip_count;
@@ -89,9 +92,6 @@ struct ilo_state_surface {
 
    bool readonly;
    bool scanout;
-
-   /* managed by users */
-   struct intel_bo *bo;
 };
 
 bool
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
index ddc75428ed7..2dd72276e63 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -26,7 +26,7 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
+#include "ilo_vma.h"
 #include "ilo_state_vf.h"
 
 static bool
@@ -479,8 +479,8 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
+   if (info->vma)
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
 
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 86:
@@ -500,6 +500,9 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev,
     *      aligned address, and BufferPitch must be a multiple of 64-bits."
     */
    if (info->cv_has_double) {
+      if (info->vma)
+         assert(info->vma->vm_alignment % 8 == 0);
+
       assert(info->stride % 8 == 0);
       assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0);
    }
@@ -512,12 +515,7 @@ vertex_buffer_get_gen6_size(const struct ilo_dev *dev,
                             const struct ilo_state_vertex_buffer_info *info)
 {
    ILO_DEV_ASSERT(dev, 6, 8);
-
-   if (!info->buf)
-      return 0;
-
-   return (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
+   return (info->vma) ? info->size : 0;
 }
 
 static bool
@@ -537,7 +535,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
 
    if (ilo_dev_gen(dev) >= ILO_GEN(7))
       dw0 |= GEN7_VB_DW0_ADDR_MODIFIED;
-   if (!info->buf)
+   if (!info->vma)
       dw0 |= GEN6_VB_DW0_IS_NULL;
 
    STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3);
@@ -551,7 +549,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
       vb->vb[2] = (size) ? info->offset + size - 1 : 0;
    }
 
-   vb->need_bo = (info->buf != NULL);
+   vb->vma = info->vma;
 
    return true;
 }
@@ -586,8 +584,10 @@ index_buffer_validate_gen6(const struct ilo_dev *dev,
     */
    assert(info->offset % format_size == 0);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
+   if (info->vma) {
+      assert(info->vma->vm_alignment % format_size == 0);
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
+   }
 
    return true;
 }
@@ -600,12 +600,10 @@ index_buffer_get_gen6_size(const struct ilo_dev *dev,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!info->buf)
+   if (!info->vma)
       return 0;
 
-   size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
-
+   size = info->size;
    if (ilo_dev_gen(dev) < ILO_GEN(8)) {
       const uint32_t format_size = get_index_format_size(info->format);
       size -= (size % format_size);
@@ -638,7 +636,7 @@ index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib,
       ib->ib[2] = (size) ? info->offset + size - 1 : 0;
    }
 
-   ib->need_bo = (info->buf != NULL);
+   ib->vma = info->vma;
 
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h
index f15c63a248a..30734476435 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h
@@ -126,10 +126,10 @@ struct ilo_state_vf_delta {
    uint32_t dirty;
 };
 
-struct ilo_buffer;
+struct ilo_vma;
 
 struct ilo_state_vertex_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
@@ -143,14 +143,11 @@ struct ilo_state_vertex_buffer_info {
 struct ilo_state_vertex_buffer {
    uint32_t vb[3];
 
-   bool need_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
+   const struct ilo_vma *vma;
 };
 
 struct ilo_state_index_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
@@ -160,10 +157,7 @@ struct ilo_state_index_buffer_info {
 struct ilo_state_index_buffer {
    uint32_t ib[3];
 
-   bool need_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
+   const struct ilo_vma *vma;
 };
 
 static inline size_t
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
index 901fedb5599..7b82f1acf6f 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -25,10 +25,9 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#include "intel_winsys.h"
-
 #include "ilo_debug.h"
 #include "ilo_image.h"
+#include "ilo_vma.h"
 #include "ilo_state_zs.h"
 
 static bool
@@ -128,6 +127,24 @@ zs_validate_gen6(const struct ilo_dev *dev,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
+   assert(!info->z_img == !info->z_vma);
+   assert(!info->s_img == !info->s_vma);
+
+   /* all tiled */
+   if (info->z_img) {
+      assert(info->z_img->tiling == GEN6_TILING_Y);
+      assert(info->z_vma->vm_alignment % 4096 == 0);
+   }
+   if (info->s_img) {
+      assert(info->s_img->tiling == GEN8_TILING_W);
+      assert(info->s_vma->vm_alignment % 4096 == 0);
+   }
+   if (info->hiz_vma) {
+      assert(info->z_img &&
+             ilo_image_can_enable_aux(info->z_img, info->level));
+      assert(info->z_vma->vm_alignment % 4096 == 0);
+   }
+
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 315:
     *
@@ -146,11 +163,6 @@ zs_validate_gen6(const struct ilo_dev *dev,
    assert(info->level < img->level_count);
    assert(img->bo_stride);
 
-   if (info->hiz_enable) {
-      assert(info->z_img &&
-             ilo_image_can_enable_aux(info->z_img, info->level));
-   }
-
    if (info->is_cube_map) {
       assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
 
@@ -162,11 +174,6 @@ zs_validate_gen6(const struct ilo_dev *dev,
       assert(img->width0 == img->height0);
    }
 
-   if (info->z_img)
-      assert(info->z_img->tiling == GEN6_TILING_Y);
-   if (info->s_img)
-      assert(info->s_img->tiling == GEN8_TILING_W);
-
    return true;
 }
 
@@ -274,7 +281,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev,
    w = img->width0;
    h = img->height0;
 
-   if (info->hiz_enable) {
+   if (info->hiz_vma) {
       uint16_t align_w, align_h;
 
       get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h);
@@ -439,7 +446,7 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
     *      to the same value (enabled or disabled) as Hierarchical Depth
     *      Buffer Enable."
     */
-   if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
+   if (!info->hiz_vma && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
       format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
 
    /* info->z_readonly and info->s_readonly are ignored on Gen6 */
@@ -450,7 +457,7 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    if (info->z_img)
       dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT;
 
-   if (info->hiz_enable || !info->z_img) {
+   if (info->hiz_vma || !info->z_img) {
       dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
              GEN6_DEPTH_DW1_SEPARATE_STENCIL;
    }
@@ -508,7 +515,7 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    if (info->z_img) {
       if (!info->z_readonly)
          dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
-      if (info->hiz_enable)
+      if (info->hiz_vma)
          dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
 
       dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT;
@@ -683,11 +690,15 @@ ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev,
    else
       ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev);
 
-   if (info->z_img && info->hiz_enable)
+   if (info->z_img && info->hiz_vma)
       ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info);
    else
       ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
 
+   zs->z_vma = info->z_vma;
+   zs->s_vma = info->s_vma;
+   zs->hiz_vma = info->hiz_vma;
+
    zs->z_readonly = info->z_readonly;
    zs->s_readonly = info->s_readonly;
 
@@ -720,8 +731,11 @@ ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
     */
    assert(ilo_dev_gen(dev) >= ILO_GEN(7));
 
-   zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
-   zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+   if (zs->hiz_vma) {
+      zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
+      zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+      zs->hiz_vma = NULL;
+   }
 
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
index 98212daf74f..6f32b7e2efe 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -29,28 +29,30 @@
 #define ILO_STATE_ZS_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
+struct ilo_vma;
 struct ilo_image;
 
 struct ilo_state_zs_info {
-   /* both are optional */
+   /* both optional */
    const struct ilo_image *z_img;
    const struct ilo_image *s_img;
+   uint8_t level;
+   uint16_t slice_base;
+   uint16_t slice_count;
+
+   const struct ilo_vma *z_vma;
+   const struct ilo_vma *s_vma;
+   const struct ilo_vma *hiz_vma;
 
    /* ignored prior to Gen7 */
    bool z_readonly;
    bool s_readonly;
 
-   bool hiz_enable;
    bool is_cube_map;
-
-   uint8_t level;
-   uint16_t slice_base;
-   uint16_t slice_count;
 };
 
 struct ilo_state_zs {
@@ -58,16 +60,15 @@ struct ilo_state_zs {
    uint32_t stencil[3];
    uint32_t hiz[3];
 
+   const struct ilo_vma *z_vma;
+   const struct ilo_vma *s_vma;
+   const struct ilo_vma *hiz_vma;
+
    /* TODO move this to ilo_image */
    enum gen_depth_format depth_format;
 
    bool z_readonly;
    bool s_readonly;
-
-   /* managed by users */
-   struct intel_bo *depth_bo;
-   struct intel_bo *stencil_bo;
-   struct intel_bo *hiz_bo;
 };
 
 bool
diff --git a/src/gallium/drivers/ilo/core/ilo_vma.h b/src/gallium/drivers/ilo/core/ilo_vma.h
new file mode 100644
index 00000000000..ad2a1d4b33e
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_vma.h
@@ -0,0 +1,73 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_VMA_H
+#define ILO_VMA_H
+
+#include "ilo_core.h"
+#include "ilo_debug.h"
+#include "ilo_dev.h"
+
+struct intel_bo;
+
+/**
+ * A virtual memory area.
+ */
+struct ilo_vma {
+   /* address space */
+   uint32_t vm_size;
+   uint32_t vm_alignment;
+
+   /* backing storage */
+   struct intel_bo *bo;
+   uint32_t bo_offset;
+};
+
+static inline bool
+ilo_vma_init(struct ilo_vma *vma, const struct ilo_dev *dev,
+             uint32_t size, uint32_t alignment)
+{
+   assert(ilo_is_zeroed(vma, sizeof(*vma)));
+   assert(size && alignment);
+
+   vma->vm_alignment = alignment;
+   vma->vm_size = size;
+
+   return true;
+}
+
+static inline void
+ilo_vma_set_bo(struct ilo_vma *vma, const struct ilo_dev *dev,
+               struct intel_bo *bo, uint32_t offset)
+{
+   assert(offset % vma->vm_alignment == 0);
+
+   vma->bo = bo;
+   vma->bo_offset = offset;
+}
+
+#endif /* ILO_VMA_H */
diff --git a/src/gallium/drivers/ilo/ilo_blitter_blt.c b/src/gallium/drivers/ilo/ilo_blitter_blt.c
index d55dc35e360..52b4b25d827 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_blt.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_blt.c
@@ -127,7 +127,7 @@ ilo_blitter_blt_end(struct ilo_blitter *blitter, uint32_t swctrl)
 
 static bool
 buf_clear_region(struct ilo_blitter *blitter,
-                 struct ilo_buffer *buf, unsigned offset,
+                 struct ilo_buffer_resource *buf, unsigned offset,
                  uint32_t val, unsigned size,
                  enum gen6_blt_mask value_mask,
                  enum gen6_blt_mask write_mask)
@@ -140,8 +140,8 @@ buf_clear_region(struct ilo_blitter *blitter,
    if (offset % cpp || size % cpp)
       return false;
 
-   dst.bo = buf->bo;
-   dst.offset = offset;
+   dst.bo = buf->vma.bo;
+   dst.offset = buf->vma.bo_offset + offset;
 
    ilo_blitter_blt_begin(blitter, GEN6_COLOR_BLT__SIZE *
          (1 + size / 32764 / gen6_blt_max_scanlines),
@@ -179,25 +179,26 @@ buf_clear_region(struct ilo_blitter *blitter,
 
 static bool
 buf_copy_region(struct ilo_blitter *blitter,
-                struct ilo_buffer *dst_buf, unsigned dst_offset,
-                struct ilo_buffer *src_buf, unsigned src_offset,
+                struct ilo_buffer_resource *dst_buf, unsigned dst_offset,
+                struct ilo_buffer_resource *src_buf, unsigned src_offset,
                 unsigned size)
 {
    const uint8_t rop = 0xcc; /* SRCCOPY */
    struct ilo_builder *builder = &blitter->ilo->cp->builder;
    struct gen6_blt_bo dst, src;
 
-   dst.bo = dst_buf->bo;
-   dst.offset = dst_offset;
+   dst.bo = dst_buf->vma.bo;
+   dst.offset = dst_buf->vma.bo_offset + dst_offset;
    dst.pitch = 0;
 
-   src.bo = src_buf->bo;
-   src.offset = src_offset;
+   src.bo = src_buf->vma.bo;
+   src.offset = src_buf->vma.bo_offset + src_offset;
    src.pitch = 0;
 
    ilo_blitter_blt_begin(blitter, GEN6_SRC_COPY_BLT__SIZE *
          (1 + size / 32764 / gen6_blt_max_scanlines),
-         dst_buf->bo, GEN6_TILING_NONE, src_buf->bo, GEN6_TILING_NONE);
+         dst_buf->vma.bo, GEN6_TILING_NONE,
+         src_buf->vma.bo, GEN6_TILING_NONE);
 
    while (size) {
       unsigned width, height;
@@ -258,14 +259,14 @@ tex_clear_region(struct ilo_blitter *blitter,
    if (dst_box->width * cpp > gen6_blt_max_bytes_per_scanline)
       return false;
 
-   dst.bo = dst_tex->image.bo;
-   dst.offset = 0;
+   dst.bo = dst_tex->vma.bo;
+   dst.offset = dst_tex->vma.bo_offset;
    dst.pitch = dst_tex->image.bo_stride;
    dst.tiling = dst_tex->image.tiling;
 
    swctrl = ilo_blitter_blt_begin(blitter,
          GEN6_XY_COLOR_BLT__SIZE * dst_box->depth,
-         dst_tex->image.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE);
+         dst_tex->vma.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE);
 
    for (slice = 0; slice < dst_box->depth; slice++) {
       unsigned x, y;
@@ -347,13 +348,13 @@ tex_copy_region(struct ilo_blitter *blitter,
       break;
    }
 
-   dst.bo = dst_tex->image.bo;
-   dst.offset = 0;
+   dst.bo = dst_tex->vma.bo;
+   dst.offset = dst_tex->vma.bo_offset;
    dst.pitch = dst_tex->image.bo_stride;
    dst.tiling = dst_tex->image.tiling;
 
-   src.bo = src_tex->image.bo;
-   src.offset = 0;
+   src.bo = src_tex->vma.bo;
+   src.offset = src_tex->vma.bo_offset;
    src.pitch = src_tex->image.bo_stride;
    src.tiling = src_tex->image.tiling;
 
@@ -423,8 +424,8 @@ ilo_blitter_blt_copy_resource(struct ilo_blitter *blitter,
              src_box->height == 1 &&
              src_box->depth == 1);
 
-      success = buf_copy_region(blitter,
-            ilo_buffer(dst), dst_offset, ilo_buffer(src), src_offset, size);
+      success = buf_copy_region(blitter, ilo_buffer_resource(dst), dst_offset,
+            ilo_buffer_resource(src), src_offset, size);
    }
    else if (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER) {
       success = tex_copy_region(blitter,
@@ -488,7 +489,7 @@ ilo_blitter_blt_clear_rt(struct ilo_blitter *blitter,
       if (offset + size > end)
          size = end - offset;
 
-      success = buf_clear_region(blitter, ilo_buffer(rt->texture),
+      success = buf_clear_region(blitter, ilo_buffer_resource(rt->texture),
             offset, packed.ui[0], size, mask, mask);
    }
    else {
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index e8e1a4cd14c..433348d9326 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -444,6 +444,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
                          const struct pipe_draw_info *info)
 {
    const struct ilo_ib_state *ib = &ilo->state_vector.ib;
+   const struct ilo_vma *vma;
    union {
       const void *ptr;
       const uint8_t *u8;
@@ -453,10 +454,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
 
    /* we will draw with IB mapped */
    if (ib->state.buffer) {
-      u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false);
+      vma = ilo_resource_get_vma(ib->state.buffer);
+      u.ptr = intel_bo_map(vma->bo, false);
       if (u.ptr)
-         u.u8 += ib->state.offset;
+         u.u8 += vma->bo_offset + ib->state.offset;
    } else {
+      vma = NULL;
       u.ptr = ib->state.user_buffer;
    }
 
@@ -500,8 +503,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
 
 #undef DRAW_VBO_WITH_SW_RESTART
 
-   if (ib->state.buffer)
-      intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo);
+   if (vma)
+      intel_bo_unmap(vma->bo);
 }
 
 static bool
diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c
index ad053564294..3bf8646b344 100644
--- a/src/gallium/drivers/ilo/ilo_render_surface.c
+++ b/src/gallium/drivers/ilo/ilo_render_surface.c
@@ -42,14 +42,17 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder,
                       const struct pipe_stream_output_info *so_info,
                       int so_index)
 {
-   struct ilo_buffer *buf = ilo_buffer(so->buffer);
    struct ilo_state_surface_buffer_info info;
    struct ilo_state_surface surf;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
    memset(&info, 0, sizeof(info));
-   info.buf = buf;
+
+   info.vma = ilo_resource_get_vma(so->buffer);
+   info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
+   info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
+
    info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB;
 
    switch (so_info->output[so_index].num_components) {
@@ -78,12 +81,9 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder,
 
    info.struct_size =
       so_info->stride[so_info->output[so_index].output_buffer] * 4;
-   info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
-   info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
 
    memset(&surf, 0, sizeof(surf));
    ilo_state_surface_init_for_buffer(&surf, builder->dev, &info);
-   surf.bo = info.buf->bo;
 
    return gen6_SURFACE_STATE(builder, &surf);
 }
@@ -482,18 +482,19 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
       return;
 
    memset(&info, 0, sizeof(info));
-   info.buf = ilo_buffer(session->input->buffer);
+
+   info.vma = ilo_resource_get_vma(session->input->buffer);
+   info.offset = session->input->buffer_offset;
+   info.size = session->input->buffer_size;
+
    info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
    info.format = GEN6_FORMAT_RAW;
    info.format_size = 1;
    info.struct_size = 1;
    info.readonly = true;
-   info.offset = session->input->buffer_offset;
-   info.size = session->input->buffer_size;
 
    memset(&surf, 0, sizeof(surf));
    ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
-   surf.bo = info.buf->bo;
 
    assert(count == 1 && session->input->buffer);
    surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf);
@@ -538,23 +539,23 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r,
    surface_state += base;
    for (i = 0; i < count; i++) {
       if (i < vec->global_binding.count && bindings[i].resource) {
-         const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource);
          struct ilo_state_surface_buffer_info info;
          struct ilo_state_surface surf;
 
          assert(bindings[i].resource->target == PIPE_BUFFER);
 
          memset(&info, 0, sizeof(info));
-         info.buf = buf;
+
+         info.vma = ilo_resource_get_vma(bindings[i].resource);
+         info.size = info.vma->vm_size;
+
          info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
          info.format = GEN6_FORMAT_RAW;
          info.format_size = 1;
          info.struct_size = 1;
-         info.size = buf->bo_size;
 
          memset(&surf, 0, sizeof(surf));
          ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
-         surf.bo = info.buf->bo;
 
          surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf);
       } else {
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index be9fd10a84c..065e665d895 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -178,8 +178,8 @@ tex_create_bo(struct ilo_texture *tex)
    if (!bo)
       return false;
 
-   intel_bo_unref(tex->image.bo);
-   tex->image.bo = bo;
+   intel_bo_unref(tex->vma.bo);
+   ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -215,15 +215,16 @@ static bool
 tex_create_hiz(struct ilo_texture *tex)
 {
    const struct pipe_resource *templ = &tex->base;
+   const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height;
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    struct intel_bo *bo;
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture",
-         tex->image.aux.bo_stride * tex->image.aux.bo_height, false);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", size, false);
    if (!bo)
       return false;
 
-   tex->image.aux.bo = bo;
+   ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096);
+   ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0);
 
    if (tex->imported) {
       unsigned lv;
@@ -246,17 +247,18 @@ tex_create_hiz(struct ilo_texture *tex)
 static bool
 tex_create_mcs(struct ilo_texture *tex)
 {
+   const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height;
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    struct intel_bo *bo;
 
    assert(tex->image.aux.enables == (1 << (tex->base.last_level + 1)) - 1);
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture",
-         tex->image.aux.bo_stride * tex->image.aux.bo_height, false);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", size, false);
    if (!bo)
       return false;
 
-   tex->image.aux.bo = bo;
+   ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096);
+   ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -267,8 +269,8 @@ tex_destroy(struct ilo_texture *tex)
    if (tex->separate_s8)
       tex_destroy(tex->separate_s8);
 
-   intel_bo_unref(tex->image.bo);
-   intel_bo_unref(tex->image.aux.bo);
+   intel_bo_unref(tex->vma.bo);
+   intel_bo_unref(tex->aux_vma.bo);
 
    tex_free_slices(tex);
    FREE(tex);
@@ -327,7 +329,9 @@ tex_import_handle(struct ilo_texture *tex,
       return false;
    }
 
-   tex->image.bo = bo;
+   ilo_vma_init(&tex->vma, &is->dev,
+         tex->image.bo_stride * tex->image.bo_height, 4096);
+   ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0);
 
    tex->imported = true;
 
@@ -347,6 +351,8 @@ tex_init_image(struct ilo_texture *tex,
          return false;
    } else {
       ilo_image_init(img, &is->dev, templ);
+      ilo_vma_init(&tex->vma, &is->dev,
+            img->bo_stride * img->bo_height, 4096);
    }
 
    if (img->bo_height > ilo_max_resource_size / img->bo_stride)
@@ -406,7 +412,7 @@ tex_get_handle(struct ilo_texture *tex, struct winsys_handle *handle)
    else
       tiling = surface_to_winsys_tiling(tex->image.tiling);
 
-   err = intel_winsys_export_handle(is->dev.winsys, tex->image.bo, tiling,
+   err = intel_winsys_export_handle(is->dev.winsys, tex->vma.bo, tiling,
          tex->image.bo_stride, tex->image.bo_height, handle);
 
    return !err;
@@ -425,8 +431,8 @@ buf_create_bo(struct ilo_buffer_resource *buf)
    if (!bo)
       return false;
 
-   intel_bo_unref(buf->buffer.bo);
-   buf->buffer.bo = bo;
+   intel_bo_unref(buf->vma.bo);
+   ilo_vma_set_bo(&buf->vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -434,7 +440,7 @@ buf_create_bo(struct ilo_buffer_resource *buf)
 static void
 buf_destroy(struct ilo_buffer_resource *buf)
 {
-   intel_bo_unref(buf->buffer.bo);
+   intel_bo_unref(buf->vma.bo);
    FREE(buf);
 }
 
@@ -472,6 +478,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
       size = align(size, 4096);
 
    ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags);
+   ilo_vma_init(&buf->vma, &is->dev, buf->buffer.bo_size, 4096);
 
    if (buf->buffer.bo_size < templ->width0 ||
        buf->buffer.bo_size > ilo_max_resource_size ||
diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h
index d602e0cbf70..0357499f44a 100644
--- a/src/gallium/drivers/ilo/ilo_resource.h
+++ b/src/gallium/drivers/ilo/ilo_resource.h
@@ -31,6 +31,7 @@
 #include "core/intel_winsys.h"
 #include "core/ilo_buffer.h"
 #include "core/ilo_image.h"
+#include "core/ilo_vma.h"
 
 #include "ilo_common.h"
 #include "ilo_screen.h"
@@ -93,6 +94,8 @@ struct ilo_texture {
    bool imported;
 
    struct ilo_image image;
+   struct ilo_vma vma;
+   struct ilo_vma aux_vma;
 
    /* XXX thread-safety */
    struct ilo_texture_slice *slices[PIPE_MAX_TEXTURE_LEVELS];
@@ -104,13 +107,14 @@ struct ilo_buffer_resource {
    struct pipe_resource base;
 
    struct ilo_buffer buffer;
+   struct ilo_vma vma;
 };
 
-static inline struct ilo_buffer *
-ilo_buffer(struct pipe_resource *res)
+static inline struct ilo_buffer_resource *
+ilo_buffer_resource(struct pipe_resource *res)
 {
-   return (res && res->target == PIPE_BUFFER) ?
-      &((struct ilo_buffer_resource *) res)->buffer : NULL;
+   return (struct ilo_buffer_resource *)
+      ((res && res->target == PIPE_BUFFER) ? res : NULL);
 }
 
 static inline struct ilo_texture *
@@ -127,13 +131,14 @@ bool
 ilo_resource_rename_bo(struct pipe_resource *res);
 
 /**
- * Return the bo of the resource.
+ * Return the VMA of the resource.
  */
-static inline struct intel_bo *
-ilo_resource_get_bo(struct pipe_resource *res)
+static inline const struct ilo_vma *
+ilo_resource_get_vma(struct pipe_resource *res)
 {
    return (res->target == PIPE_BUFFER) ?
-      ilo_buffer(res)->bo : ilo_texture(res)->image.bo;
+      &((struct ilo_buffer_resource *) res)->vma :
+      &((struct ilo_texture *) res)->vma;
 }
 
 static inline struct ilo_texture_slice *
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 63534f33fa7..24ab59aa32b 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -379,13 +379,12 @@ finalize_cbuf_state(struct ilo_context *ilo,
       u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
-      cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource);
+      cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
       cbuf->cso[i].info.offset = offset;
 
       memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface));
       ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface,
             ilo->dev, &cbuf->cso[i].info);
-      cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo;
 
       ilo->state_vector.dirty |= ILO_DIRTY_CBUF;
    }
@@ -466,11 +465,9 @@ finalize_index_buffer(struct ilo_context *ilo)
 
    memset(&info, 0, sizeof(info));
    if (vec->ib.hw_resource) {
-      info.buf = ilo_buffer(vec->ib.hw_resource);
-      info.size = info.buf->bo_size;
+      info.vma = ilo_resource_get_vma(vec->ib.hw_resource);
+      info.size = info.vma->vm_size;
       info.format = ilo_translate_index_size(vec->ib.hw_index_size);
-
-      vec->ib.ib.bo = info.buf->bo;
    }
 
    ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info);
@@ -532,13 +529,11 @@ finalize_vertex_buffers(struct ilo_context *ilo)
       const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx];
 
       if (cso->buffer) {
-         info.buf = ilo_buffer(cso->buffer);
+         info.vma = ilo_resource_get_vma(cso->buffer);
          info.offset = cso->buffer_offset;
-         info.size = info.buf->bo_size;
+         info.size = info.vma->vm_size - cso->buffer_offset;
 
          info.stride = cso->stride;
-
-         vec->vb.vb[i].bo = info.buf->bo;
       } else {
          memset(&info, 0, sizeof(info));
       }
@@ -1566,24 +1561,23 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
          cso->info.size = buf[i].buffer_size;
 
          if (buf[i].buffer) {
-            cso->info.buf = ilo_buffer(buf[i].buffer);
+            cso->info.vma = ilo_resource_get_vma(buf[i].buffer);
             cso->info.offset = buf[i].buffer_offset;
 
             memset(&cso->surface, 0, sizeof(cso->surface));
             ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info);
-            cso->surface.bo = cso->info.buf->bo;
 
             cso->user_buffer = NULL;
 
             cbuf->enabled_mask |= 1 << (index + i);
          } else if (buf[i].user_buffer) {
-            cso->info.buf = NULL;
+            cso->info.vma = NULL;
             /* buffer_offset does not apply for user buffer */
             cso->user_buffer = buf[i].user_buffer;
 
             cbuf->enabled_mask |= 1 << (index + i);
          } else {
-            cso->info.buf = NULL;
+            cso->info.vma = NULL;
             cso->info.size = 0;
             cso->user_buffer = NULL;
 
@@ -1596,7 +1590,7 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
 
          pipe_resource_reference(&cso->resource, NULL);
 
-         cso->info.buf = NULL;
+         cso->info.vma = NULL;
          cso->info.size = 0;
          cso->user_buffer = NULL;
 
@@ -1706,7 +1700,7 @@ ilo_set_framebuffer_state(struct pipe_context *pipe,
       const struct ilo_surface_cso *cso =
          (const struct ilo_surface_cso *) state->zsbuf;
 
-      fb->has_hiz = cso->u.zs.hiz_bo;
+      fb->has_hiz = cso->u.zs.hiz_vma;
       fb->depth_offset_format =
          ilo_state_zs_get_depth_format(&cso->u.zs, dev);
    } else {
@@ -1945,12 +1939,11 @@ ilo_create_stream_output_target(struct pipe_context *pipe,
    target->base.buffer_size = buffer_size;
 
    memset(&info, 0, sizeof(info));
-   info.buf = ilo_buffer(res);
+   info.vma = ilo_resource_get_vma(res);
    info.offset = buffer_offset;
    info.size = buffer_size;
 
    ilo_state_sol_buffer_init(&target->sb, dev, &info);
-   target->sb.bo = info.buf->bo;
 
    return &target->base;
 }
@@ -2018,18 +2011,17 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       struct ilo_state_surface_buffer_info info;
 
       memset(&info, 0, sizeof(info));
-      info.buf = ilo_buffer(res);
+      info.vma = ilo_resource_get_vma(res);
+      info.offset = templ->u.buf.first_element * info.struct_size;
+      info.size = (templ->u.buf.last_element -
+            templ->u.buf.first_element + 1) * info.struct_size;
       info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
       info.format = ilo_format_translate_color(dev, templ->format);
       info.format_size = util_format_get_blocksize(templ->format);
       info.struct_size = info.format_size;
       info.readonly = true;
-      info.offset = templ->u.buf.first_element * info.struct_size;
-      info.size = (templ->u.buf.last_element -
-            templ->u.buf.first_element + 1) * info.struct_size;
 
       ilo_state_surface_init_for_buffer(&view->surface, dev, &info);
-      view->surface.bo = info.buf->bo;
    } else {
       struct ilo_texture *tex = ilo_texture(res);
       struct ilo_state_surface_image_info info;
@@ -2042,8 +2034,16 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       }
 
       memset(&info, 0, sizeof(info));
-      info.img = &tex->image;
 
+      info.img = &tex->image;
+      info.level_base = templ->u.tex.first_level;
+      info.level_count = templ->u.tex.last_level -
+         templ->u.tex.first_level + 1;
+      info.slice_base = templ->u.tex.first_layer;
+      info.slice_count = templ->u.tex.last_layer -
+         templ->u.tex.first_layer + 1;
+
+      info.vma = &tex->vma;
       info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
 
       if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
@@ -2059,15 +2059,7 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       info.is_array = util_resource_is_array_texture(&tex->base);
       info.readonly = true;
 
-      info.level_base = templ->u.tex.first_level;
-      info.level_count = templ->u.tex.last_level -
-         templ->u.tex.first_level + 1;
-      info.slice_base = templ->u.tex.first_layer;
-      info.slice_count = templ->u.tex.last_layer -
-         templ->u.tex.first_layer + 1;
-
       ilo_state_surface_init_for_image(&view->surface, dev, &info);
-      view->surface.bo = info.img->bo;
    }
 
    return &view->base;
@@ -2111,18 +2103,23 @@ ilo_create_surface(struct pipe_context *pipe,
       assert(tex->base.target != PIPE_BUFFER);
 
       memset(&info, 0, sizeof(info));
+
       info.img = &tex->image;
-      info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
-      info.format = ilo_format_translate_render(dev, templ->format);
-      info.is_array = util_resource_is_array_texture(&tex->base);
       info.level_base = templ->u.tex.level;
       info.level_count = 1;
       info.slice_base = templ->u.tex.first_layer;
       info.slice_count = templ->u.tex.last_layer -
          templ->u.tex.first_layer + 1;
 
+      info.vma = &tex->vma;
+      if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level))
+         info.aux_vma = &tex->aux_vma;
+
+      info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
+      info.format = ilo_format_translate_render(dev, templ->format);
+      info.is_array = util_resource_is_array_texture(&tex->base);
+
       ilo_state_surface_init_for_image(&surf->u.rt, dev, &info);
-      surf->u.rt.bo = info.img->bo;
    } else {
       struct ilo_state_zs_info info;
 
@@ -2131,13 +2128,19 @@ ilo_create_surface(struct pipe_context *pipe,
       memset(&info, 0, sizeof(info));
 
       if (templ->format == PIPE_FORMAT_S8_UINT) {
+         info.s_vma = &tex->vma;
          info.s_img = &tex->image;
       } else {
+         info.z_vma = &tex->vma;
          info.z_img = &tex->image;
-         info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL;
 
-         info.hiz_enable =
-            ilo_image_can_enable_aux(&tex->image, templ->u.tex.level);
+         if (tex->separate_s8) {
+            info.s_vma = &tex->separate_s8->vma;
+            info.s_img = &tex->separate_s8->image;
+         }
+
+         if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level))
+            info.hiz_vma = &tex->aux_vma;
       }
 
       info.level = templ->u.tex.level;
@@ -2146,15 +2149,6 @@ ilo_create_surface(struct pipe_context *pipe,
          templ->u.tex.first_layer + 1;
 
       ilo_state_zs_init(&surf->u.zs, dev, &info);
-
-      if (info.z_img) {
-         surf->u.zs.depth_bo = info.z_img->bo;
-         if (info.hiz_enable)
-            surf->u.zs.hiz_bo = info.z_img->aux.bo;
-      }
-
-      if (info.s_img)
-         surf->u.zs.stencil_bo = info.s_img->bo;
    }
 
    return &surf->base;
@@ -2451,7 +2445,6 @@ void
 ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
                                   struct pipe_resource *res)
 {
-   struct intel_bo *bo = ilo_resource_get_bo(res);
    uint32_t states = 0;
    unsigned sh, i;
 
@@ -2482,10 +2475,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
 
       for (i = 0; i < vec->so.count; i++) {
          if (vec->so.states[i]->buffer == res) {
-            struct ilo_stream_output_target *target =
-               (struct ilo_stream_output_target *) vec->so.states[i];
-
-            target->sb.bo = ilo_buffer(res)->bo;
             states |= ILO_DIRTY_SO;
             break;
          }
@@ -2503,7 +2492,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
                [PIPE_SHADER_GEOMETRY]  = ILO_DIRTY_VIEW_GS,
                [PIPE_SHADER_COMPUTE]   = ILO_DIRTY_VIEW_CS,
             };
-            cso->surface.bo = bo;
 
             states |= view_dirty_bits[sh];
             break;
@@ -2515,7 +2503,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
             struct ilo_cbuf_cso *cbuf = &vec->cbuf[sh].cso[i];
 
             if (cbuf->resource == res) {
-               cbuf->surface.bo = bo;
                states |= ILO_DIRTY_CBUF;
                break;
             }
@@ -2528,7 +2515,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          (struct ilo_surface_cso *) vec->resource.states[i];
 
       if (cso->base.texture == res) {
-         cso->u.rt.bo = bo;
          states |= ILO_DIRTY_RESOURCE;
          break;
       }
@@ -2540,27 +2526,19 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          struct ilo_surface_cso *cso =
             (struct ilo_surface_cso *) vec->fb.state.cbufs[i];
          if (cso && cso->base.texture == res) {
-            cso->u.rt.bo = bo;
             states |= ILO_DIRTY_FB;
             break;
          }
       }
 
-      if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) {
-         struct ilo_surface_cso *cso =
-            (struct ilo_surface_cso *) vec->fb.state.zsbuf;
-
-         cso->u.zs.depth_bo = bo;
-
+      if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res)
          states |= ILO_DIRTY_FB;
-      }
    }
 
    for (i = 0; i < vec->cs_resource.count; i++) {
       struct ilo_surface_cso *cso =
          (struct ilo_surface_cso *) vec->cs_resource.states[i];
       if (cso->base.texture == res) {
-         cso->u.rt.bo = bo;
          states |= ILO_DIRTY_CS_RESOURCE;
          break;
       }
diff --git a/src/gallium/drivers/ilo/ilo_transfer.c b/src/gallium/drivers/ilo/ilo_transfer.c
index ec41473f94a..be5aeee8e23 100644
--- a/src/gallium/drivers/ilo/ilo_transfer.c
+++ b/src/gallium/drivers/ilo/ilo_transfer.c
@@ -268,23 +268,27 @@ xfer_alloc_staging_sys(struct ilo_transfer *xfer)
 static void *
 xfer_map(struct ilo_transfer *xfer)
 {
+   const struct ilo_vma *vma;
    void *ptr;
 
    switch (xfer->method) {
    case ILO_TRANSFER_MAP_CPU:
-      ptr = intel_bo_map(ilo_resource_get_bo(xfer->base.resource),
-            xfer->base.usage & PIPE_TRANSFER_WRITE);
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map(vma->bo, xfer->base.usage & PIPE_TRANSFER_WRITE);
       break;
    case ILO_TRANSFER_MAP_GTT:
-      ptr = intel_bo_map_gtt(ilo_resource_get_bo(xfer->base.resource));
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map_gtt(vma->bo);
       break;
    case ILO_TRANSFER_MAP_GTT_ASYNC:
-      ptr = intel_bo_map_gtt_async(ilo_resource_get_bo(xfer->base.resource));
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map_gtt_async(vma->bo);
       break;
    case ILO_TRANSFER_MAP_STAGING:
       {
          const struct ilo_screen *is = ilo_screen(xfer->staging.res->screen);
-         struct intel_bo *bo = ilo_resource_get_bo(xfer->staging.res);
+
+         vma = ilo_resource_get_vma(xfer->staging.res);
 
          /*
           * We want a writable, optionally persistent and coherent, mapping
@@ -292,25 +296,29 @@ xfer_map(struct ilo_transfer *xfer)
           * this turns out to be fairly simple.
           */
          if (is->dev.has_llc)
-            ptr = intel_bo_map(bo, true);
+            ptr = intel_bo_map(vma->bo, true);
          else
-            ptr = intel_bo_map_gtt(bo);
+            ptr = intel_bo_map_gtt(vma->bo);
 
          if (ptr && xfer->staging.res->target == PIPE_BUFFER)
             ptr += (xfer->base.box.x % ILO_TRANSFER_MAP_BUFFER_ALIGNMENT);
-
       }
       break;
    case ILO_TRANSFER_MAP_SW_CONVERT:
    case ILO_TRANSFER_MAP_SW_ZS:
+      vma = NULL;
       ptr = xfer->staging.sys;
       break;
    default:
       assert(!"unknown mapping method");
+      vma = NULL;
       ptr = NULL;
       break;
    }
 
+   if (ptr && vma)
+      ptr = (void *) ((char *) ptr + vma->bo_offset);
+
    return ptr;
 }
 
@@ -324,10 +332,10 @@ xfer_unmap(struct ilo_transfer *xfer)
    case ILO_TRANSFER_MAP_CPU:
    case ILO_TRANSFER_MAP_GTT:
    case ILO_TRANSFER_MAP_GTT_ASYNC:
-      intel_bo_unmap(ilo_resource_get_bo(xfer->base.resource));
+      intel_bo_unmap(ilo_resource_get_vma(xfer->base.resource)->bo);
       break;
    case ILO_TRANSFER_MAP_STAGING:
-      intel_bo_unmap(ilo_resource_get_bo(xfer->staging.res));
+      intel_bo_unmap(ilo_resource_get_vma(xfer->staging.res)->bo);
       break;
    default:
       break;
@@ -541,9 +549,12 @@ tex_staging_sys_map_bo(struct ilo_texture *tex,
 
    if (prefer_cpu && (tex->image.tiling == GEN6_TILING_NONE ||
                       !linear_view))
-      ptr = intel_bo_map(tex->image.bo, !for_read_back);
+      ptr = intel_bo_map(tex->vma.bo, !for_read_back);
    else
-      ptr = intel_bo_map_gtt(tex->image.bo);
+      ptr = intel_bo_map_gtt(tex->vma.bo);
+
+   if (ptr)
+      ptr = (void *) ((char *) ptr + tex->vma.bo_offset);
 
    return ptr;
 }
@@ -551,7 +562,7 @@ tex_staging_sys_map_bo(struct ilo_texture *tex,
 static void
 tex_staging_sys_unmap_bo(struct ilo_texture *tex)
 {
-   intel_bo_unmap(tex->image.bo);
+   intel_bo_unmap(tex->vma.bo);
 }
 
 static bool
@@ -1055,7 +1066,7 @@ choose_transfer_method(struct ilo_context *ilo, struct ilo_transfer *xfer)
       return false;
 
    /* see if we can avoid blocking */
-   if (is_bo_busy(ilo, ilo_resource_get_bo(res), &need_submit)) {
+   if (is_bo_busy(ilo, ilo_resource_get_vma(res)->bo, &need_submit)) {
       bool resource_renamed;
 
       if (!xfer_unblock(xfer, &resource_renamed)) {
@@ -1078,11 +1089,11 @@ static void
 buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
            unsigned usage, int offset, int size, const void *data)
 {
-   struct ilo_buffer *buf = ilo_buffer(res);
+   struct ilo_buffer_resource *buf = ilo_buffer_resource(res);
    bool need_submit;
 
    /* see if we can avoid blocking */
-   if (is_bo_busy(ilo, buf->bo, &need_submit)) {
+   if (is_bo_busy(ilo, buf->vma.bo, &need_submit)) {
       bool unblocked = false;
 
       if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
@@ -1103,9 +1114,12 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
          templ.bind = PIPE_BIND_TRANSFER_WRITE;
          staging = ilo->base.screen->resource_create(ilo->base.screen, &templ);
          if (staging) {
+            const struct ilo_vma *staging_vma = ilo_resource_get_vma(staging);
             struct pipe_box staging_box;
 
-            intel_bo_pwrite(ilo_buffer(staging)->bo, 0, size, data);
+            /* offset by staging_vma->bo_offset for pwrite */
+            intel_bo_pwrite(staging_vma->bo, staging_vma->bo_offset,
+                  size, data);
 
             u_box_1d(0, size, &staging_box);
             ilo_blitter_blt_copy_resource(ilo->blitter,
@@ -1123,7 +1137,8 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
          ilo_cp_submit(ilo->cp, "syncing for pwrites");
    }
 
-   intel_bo_pwrite(buf->bo, offset, size, data);
+   /* offset by buf->vma.bo_offset for pwrite */
+   intel_bo_pwrite(buf->vma.bo, buf->vma.bo_offset + offset, size, data);
 }
 
 static void

From 9871646c132ba137709b0bfebfe285985dc351e6 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Fri, 26 Jun 2015 13:08:32 +0800
Subject: [PATCH 0037/1208] ilo: remove ilo_buffer

Since the addition of ilo_vma, it was used only to pad a bo for sampling
engine surfaces.  Replace it entirely with these functions

  ilo_state_surface_buffer_size()
  ilo_state_vertex_buffer_size()
  ilo_state_index_buffer_size()
  ilo_state_sol_buffer_size()
---
 src/gallium/drivers/ilo/Makefile.sources      |  1 -
 src/gallium/drivers/ilo/core/ilo_buffer.h     | 59 -------------------
 src/gallium/drivers/ilo/core/ilo_state_sol.c  |  9 +++
 src/gallium/drivers/ilo/core/ilo_state_sol.h  |  4 ++
 .../drivers/ilo/core/ilo_state_surface.c      | 48 +++++++++++++++
 .../drivers/ilo/core/ilo_state_surface.h      |  5 ++
 src/gallium/drivers/ilo/core/ilo_state_vf.c   | 18 ++++++
 src/gallium/drivers/ilo/core/ilo_state_vf.h   |  8 +++
 src/gallium/drivers/ilo/ilo_resource.c        | 22 +++++--
 src/gallium/drivers/ilo/ilo_resource.h        |  3 +-
 10 files changed, 109 insertions(+), 68 deletions(-)
 delete mode 100644 src/gallium/drivers/ilo/core/ilo_buffer.h

diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources
index 35d76bd4948..7a7db938f92 100644
--- a/src/gallium/drivers/ilo/Makefile.sources
+++ b/src/gallium/drivers/ilo/Makefile.sources
@@ -1,5 +1,4 @@
 C_SOURCES := \
-	core/ilo_buffer.h \
 	core/ilo_builder.c \
 	core/ilo_builder.h \
 	core/ilo_builder_3d.h \
diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_buffer.h
deleted file mode 100644
index f2fb63064c0..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_buffer.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2013 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#ifndef ILO_BUFFER_H
-#define ILO_BUFFER_H
-
-#include "ilo_core.h"
-#include "ilo_debug.h"
-#include "ilo_dev.h"
-
-struct ilo_buffer {
-   unsigned bo_size;
-};
-
-static inline void
-ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
-                unsigned size, uint32_t bind, uint32_t flags)
-{
-   assert(ilo_is_zeroed(buf, sizeof(*buf)));
-
-   buf->bo_size = size;
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
-    *
-    *     "For buffers, which have no inherent "height," padding requirements
-    *      are different. A buffer must be padded to the next multiple of 256
-    *      array elements, with an additional 16 bytes added beyond that to
-    *      account for the L1 cache line."
-    */
-   if (bind & PIPE_BIND_SAMPLER_VIEW)
-      buf->bo_size = align(buf->bo_size, 256) + 16;
-}
-
-#endif /* ILO_BUFFER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c
index dd1ef5e7887..6ef2c91a592 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c
@@ -424,6 +424,15 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
    return ilo_state_sol_init(sol, dev, &info);
 }
 
+uint32_t
+ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                          uint32_t *alignment)
+{
+   /* DWord aligned without padding */
+   *alignment = 4;
+   return size;
+}
+
 bool
 ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
                           const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h
index f0968b39e27..92c5f94725b 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h
@@ -150,6 +150,10 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
                             const struct ilo_dev *dev,
                             bool render_disable);
 
+uint32_t
+ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                          uint32_t *alignment);
+
 bool
 ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
                           const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index 402bbf4b52a..d3581749e56 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -1106,6 +1106,54 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
    return true;
 }
 
+uint32_t
+ilo_state_surface_buffer_size(const struct ilo_dev *dev,
+                              enum ilo_state_surface_access access,
+                              uint32_t size, uint32_t *alignment)
+{
+   switch (access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      /*
+       * From the Sandy Bridge PRM, volume 1 part 1, page 118:
+       *
+       *     "For buffers, which have no inherent "height," padding
+       *      requirements are different. A buffer must be padded to the next
+       *      multiple of 256 array elements, with an additional 16 bytes
+       *      added beyond that to account for the L1 cache line."
+       *
+       * Assuming tightly packed GEN6_FORMAT_R32G32B32A32_FLOAT, the size
+       * needs to be padded to 4096 (= 16 * 256).
+       */
+      *alignment = 1;
+      size = align(size, 4096) + 16;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      /* element-size aligned for worst cases */
+      *alignment = 16;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+      /* DWord aligned? */
+      *alignment = 4;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      /* OWord aligned */
+      *alignment = 16;
+      size = align(size, 16);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      /* always DWord aligned */
+      *alignment = 4;
+      break;
+   default:
+      assert(!"unknown access");
+      *alignment = 1;
+      break;
+   }
+
+   return size;
+}
+
 bool
 ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
                                 const struct ilo_dev *dev)
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
index b9921134a1e..0cda08eb031 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -99,6 +99,11 @@ ilo_state_surface_valid_format(const struct ilo_dev *dev,
                                enum ilo_state_surface_access access,
                                enum gen_surface_format format);
 
+uint32_t
+ilo_state_surface_buffer_size(const struct ilo_dev *dev,
+                              enum ilo_state_surface_access access,
+                              uint32_t size, uint32_t *alignment);
+
 bool
 ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
                                 const struct ilo_dev *dev);
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
index 2dd72276e63..9faf835fef2 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -947,6 +947,15 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
    }
 }
 
+uint32_t
+ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                             uint32_t *alignment)
+{
+   /* align for doubles without padding */
+   *alignment = 8;
+   return size;
+}
+
 /**
  * No need to initialize first.
  */
@@ -964,6 +973,15 @@ ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
    return ret;
 }
 
+uint32_t
+ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                            uint32_t *alignment)
+{
+   /* align for the worst case without padding */
+   *alignment = get_index_format_size(GEN6_INDEX_DWORD);
+   return size;
+}
+
 /**
  * No need to initialize first.
  */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h
index 30734476435..16b128bf63c 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h
@@ -209,11 +209,19 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
                        const struct ilo_state_vf *old,
                        struct ilo_state_vf_delta *delta);
 
+uint32_t
+ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                             uint32_t *alignment);
+
 bool
 ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
                                  const struct ilo_dev *dev,
                                  const struct ilo_state_vertex_buffer_info *info);
 
+uint32_t
+ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                            uint32_t *alignment);
+
 bool
 ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
                                 const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 065e665d895..a0074e57e99 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -25,6 +25,10 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
+#include "core/ilo_state_vf.h"
+#include "core/ilo_state_sol.h"
+#include "core/ilo_state_surface.h"
+
 #include "ilo_screen.h"
 #include "ilo_resource.h"
 
@@ -426,8 +430,7 @@ buf_create_bo(struct ilo_buffer_resource *buf)
    const bool cpu_init = resource_get_cpu_init(&buf->base);
    struct intel_bo *bo;
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, name,
-         buf->buffer.bo_size, cpu_init);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, name, buf->bo_size, cpu_init);
    if (!bo)
       return false;
 
@@ -449,6 +452,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
 {
    const struct ilo_screen *is = ilo_screen(screen);
    struct ilo_buffer_resource *buf;
+   uint32_t alignment;
    unsigned size;
 
    buf = CALLOC_STRUCT(ilo_buffer_resource);
@@ -477,11 +481,17 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
        ilo_dev_gen(&is->dev) < ILO_GEN(7.5))
       size = align(size, 4096);
 
-   ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags);
-   ilo_vma_init(&buf->vma, &is->dev, buf->buffer.bo_size, 4096);
+   if (templ->bind & PIPE_BIND_VERTEX_BUFFER)
+      size = ilo_state_vertex_buffer_size(&is->dev, size, &alignment);
+   if (templ->bind & PIPE_BIND_INDEX_BUFFER)
+      size = ilo_state_index_buffer_size(&is->dev, size, &alignment);
+   if (templ->bind & PIPE_BIND_STREAM_OUTPUT)
+      size = ilo_state_sol_buffer_size(&is->dev, size, &alignment);
 
-   if (buf->buffer.bo_size < templ->width0 ||
-       buf->buffer.bo_size > ilo_max_resource_size ||
+   buf->bo_size = size;
+   ilo_vma_init(&buf->vma, &is->dev, buf->bo_size, 4096);
+
+   if (buf->bo_size < templ->width0 || buf->bo_size > ilo_max_resource_size ||
        !buf_create_bo(buf)) {
       FREE(buf);
       return NULL;
diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h
index 0357499f44a..c28c05abcfe 100644
--- a/src/gallium/drivers/ilo/ilo_resource.h
+++ b/src/gallium/drivers/ilo/ilo_resource.h
@@ -29,7 +29,6 @@
 #define ILO_RESOURCE_H
 
 #include "core/intel_winsys.h"
-#include "core/ilo_buffer.h"
 #include "core/ilo_image.h"
 #include "core/ilo_vma.h"
 
@@ -106,7 +105,7 @@ struct ilo_texture {
 struct ilo_buffer_resource {
    struct pipe_resource base;
 
-   struct ilo_buffer buffer;
+   uint32_t bo_size;
    struct ilo_vma vma;
 };
 

From 07acf9cb167d4e1f7aebd6837d22e3523ad63109 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Wed, 24 Jun 2015 12:57:57 +0800
Subject: [PATCH 0038/1208] ilo: improve SURFTYPE_BUFFER validations

Reorganize the validations to make them more systematic.
---
 .../drivers/ilo/core/ilo_state_surface.c      | 223 +++++++++++-------
 .../drivers/ilo/core/ilo_state_surface.h      |   1 +
 2 files changed, 141 insertions(+), 83 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index d3581749e56..2caba6df46e 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -94,15 +94,127 @@ surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf,
    return true;
 }
 
+static uint32_t
+surface_get_gen6_buffer_offset_alignment(const struct ilo_dev *dev,
+                                         const struct ilo_state_surface_buffer_info *info)
+{
+   uint32_t alignment;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "The Base Address for linear render target surfaces and surfaces
+    *      accessed with the typed surface read/write data port messages must
+    *      be element-size aligned, for non-YUV surface formats, or a multiple
+    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
+    *      have no alignment requirements (byte alignment is sufficient)."
+    *
+    *     "Certain message types used to access surfaces have more stringent
+    *      alignment requirements. Please refer to the specific message
+    *      documentation for additional restrictions."
+    */
+   switch (info->access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      /* no alignment requirements */
+      alignment = 1;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      /* element-size aligned */
+      alignment = info->format_size;
+
+      assert(info->struct_size % alignment == 0);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+      /*
+       * Nothing is said about Untyped* messages, but I think they require the
+       * base address to be DWord aligned.
+       */
+      alignment = 4;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 70:
+       *
+       *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
+       *      pitch must be a multiple of 4 bytes."
+       */
+      if (info->struct_size > 1)
+         assert(info->struct_size % alignment == 0);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
+       *
+       *     "the surface base address must be OWord aligned"
+       *
+       * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord
+       * Dual Block Read/Write.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
+       *
+       *     "The surface base address must be DWord aligned"
+       *
+       * for DWord Scattered Read/Write and Byte Scattered Read/Write.
+       */
+      alignment = (info->format_size > 4) ? 16 : 4;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, 237, and
+       * 246:
+       *
+       *     "the surface pitch is ignored, the surface is treated as a
+       *      1-dimensional surface. An element size (pitch) of 16 bytes is
+       *      used to determine the size of the buffer for out-of-bounds
+       *      checking if using the surface state model."
+       *
+       * for OWord Block Read/Write, Unaligned OWord Block Read, OWord
+       * Dual Block Read/Write, and DWord Scattered Read/Write.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 1, page 248:
+       *
+       *     "The surface pitch is ignored, the surface is treated as a
+       *      1-dimensional surface. An element size (pitch) of 4 bytes is
+       *      used to determine the size of the buffer for out-of-bounds
+       *      checking if using the surface state model."
+       *
+       * for Byte Scattered Read/Write.
+       *
+       * It is programmable on Gen7.5+.
+       */
+      if (ilo_dev_gen(dev) < ILO_GEN(7.5)) {
+         const int fixed = (info->format_size > 1) ? 16 : 4;
+         assert(info->struct_size == fixed);
+      }
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 259:
+       *
+       *     "Both the surface base address and surface pitch must be DWord
+       *      aligned."
+       */
+      alignment = 4;
+
+      assert(info->struct_size % alignment == 0);
+      break;
+   default:
+      assert(!"unknown access");
+      alignment = 1;
+      break;
+   }
+
+   return alignment;
+}
+
 static bool
 surface_validate_gen6_buffer(const struct ilo_dev *dev,
                              const struct ilo_state_surface_buffer_info *info)
 {
-   ILO_DEV_ASSERT(dev, 6, 8);
+   uint32_t alignment;
 
-   /* SVB writes are Gen6-only */
-   if (ilo_dev_gen(dev) >= ILO_GEN(7))
-      assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB);
+   ILO_DEV_ASSERT(dev, 6, 8);
 
    if (info->offset + info->size > info->vma->vm_size) {
       ilo_warn("invalid buffer range\n");
@@ -120,88 +232,34 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev,
       return false;
    }
 
+   alignment = surface_get_gen6_buffer_offset_alignment(dev, info);
+   if (info->offset % alignment || info->vma->vm_alignment % alignment) {
+      ilo_warn("bad buffer offset\n");
+      return false;
+   }
+
+   /* no STRBUF on Gen6 */
+   if (info->format == GEN6_FORMAT_RAW && info->struct_size > 1)
+      assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+
+   /* SVB writes are Gen6 only */
+   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB)
+      assert(ilo_dev_gen(dev) == ILO_GEN(6));
+
    /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    * From the Ivy Bridge PRM, volume 4 part 1, page 83:
     *
-    *     "The Base Address for linear render target surfaces and surfaces
-    *      accessed with the typed surface read/write data port messages must
-    *      be element-size aligned, for non-YUV surface formats, or a multiple
-    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
-    *      have no alignment requirements (byte alignment is sufficient)."
+    *     "NOTE: "RAW" is supported only with buffers and structured buffers
+    *      accessed via the untyped surface read/write and untyped atomic
+    *      operation messages, which do not have a column in the table."
     *
-    *     "Certain message types used to access surfaces have more stringent
-    *      alignment requirements. Please refer to the specific message
-    *      documentation for additional restrictions."
+    * From the Ivy Bridge PRM, volume 4 part 1, page 252:
     *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
-    *
-    *     "the surface base address must be OWord aligned"
-    *
-    * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual
-    * Block Read/Write.
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
-    *
-    *     "The surface base address must be DWord aligned"
-    *
-    * for DWord Scattered Read/Write and Byte Scattered Read/Write.
-    *
-    * We have to rely on users to correctly set info->struct_size here.  DWord
-    * Scattered Read/Write has conflicting pitch and alignment, but we do not
-    * use them yet so we are fine.
-    *
-    * It is unclear if sampling engine surfaces require aligned offsets.
+    *     "For untyped messages, the Surface Format must be RAW and the
+    *      Surface Type must be SURFTYPE_BUFFER or SURFTYPE_STRBUF."
     */
-   if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) {
-      assert(info->struct_size % info->format_size == 0);
-
-      if (info->offset % info->struct_size ||
-          info->vma->vm_alignment % info->struct_size) {
-         ilo_warn("bad buffer offset\n");
-         return false;
-      }
-   }
-
-   if (info->format == GEN6_FORMAT_RAW) {
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 97:
-       *
-       *     ""RAW" is supported only with buffers and structured buffers
-       *      accessed via the untyped surface read/write and untyped atomic
-       *      operation messages, which do not have a column in the table."
-       *
-       * We do not have a specific access mode for untyped messages.
-       */
-      assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED);
-
-      /*
-       * Nothing is said about Untyped* messages, but I guess they require the
-       * base address to be DWord aligned.
-       */
-      if (info->offset % 4 || info->vma->vm_alignment % 4) {
-         ilo_warn("bad RAW buffer offset\n");
-         return false;
-      }
-
-      if (info->struct_size > 1) {
-         /* no STRBUF on Gen6 */
-         if (ilo_dev_gen(dev) == ILO_GEN(6)) {
-            ilo_warn("no STRBUF support\n");
-            return false;
-         }
-
-         /*
-          * From the Ivy Bridge PRM, volume 4 part 1, page 70:
-          *
-          *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
-          *      pitch must be a multiple of 4 bytes."
-          */
-         if (info->struct_size % 4) {
-            ilo_warn("bad STRBUF pitch\n");
-            return false;
-         }
-      }
-   }
+   assert((info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED) ==
+          (info->format == GEN6_FORMAT_RAW));
 
    return true;
 }
@@ -216,8 +274,7 @@ surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev,
    ILO_DEV_ASSERT(dev, 6, 8);
 
    c = info->size / info->struct_size;
-   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB &&
-       info->format_size < info->size - info->struct_size * c)
+   if (info->format_size < info->size - info->struct_size * c)
       c++;
 
    /*
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
index 0cda08eb031..835df69882e 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -52,6 +52,7 @@ struct ilo_state_surface_buffer_info {
 
    enum ilo_state_surface_access access;
 
+   /* format_size may be less than, equal to, or greater than struct_size */
    enum gen_surface_format format;
    uint8_t format_size;
 

From f825fe8e13adfec4cd488bac3663b7e9c90a8c06 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Thu, 25 Jun 2015 07:18:31 +0800
Subject: [PATCH 0039/1208] ilo: remove ilo_image_disable_aux()

Fail resource creation when aux bo allocation fails.
---
 src/gallium/drivers/ilo/core/ilo_image.c | 19 -------------------
 src/gallium/drivers/ilo/core/ilo_image.h |  3 ---
 src/gallium/drivers/ilo/ilo_resource.c   |  8 ++------
 3 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 0d837d8a9d5..ed9b2883ac0 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -1449,22 +1449,3 @@ ilo_image_init_for_imported(struct ilo_image *img,
 
    return true;
 }
-
-bool
-ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev)
-{
-   /* HiZ is required for separate stencil on Gen6 */
-   if (ilo_dev_gen(dev) == ILO_GEN(6) &&
-       img->aux.type == ILO_IMAGE_AUX_HIZ &&
-       img->separate_stencil)
-      return false;
-
-   /* MCS is required for multisample images */
-   if (img->aux.type == ILO_IMAGE_AUX_MCS &&
-       img->sample_count > 1)
-      return false;
-
-   img->aux.enables = 0x0;
-
-   return true;
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 77747ed7492..e5dcc4319b6 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -157,9 +157,6 @@ ilo_image_init_for_imported(struct ilo_image *img,
                             enum gen_surface_tiling tiling,
                             unsigned bo_stride);
 
-bool
-ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev);
-
 static inline bool
 ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level)
 {
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index a0074e57e99..3b8e607862c 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -283,8 +283,6 @@ tex_destroy(struct ilo_texture *tex)
 static bool
 tex_alloc_bos(struct ilo_texture *tex)
 {
-   struct ilo_screen *is = ilo_screen(tex->base.screen);
-
    if (!tex->imported && !tex_create_bo(tex))
       return false;
 
@@ -294,13 +292,11 @@ tex_alloc_bos(struct ilo_texture *tex)
 
    switch (tex->image.aux.type) {
    case ILO_IMAGE_AUX_HIZ:
-      if (!tex_create_hiz(tex) &&
-          !ilo_image_disable_aux(&tex->image, &is->dev))
+      if (!tex_create_hiz(tex))
          return false;
       break;
    case ILO_IMAGE_AUX_MCS:
-      if (!tex_create_mcs(tex) &&
-          !ilo_image_disable_aux(&tex->image, &is->dev))
+      if (!tex_create_mcs(tex))
          return false;
       break;
    default:

From 934e4a469fd37dac03b8280cce41df4d9f4ed123 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Wed, 24 Jun 2015 22:46:36 +0800
Subject: [PATCH 0040/1208] ilo: initialize ilo_image from ilo_image_info

Convert pipe_resource to ilo_image_info for image initialization.
---
 src/gallium/drivers/ilo/core/ilo_image.c | 283 +++++++++++------------
 src/gallium/drivers/ilo/core/ilo_image.h |  43 +++-
 src/gallium/drivers/ilo/ilo_resource.c   |  99 ++++++--
 3 files changed, 244 insertions(+), 181 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index ed9b2883ac0..39c6daaafd3 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -42,7 +42,7 @@ enum {
 
 struct ilo_image_params {
    const struct ilo_dev *dev;
-   const struct pipe_resource *templ;
+   const struct ilo_image_info *info;
    unsigned valid_tilings;
 
    bool compressed;
@@ -56,7 +56,7 @@ img_get_slice_size(const struct ilo_image *img,
                    const struct ilo_image_params *params,
                    unsigned level, unsigned *width, unsigned *height)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    unsigned w, h;
 
    w = u_minify(img->width0, level);
@@ -112,8 +112,7 @@ img_get_slice_size(const struct ilo_image *img,
     *   y = align(y, 2) * 2;
     */
    if (img->interleaved_samples) {
-      switch (templ->nr_samples) {
-      case 0:
+      switch (info->sample_count) {
       case 1:
          break;
       case 2:
@@ -157,12 +156,12 @@ static unsigned
 img_get_num_layers(const struct ilo_image *img,
                    const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
-   unsigned num_layers = templ->array_size;
+   const struct ilo_image_info *info = params->info;
+   unsigned num_layers = info->array_size;
 
    /* samples of the same index are stored in a layer */
-   if (templ->nr_samples > 1 && !img->interleaved_samples)
-      num_layers *= templ->nr_samples;
+   if (info->sample_count > 1 && !img->interleaved_samples)
+      num_layers *= info->sample_count;
 
    return num_layers;
 }
@@ -171,7 +170,7 @@ static void
 img_init_layer_height(struct ilo_image *img,
                       struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    unsigned num_layers;
 
    if (img->walk != ILO_IMAGE_WALK_LAYER)
@@ -218,7 +217,7 @@ img_init_layer_height(struct ilo_image *img,
    img->walk_layer_height = params->h0 + params->h1 +
       ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * img->align_j;
 
-   if (ilo_dev_gen(params->dev) == ILO_GEN(6) && templ->nr_samples > 1 &&
+   if (ilo_dev_gen(params->dev) == ILO_GEN(6) && info->sample_count > 1 &&
        img->height0 % 4 == 1)
       img->walk_layer_height += 4;
 
@@ -229,13 +228,13 @@ static void
 img_init_lods(struct ilo_image *img,
               struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    unsigned cur_x, cur_y;
    unsigned lv;
 
    cur_x = 0;
    cur_y = 0;
-   for (lv = 0; lv <= templ->last_level; lv++) {
+   for (lv = 0; lv < info->level_count; lv++) {
       unsigned lod_w, lod_h;
 
       img_get_slice_size(img, params, lv, &lod_w, &lod_h);
@@ -261,7 +260,7 @@ img_init_lods(struct ilo_image *img,
             cur_y += lod_h;
 
          /* every LOD begins at tile boundaries */
-         if (templ->last_level > 0) {
+         if (info->level_count > 1) {
             assert(img->format == PIPE_FORMAT_S8_UINT);
             cur_x = align(cur_x, 64);
             cur_y = align(cur_y, 64);
@@ -269,7 +268,7 @@ img_init_lods(struct ilo_image *img,
          break;
       case ILO_IMAGE_WALK_3D:
          {
-            const unsigned num_slices = u_minify(templ->depth0, lv);
+            const unsigned num_slices = u_minify(info->depth, lv);
             const unsigned num_slices_per_row = 1 << lv;
             const unsigned num_rows =
                (num_slices + num_slices_per_row - 1) / num_slices_per_row;
@@ -291,7 +290,7 @@ img_init_lods(struct ilo_image *img,
    if (img->walk == ILO_IMAGE_WALK_LAYER) {
       params->h0 = img->lods[0].slice_height;
 
-      if (templ->last_level > 0)
+      if (info->level_count > 1)
          params->h1 = img->lods[1].slice_height;
       else
          img_get_slice_size(img, params, 1, &cur_x, &params->h1);
@@ -302,7 +301,7 @@ static void
 img_init_alignments(struct ilo_image *img,
                     const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 113:
@@ -396,7 +395,7 @@ img_init_alignments(struct ilo_image *img,
       /* this happens to be the case */
       img->align_i = img->block_width;
       img->align_j = img->block_height;
-   } else if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
+   } else if (info->bind_zs) {
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) {
          switch (img->format) {
          case PIPE_FORMAT_Z16_UNORM:
@@ -426,11 +425,11 @@ img_init_alignments(struct ilo_image *img,
       }
    } else {
       const bool valign_4 =
-         (templ->nr_samples > 1) ||
+         (info->sample_count > 1) ||
          (ilo_dev_gen(params->dev) >= ILO_GEN(8)) ||
          (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
           img->tiling == GEN6_TILING_Y &&
-          (templ->bind & PIPE_BIND_RENDER_TARGET));
+          info->bind_surface_dp_render);
 
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
           ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4)
@@ -460,14 +459,14 @@ static void
 img_init_tiling(struct ilo_image *img,
                 const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    unsigned preferred_tilings = params->valid_tilings;
 
    /* no fencing nor BLT support */
    if (preferred_tilings & ~IMAGE_TILING_W)
       preferred_tilings &= ~IMAGE_TILING_W;
 
-   if (templ->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) {
+   if (info->bind_surface_dp_render || info->bind_surface_sampler) {
       /*
        * heuristically set a minimum width/height for enabling tiling
        */
@@ -499,7 +498,7 @@ static void
 img_init_walk_gen7(struct ilo_image *img,
                    const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
 
    /*
     * It is not explicitly states, but render targets are expected to be
@@ -508,14 +507,14 @@ img_init_walk_gen7(struct ilo_image *img,
     *
     * See "Multisampled Surface Storage Format" field of SURFACE_STATE.
     */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (info->bind_zs) {
       /*
        * From the Ivy Bridge PRM, volume 1 part 1, page 111:
        *
        *     "note that the depth buffer and stencil buffer have an implied
        *      value of ARYSPC_FULL"
        */
-      img->walk = (templ->target == PIPE_TEXTURE_3D) ?
+      img->walk = (info->target == PIPE_TEXTURE_3D) ?
          ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER;
 
       img->interleaved_samples = true;
@@ -530,12 +529,12 @@ img_init_walk_gen7(struct ilo_image *img,
        * As multisampled resources are not mipmapped, we never use
        * ARYSPC_FULL for them.
        */
-      if (templ->nr_samples > 1)
-         assert(templ->last_level == 0);
+      if (info->sample_count > 1)
+         assert(info->level_count == 1);
 
       img->walk =
-         (templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
-         (templ->last_level > 0) ? ILO_IMAGE_WALK_LAYER :
+         (info->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
+         (info->level_count > 1) ? ILO_IMAGE_WALK_LAYER :
          ILO_IMAGE_WALK_LOD;
 
       img->interleaved_samples = false;
@@ -558,7 +557,7 @@ img_init_walk_gen6(struct ilo_image *img,
     * GEN6 does not support compact spacing otherwise.
     */
    img->walk =
-      (params->templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
+      (params->info->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
       (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD :
       ILO_IMAGE_WALK_LAYER;
 
@@ -580,7 +579,7 @@ static unsigned
 img_get_valid_tilings(const struct ilo_image *img,
                       const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    const enum pipe_format format = img->format;
    unsigned valid_tilings = params->valid_tilings;
 
@@ -590,7 +589,7 @@ img_get_valid_tilings(const struct ilo_image *img,
     *     "Display/Overlay   Y-Major not supported.
     *                        X-Major required for Async Flips"
     */
-   if (unlikely(templ->bind & PIPE_BIND_SCANOUT))
+   if (unlikely(info->bind_scanout))
       valid_tilings &= IMAGE_TILING_X;
 
    /*
@@ -599,7 +598,7 @@ img_get_valid_tilings(const struct ilo_image *img,
     *     "The cursor surface address must be 4K byte aligned. The cursor must
     *      be in linear memory, it cannot be tiled."
     */
-   if (unlikely(templ->bind & (PIPE_BIND_CURSOR | PIPE_BIND_LINEAR)))
+   if (unlikely(info->bind_cursor))
       valid_tilings &= IMAGE_TILING_NONE;
 
    /*
@@ -614,7 +613,7 @@ img_get_valid_tilings(const struct ilo_image *img,
     *
     *     "W-Major Tile Format is used for separate stencil."
     */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (info->bind_zs) {
       switch (format) {
       case PIPE_FORMAT_S8_UINT:
          valid_tilings &= IMAGE_TILING_W;
@@ -625,7 +624,7 @@ img_get_valid_tilings(const struct ilo_image *img,
       }
    }
 
-   if (templ->bind & PIPE_BIND_RENDER_TARGET) {
+   if (info->bind_surface_dp_render) {
       /*
        * From the Sandy Bridge PRM, volume 1 part 2, page 32:
        *
@@ -656,7 +655,7 @@ img_get_valid_tilings(const struct ilo_image *img,
       valid_tilings &= ~IMAGE_TILING_W;
    }
 
-   if (templ->bind & PIPE_BIND_SAMPLER_VIEW) {
+   if (info->bind_surface_sampler) {
       if (ilo_dev_gen(params->dev) < ILO_GEN(8))
          valid_tilings &= ~IMAGE_TILING_W;
    }
@@ -671,17 +670,17 @@ static void
 img_init_size_and_format(struct ilo_image *img,
                          struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
-   enum pipe_format format = templ->format;
+   const struct ilo_image_info *info = params->info;
+   enum pipe_format format = info->format;
    bool require_separate_stencil = false;
 
-   img->target = templ->target;
-   img->width0 = templ->width0;
-   img->height0 = templ->height0;
-   img->depth0 = templ->depth0;
-   img->array_size = templ->array_size;
-   img->level_count = templ->last_level + 1;
-   img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
+   img->target = info->target;
+   img->width0 = info->width;
+   img->height0 = info->height;
+   img->depth0 = info->depth;
+   img->array_size = info->array_size;
+   img->level_count = info->level_count;
+   img->sample_count = info->sample_count;
 
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 317:
@@ -691,7 +690,7 @@ img_init_size_and_format(struct ilo_image *img,
     *
     * GEN7+ requires separate stencil buffers.
     */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (info->bind_zs) {
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
          require_separate_stencil = true;
       else
@@ -731,15 +730,14 @@ static bool
 img_want_mcs(const struct ilo_image *img,
              const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    bool want_mcs = false;
 
    /* MCS is for RT on GEN7+ */
    if (ilo_dev_gen(params->dev) < ILO_GEN(7))
       return false;
 
-   if (templ->target != PIPE_TEXTURE_2D ||
-       !(templ->bind & PIPE_BIND_RENDER_TARGET))
+   if (info->target != PIPE_TEXTURE_2D || !info->bind_surface_dp_render)
       return false;
 
    /*
@@ -752,9 +750,9 @@ img_want_mcs(const struct ilo_image *img,
     *     "This field must be set to 0 for all SINT MSRTs when all RT channels
     *      are not written"
     */
-   if (templ->nr_samples > 1 && !util_format_is_pure_sint(templ->format)) {
+   if (info->sample_count > 1 && !util_format_is_pure_sint(info->format)) {
       want_mcs = true;
-   } else if (templ->nr_samples <= 1) {
+   } else if (info->sample_count == 1 && !info->aux_disable) {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 326:
        *
@@ -770,7 +768,7 @@ img_want_mcs(const struct ilo_image *img,
        *      ..."
        */
       if (img->tiling != GEN6_TILING_NONE &&
-          templ->last_level == 0 && templ->array_size == 1) {
+          info->level_count == 1 && info->array_size == 1) {
          switch (img->block_size) {
          case 4:
          case 8:
@@ -790,27 +788,26 @@ static bool
 img_want_hiz(const struct ilo_image *img,
              const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    const struct util_format_description *desc =
-      util_format_description(templ->format);
+      util_format_description(info->format);
 
    if (ilo_debug & ILO_DEBUG_NOHIZ)
       return false;
 
-   /* we want 8x4 aligned levels */
-   if (templ->target == PIPE_TEXTURE_1D)
+   if (info->aux_disable)
       return false;
 
-   if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL))
+   /* we want 8x4 aligned levels */
+   if (info->target == PIPE_TEXTURE_1D)
+      return false;
+
+   if (!info->bind_zs)
       return false;
 
    if (!util_format_has_depth(desc))
       return false;
 
-   /* no point in having HiZ */
-   if (templ->usage == PIPE_USAGE_STAGING)
-      return false;
-
    /*
     * As can be seen in img_calculate_hiz_size(), HiZ may not be enabled
     * for every level.  This is generally fine except on GEN6, where HiZ and
@@ -819,8 +816,8 @@ img_want_hiz(const struct ilo_image *img,
     * can result in incompatible formats.
     */
    if (ilo_dev_gen(params->dev) == ILO_GEN(6) &&
-       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-       templ->last_level)
+       info->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+       info->level_count > 1)
       return false;
 
    return true;
@@ -839,7 +836,7 @@ img_init_aux(struct ilo_image *img,
 static void
 img_align(struct ilo_image *img, struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    int align_w = 1, align_h = 1, pad_h = 0;
 
    /*
@@ -864,11 +861,11 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
     *      padding purposes. The value of 4 for j still applies for mip level
     *      alignment and QPitch calculation."
     */
-   if (templ->bind & PIPE_BIND_SAMPLER_VIEW) {
+   if (info->bind_surface_sampler) {
       align_w = MAX2(align_w, img->align_i);
       align_h = MAX2(align_h, img->align_j);
 
-      if (templ->target == PIPE_TEXTURE_CUBE)
+      if (info->target == PIPE_TEXTURE_CUBE)
          pad_h += 2;
 
       if (params->compressed)
@@ -881,7 +878,7 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
     *     "If the surface contains an odd number of rows of data, a final row
     *      below the surface must be allocated."
     */
-   if (templ->bind & PIPE_BIND_RENDER_TARGET)
+   if (info->bind_surface_dp_render)
       align_h = MAX2(align_h, 2);
 
    /*
@@ -889,9 +886,9 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
     * for unaligned non-mipmapped and non-array images.
     */
    if (img->aux.type == ILO_IMAGE_AUX_HIZ &&
-       templ->last_level == 0 &&
-       templ->array_size == 1 &&
-       templ->depth0 == 1) {
+       info->level_count == 1 &&
+       info->array_size == 1 &&
+       info->depth == 1) {
       align_w = MAX2(align_w, 8);
       align_h = MAX2(align_h, 4);
    }
@@ -925,7 +922,7 @@ img_calculate_bo_size(struct ilo_image *img,
        *      required above."
        */
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7.5) &&
-          (params->templ->bind & PIPE_BIND_SAMPLER_VIEW) &&
+          params->info->bind_surface_sampler &&
           img->tiling == GEN6_TILING_NONE)
          h += (64 + img->bo_stride - 1) / img->bo_stride;
 
@@ -943,7 +940,7 @@ img_calculate_bo_size(struct ilo_image *img,
        *
        * Different requirements may exist when the bo is used in different
        * places, but our alignments here should be good enough that we do not
-       * need to check params->templ->bind.
+       * need to check params->info->bind_x.
        */
       switch (img->tiling) {
       case GEN6_TILING_X:
@@ -994,7 +991,7 @@ img_calculate_bo_size(struct ilo_image *img,
                img->tiling = GEN6_TILING_NONE;
                /* MCS support for non-MSRTs is limited to tiled RTs */
                if (img->aux.type == ILO_IMAGE_AUX_MCS &&
-                   params->templ->nr_samples <= 1)
+                   params->info->sample_count == 1)
                   img->aux.type = ILO_IMAGE_AUX_NONE;
 
                continue;
@@ -1014,7 +1011,7 @@ static void
 img_calculate_hiz_size(struct ilo_image *img,
                        const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    const unsigned hz_align_j = 8;
    enum ilo_image_walk_type hz_walk;
    unsigned hz_width, hz_height, lv;
@@ -1059,7 +1056,7 @@ img_calculate_hiz_size(struct ilo_image *img,
 
          hz_width = align(img->lods[0].slice_width, 16);
 
-         hz_height = hz_qpitch * templ->array_size / 2;
+         hz_height = hz_qpitch * info->array_size / 2;
          if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
             hz_height = align(hz_height, 8);
 
@@ -1077,7 +1074,7 @@ img_calculate_hiz_size(struct ilo_image *img,
          hz_height = 0;
          cur_tx = 0;
          cur_ty = 0;
-         for (lv = 0; lv <= templ->last_level; lv++) {
+         for (lv = 0; lv < info->level_count; lv++) {
             unsigned tw, th;
 
             lod_tx[lv] = cur_tx;
@@ -1085,7 +1082,7 @@ img_calculate_hiz_size(struct ilo_image *img,
 
             tw = align(img->lods[lv].slice_width, 16);
             th = align(img->lods[lv].slice_height, hz_align_j) *
-               templ->array_size / 2;
+               info->array_size / 2;
             /* convert to Y-tiles */
             tw = align(tw, 128) / 128;
             th = align(th, 32) / 32;
@@ -1102,7 +1099,7 @@ img_calculate_hiz_size(struct ilo_image *img,
          }
 
          /* convert tile offsets to memory offsets */
-         for (lv = 0; lv <= templ->last_level; lv++) {
+         for (lv = 0; lv < info->level_count; lv++) {
             img->aux.walk_lod_offsets[lv] =
                (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096;
          }
@@ -1114,10 +1111,10 @@ img_calculate_hiz_size(struct ilo_image *img,
       hz_width = align(img->lods[0].slice_width, 16);
 
       hz_height = 0;
-      for (lv = 0; lv <= templ->last_level; lv++) {
+      for (lv = 0; lv < info->level_count; lv++) {
          const unsigned h = align(img->lods[lv].slice_height, hz_align_j);
          /* according to the formula, slices are packed together vertically */
-         hz_height += h * u_minify(templ->depth0, lv);
+         hz_height += h * u_minify(info->depth, lv);
       }
       hz_height /= 2;
       break;
@@ -1136,8 +1133,7 @@ img_calculate_hiz_size(struct ilo_image *img,
     */
    hz_clear_w = 8;
    hz_clear_h = 4;
-   switch (templ->nr_samples) {
-   case 0:
+   switch (info->sample_count) {
    case 1:
    default:
       break;
@@ -1158,7 +1154,7 @@ img_calculate_hiz_size(struct ilo_image *img,
       break;
    }
 
-   for (lv = 0; lv <= templ->last_level; lv++) {
+   for (lv = 0; lv < info->level_count; lv++) {
       if (u_minify(img->width0, lv) % hz_clear_w ||
           u_minify(img->height0, lv) % hz_clear_h)
          break;
@@ -1166,7 +1162,7 @@ img_calculate_hiz_size(struct ilo_image *img,
    }
 
    /* we padded to allow this in img_align() */
-   if (templ->last_level == 0 && templ->array_size == 1 && templ->depth0 == 1)
+   if (info->level_count == 1 && info->array_size == 1 && info->depth == 1)
       img->aux.enables |= 0x1;
 
    /* align to Y-tile */
@@ -1178,13 +1174,13 @@ static void
 img_calculate_mcs_size(struct ilo_image *img,
                        const struct ilo_image_params *params)
 {
-   const struct pipe_resource *templ = params->templ;
+   const struct ilo_image_info *info = params->info;
    int mcs_width, mcs_height, mcs_cpp;
    int downscale_x, downscale_y;
 
    assert(img->aux.type == ILO_IMAGE_AUX_MCS);
 
-   if (templ->nr_samples > 1) {
+   if (info->sample_count > 1) {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 326, the clear
        * rectangle is scaled down by 8x2 for 4X MSAA and 2x2 for 8X MSAA.  The
@@ -1198,7 +1194,7 @@ img_calculate_mcs_size(struct ilo_image *img,
        * RT.  Similarly, we could reason that an OWord in 4X MCS maps to a 8x2
        * pixel block in the RT.
        */
-      switch (templ->nr_samples) {
+      switch (info->sample_count) {
       case 2:
       case 4:
          downscale_x = 8;
@@ -1295,13 +1291,13 @@ img_calculate_mcs_size(struct ilo_image *img,
       mcs_cpp = 16; /* an OWord */
    }
 
-   img->aux.enables = (1 << (templ->last_level + 1)) - 1;
+   img->aux.enables = (1 << info->level_count) - 1;
    /* align to Y-tile */
    img->aux.bo_stride = align(mcs_width * mcs_cpp, 128);
    img->aux.bo_height = align(mcs_height, 32);
 }
 
-static void
+static bool
 img_init(struct ilo_image *img,
          struct ilo_image_params *params)
 {
@@ -1318,7 +1314,7 @@ img_init(struct ilo_image *img,
    img_align(img, params);
    img_calculate_bo_size(img, params);
 
-   img->scanout = (params->templ->bind & PIPE_BIND_SCANOUT);
+   img->scanout = params->info->bind_scanout;
 
    switch (img->aux.type) {
    case ILO_IMAGE_AUX_HIZ:
@@ -1330,6 +1326,8 @@ img_init(struct ilo_image *img,
    default:
       break;
    }
+
+   return true;
 }
 
 /**
@@ -1339,29 +1337,29 @@ img_init(struct ilo_image *img,
 static void
 img_init_for_transfer(struct ilo_image *img,
                       const struct ilo_dev *dev,
-                      const struct pipe_resource *templ)
+                      const struct ilo_image_info *info)
 {
-   const unsigned num_layers = (templ->target == PIPE_TEXTURE_3D) ?
-      templ->depth0 : templ->array_size;
+   const unsigned num_layers = (info->target == PIPE_TEXTURE_3D) ?
+      info->depth : info->array_size;
    unsigned layer_width, layer_height;
 
-   assert(templ->last_level == 0);
-   assert(templ->nr_samples <= 1);
+   assert(info->level_count == 1);
+   assert(info->sample_count == 1);
 
    img->aux.type = ILO_IMAGE_AUX_NONE;
 
-   img->target = templ->target;
-   img->width0 = templ->width0;
-   img->height0 = templ->height0;
-   img->depth0 = templ->depth0;
-   img->array_size = templ->array_size;
+   img->target = info->target;
+   img->width0 = info->width;
+   img->height0 = info->height;
+   img->depth0 = info->depth;
+   img->array_size = info->array_size;
    img->level_count = 1;
    img->sample_count = 1;
 
-   img->format = templ->format;
-   img->block_width = util_format_get_blockwidth(templ->format);
-   img->block_height = util_format_get_blockheight(templ->format);
-   img->block_size = util_format_get_blocksize(templ->format);
+   img->format = info->format;
+   img->block_width = util_format_get_blockwidth(info->format);
+   img->block_height = util_format_get_blockheight(info->format);
+   img->block_size = util_format_get_blocksize(info->format);
 
    img->walk = ILO_IMAGE_WALK_LOD;
 
@@ -1374,8 +1372,8 @@ img_init_for_transfer(struct ilo_image *img,
           util_is_power_of_two(img->block_height));
 
    /* use packed layout */
-   layer_width = align(templ->width0, img->align_i);
-   layer_height = align(templ->height0, img->align_j);
+   layer_width = align(info->width, img->align_i);
+   layer_height = align(info->height, img->align_j);
 
    img->lods[0].slice_width = layer_width;
    img->lods[0].slice_height = layer_height;
@@ -1386,66 +1384,57 @@ img_init_for_transfer(struct ilo_image *img,
    img->bo_height = (layer_height / img->block_height) * num_layers;
 }
 
+static bool
+img_is_bind_gpu(const struct ilo_image_info *info)
+{
+   return (info->bind_surface_sampler ||
+           info->bind_surface_dp_render ||
+           info->bind_surface_dp_typed ||
+           info->bind_zs ||
+           info->bind_scanout ||
+           info->bind_cursor);
+}
+
 /**
  * Initialize the image.  Callers should zero-initialize \p img first.
  */
-void ilo_image_init(struct ilo_image *img,
-                    const struct ilo_dev *dev,
-                    const struct pipe_resource *templ)
+bool
+ilo_image_init(struct ilo_image *img,
+               const struct ilo_dev *dev,
+               const struct ilo_image_info *info)
 {
    struct ilo_image_params params;
-   bool transfer_only;
 
    assert(ilo_is_zeroed(img, sizeof(*img)));
 
    /* use transfer layout when the texture is never bound to GPU */
-   transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE |
-                                     PIPE_BIND_TRANSFER_READ));
-   if (transfer_only && templ->last_level == 0 && templ->nr_samples <= 1) {
-      img_init_for_transfer(img, dev, templ);
-      return;
+   if (!img_is_bind_gpu(info) &&
+       info->level_count == 1 &&
+       info->sample_count == 1) {
+      img_init_for_transfer(img, dev, info);
+      return true;
    }
 
    memset(&params, 0, sizeof(params));
    params.dev = dev;
-   params.templ = templ;
-   params.valid_tilings = IMAGE_TILING_ALL;
+   params.info = info;
+   params.valid_tilings = (info->valid_tilings) ?
+      info->valid_tilings : IMAGE_TILING_ALL;
 
-   img_init(img, &params);
-}
-
-bool
-ilo_image_init_for_imported(struct ilo_image *img,
-                            const struct ilo_dev *dev,
-                            const struct pipe_resource *templ,
-                            enum gen_surface_tiling tiling,
-                            unsigned bo_stride)
-{
-   struct ilo_image_params params;
-
-   assert(ilo_is_zeroed(img, sizeof(*img)));
-
-   if ((tiling == GEN6_TILING_X && bo_stride % 512) ||
-       (tiling == GEN6_TILING_Y && bo_stride % 128) ||
-       (tiling == GEN8_TILING_W && bo_stride % 64))
+   if (!img_init(img, &params))
       return false;
 
-   memset(&params, 0, sizeof(params));
-   params.dev = dev;
-   params.templ = templ;
-   params.valid_tilings = 1 << tiling;
+   if (info->force_bo_stride) {
+      if ((img->tiling == GEN6_TILING_X && info->force_bo_stride % 512) ||
+          (img->tiling == GEN6_TILING_Y && info->force_bo_stride % 128) ||
+          (img->tiling == GEN8_TILING_W && info->force_bo_stride % 64))
+         return false;
 
-   img_init(img, &params);
+      if (img->bo_stride > info->force_bo_stride)
+         return false;
 
-   assert(img->tiling == tiling);
-   if (img->bo_stride > bo_stride)
-      return false;
-
-   img->bo_stride = bo_stride;
-
-   /* assume imported RTs are also scanouts */
-   if (!img->scanout)
-      img->scanout = (templ->bind & PIPE_BIND_RENDER_TARGET);
+      img->bo_stride = info->force_bo_stride;
+   }
 
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index e5dcc4319b6..e488bef0d3f 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -67,6 +67,36 @@ enum ilo_image_walk_type {
    ILO_IMAGE_WALK_3D,
 };
 
+struct ilo_image_info {
+   enum pipe_texture_target target;
+
+   enum pipe_format format;
+
+   /* image size */
+   uint16_t width;
+   uint16_t height;
+   uint16_t depth;
+   uint16_t array_size;
+   uint8_t level_count;
+   uint8_t sample_count;
+
+   /* disable optional aux */
+   bool aux_disable;
+
+   /* tilings to consider, if any bit is set */
+   uint8_t valid_tilings;
+
+   /* force a stride */
+   uint32_t force_bo_stride;
+
+   bool bind_surface_sampler;
+   bool bind_surface_dp_render;
+   bool bind_surface_dp_typed;
+   bool bind_zs;
+   bool bind_scanout;
+   bool bind_cursor;
+};
+
 /*
  * When the walk type is ILO_IMAGE_WALK_LAYER, there is only a slice in each
  * LOD and this is used to describe LODs in the first array layer.  Otherwise,
@@ -143,19 +173,10 @@ struct ilo_image {
    } aux;
 };
 
-struct pipe_resource;
-
-void
+bool
 ilo_image_init(struct ilo_image *img,
                const struct ilo_dev *dev,
-               const struct pipe_resource *templ);
-
-bool
-ilo_image_init_for_imported(struct ilo_image *img,
-                            const struct ilo_dev *dev,
-                            const struct pipe_resource *templ,
-                            enum gen_surface_tiling tiling,
-                            unsigned bo_stride);
+               const struct ilo_image_info *info);
 
 static inline bool
 ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level)
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 3b8e607862c..0b0f69c30be 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -87,6 +87,38 @@ resource_get_cpu_init(const struct pipe_resource *templ)
                           PIPE_BIND_STREAM_OUTPUT)) ? false : true;
 }
 
+static void
+resource_get_image_info(const struct pipe_resource *templ,
+                        const struct ilo_dev *dev,
+                        enum pipe_format image_format,
+                        struct ilo_image_info *info)
+{
+   memset(info, 0, sizeof(*info));
+
+   info->target = templ->target;
+   info->format = image_format;
+
+   info->width = templ->width0;
+   info->height = templ->height0;
+   info->depth = templ->depth0;
+   info->array_size = templ->array_size;
+   info->level_count = templ->last_level + 1;
+   info->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
+
+   info->aux_disable = (templ->usage == PIPE_USAGE_STAGING);
+
+   if (templ->bind & PIPE_BIND_LINEAR)
+      info->valid_tilings = 1 << GEN6_TILING_NONE;
+
+   info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW);
+   info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET);
+   info->bind_surface_dp_typed = (templ->bind &
+         (PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE));
+   info->bind_zs = (templ->bind & PIPE_BIND_DEPTH_STENCIL);
+   info->bind_scanout = (templ->bind & PIPE_BIND_SCANOUT);
+   info->bind_cursor = (templ->bind & PIPE_BIND_CURSOR);
+}
+
 static enum gen_surface_tiling
 winsys_to_surface_tiling(enum intel_tiling_mode tiling)
 {
@@ -306,9 +338,10 @@ tex_alloc_bos(struct ilo_texture *tex)
    return true;
 }
 
-static bool
+static struct intel_bo *
 tex_import_handle(struct ilo_texture *tex,
-                  const struct winsys_handle *handle)
+                  const struct winsys_handle *handle,
+                  struct ilo_image_info *info)
 {
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    const struct pipe_resource *templ = &tex->base;
@@ -319,23 +352,24 @@ tex_import_handle(struct ilo_texture *tex,
 
    bo = intel_winsys_import_handle(is->dev.winsys, name, handle,
          tex->image.bo_height, &tiling, &pitch);
-   if (!bo)
-      return false;
+   /* modify image info */
+   if (bo) {
+      const uint8_t valid_tilings = 1 << winsys_to_surface_tiling(tiling);
 
-   if (!ilo_image_init_for_imported(&tex->image, &is->dev, templ,
-            winsys_to_surface_tiling(tiling), pitch)) {
-      ilo_err("failed to import handle for texture\n");
-      intel_bo_unref(bo);
-      return false;
+      if (info->valid_tilings && !(info->valid_tilings & valid_tilings)) {
+         intel_bo_unref(bo);
+         return NULL;
+      }
+
+      info->valid_tilings = valid_tilings;
+      info->force_bo_stride = pitch;
+
+      /* assume imported RTs are also scanouts */
+      if (!info->bind_scanout)
+         info->bind_scanout = (templ->usage & PIPE_BIND_RENDER_TARGET);
    }
 
-   ilo_vma_init(&tex->vma, &is->dev,
-         tex->image.bo_stride * tex->image.bo_height, 4096);
-   ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0);
-
-   tex->imported = true;
-
-   return true;
+   return bo;
 }
 
 static bool
@@ -345,18 +379,33 @@ tex_init_image(struct ilo_texture *tex,
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    const struct pipe_resource *templ = &tex->base;
    struct ilo_image *img = &tex->image;
+   struct intel_bo *imported_bo = NULL;;
+   struct ilo_image_info info;
+
+   resource_get_image_info(templ, &is->dev, templ->format, &info);
 
    if (handle) {
-      if (!tex_import_handle(tex, handle))
+      imported_bo = tex_import_handle(tex, handle, &info);
+      if (!imported_bo)
          return false;
-   } else {
-      ilo_image_init(img, &is->dev, templ);
-      ilo_vma_init(&tex->vma, &is->dev,
-            img->bo_stride * img->bo_height, 4096);
    }
 
-   if (img->bo_height > ilo_max_resource_size / img->bo_stride)
+   if (!ilo_image_init(img, &is->dev, &info)) {
+      intel_bo_unref(imported_bo);
       return false;
+   }
+
+   if (img->bo_height > ilo_max_resource_size / img->bo_stride ||
+       !ilo_vma_init(&tex->vma, &is->dev, img->bo_stride * img->bo_height,
+          4096)) {
+      intel_bo_unref(imported_bo);
+      return false;
+   }
+
+   if (imported_bo) {
+      ilo_vma_set_bo(&tex->vma, &is->dev, imported_bo, 0);
+      tex->imported = true;
+   }
 
    if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
       /* require on-the-fly tiling/untiling or format conversion */
@@ -500,13 +549,17 @@ static boolean
 ilo_can_create_resource(struct pipe_screen *screen,
                         const struct pipe_resource *templ)
 {
+   struct ilo_screen *is = ilo_screen(screen);
+   struct ilo_image_info info;
    struct ilo_image img;
 
    if (templ->target == PIPE_BUFFER)
       return (templ->width0 <= ilo_max_resource_size);
 
+   resource_get_image_info(templ, &is->dev, templ->format, &info);
+
    memset(&img, 0, sizeof(img));
-   ilo_image_init(&img, &ilo_screen(screen)->dev, templ);
+   ilo_image_init(&img, &ilo_screen(screen)->dev, &info);
 
    return (img.bo_height <= ilo_max_resource_size / img.bo_stride);
 }

From dc2e92b2d3d216fc9657f2ef594d7c5d0b03370e Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Wed, 24 Jun 2015 22:47:30 +0800
Subject: [PATCH 0041/1208] ilo: replace pipe_texture_target by
 gen_surface_type

Replace pipe_texture_target by gen_surface_type in ilo_image.  Change how
GEN6_SURFTYPE_CUBE is specified in ilo_state_surface and ilo_state_zs.
---
 src/gallium/drivers/ilo/core/ilo_image.c      | 18 ++--
 src/gallium/drivers/ilo/core/ilo_image.h      |  4 +-
 .../drivers/ilo/core/ilo_state_surface.c      | 79 ++++++------------
 .../drivers/ilo/core/ilo_state_surface.h      |  3 +-
 src/gallium/drivers/ilo/core/ilo_state_zs.c   | 83 ++++++-------------
 src/gallium/drivers/ilo/core/ilo_state_zs.h   |  4 +-
 src/gallium/drivers/ilo/ilo_resource.c        | 24 +++++-
 src/gallium/drivers/ilo/ilo_state.c           | 10 ++-
 8 files changed, 99 insertions(+), 126 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 39c6daaafd3..9ec6792146f 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -514,7 +514,7 @@ img_init_walk_gen7(struct ilo_image *img,
        *     "note that the depth buffer and stencil buffer have an implied
        *      value of ARYSPC_FULL"
        */
-      img->walk = (info->target == PIPE_TEXTURE_3D) ?
+      img->walk = (info->type == GEN6_SURFTYPE_3D) ?
          ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER;
 
       img->interleaved_samples = true;
@@ -533,7 +533,7 @@ img_init_walk_gen7(struct ilo_image *img,
          assert(info->level_count == 1);
 
       img->walk =
-         (info->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
+         (info->type == GEN6_SURFTYPE_3D) ? ILO_IMAGE_WALK_3D :
          (info->level_count > 1) ? ILO_IMAGE_WALK_LAYER :
          ILO_IMAGE_WALK_LOD;
 
@@ -557,7 +557,7 @@ img_init_walk_gen6(struct ilo_image *img,
     * GEN6 does not support compact spacing otherwise.
     */
    img->walk =
-      (params->info->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
+      (params->info->type == GEN6_SURFTYPE_3D) ? ILO_IMAGE_WALK_3D :
       (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD :
       ILO_IMAGE_WALK_LAYER;
 
@@ -674,7 +674,7 @@ img_init_size_and_format(struct ilo_image *img,
    enum pipe_format format = info->format;
    bool require_separate_stencil = false;
 
-   img->target = info->target;
+   img->type = info->type;
    img->width0 = info->width;
    img->height0 = info->height;
    img->depth0 = info->depth;
@@ -737,7 +737,7 @@ img_want_mcs(const struct ilo_image *img,
    if (ilo_dev_gen(params->dev) < ILO_GEN(7))
       return false;
 
-   if (info->target != PIPE_TEXTURE_2D || !info->bind_surface_dp_render)
+   if (info->type != GEN6_SURFTYPE_2D || !info->bind_surface_dp_render)
       return false;
 
    /*
@@ -799,7 +799,7 @@ img_want_hiz(const struct ilo_image *img,
       return false;
 
    /* we want 8x4 aligned levels */
-   if (info->target == PIPE_TEXTURE_1D)
+   if (info->type == GEN6_SURFTYPE_1D)
       return false;
 
    if (!info->bind_zs)
@@ -865,7 +865,7 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
       align_w = MAX2(align_w, img->align_i);
       align_h = MAX2(align_h, img->align_j);
 
-      if (info->target == PIPE_TEXTURE_CUBE)
+      if (info->type == GEN6_SURFTYPE_CUBE)
          pad_h += 2;
 
       if (params->compressed)
@@ -1339,7 +1339,7 @@ img_init_for_transfer(struct ilo_image *img,
                       const struct ilo_dev *dev,
                       const struct ilo_image_info *info)
 {
-   const unsigned num_layers = (info->target == PIPE_TEXTURE_3D) ?
+   const unsigned num_layers = (info->type == GEN6_SURFTYPE_3D) ?
       info->depth : info->array_size;
    unsigned layer_width, layer_height;
 
@@ -1348,7 +1348,7 @@ img_init_for_transfer(struct ilo_image *img,
 
    img->aux.type = ILO_IMAGE_AUX_NONE;
 
-   img->target = info->target;
+   img->type = info->type;
    img->width0 = info->width;
    img->height0 = info->height;
    img->depth0 = info->depth;
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index e488bef0d3f..1c4f86c78da 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -68,7 +68,7 @@ enum ilo_image_walk_type {
 };
 
 struct ilo_image_info {
-   enum pipe_texture_target target;
+   enum gen_surface_type type;
 
    enum pipe_format format;
 
@@ -117,7 +117,7 @@ struct ilo_image_lod {
  * Texture layout.
  */
 struct ilo_image {
-   enum pipe_texture_target target;
+   enum gen_surface_type type;
 
    /* size, format, etc for programming hardware states */
    unsigned width0;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index 2caba6df46e..40fe15f316f 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -425,29 +425,6 @@ surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
    return true;
 }
 
-static enum gen_surface_type
-get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (img->target) {
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return GEN6_SURFTYPE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return GEN6_SURFTYPE_2D;
-   case PIPE_TEXTURE_3D:
-      return GEN6_SURFTYPE_3D;
-   default:
-      assert(!"unknown texture target");
-      return GEN6_SURFTYPE_NULL;
-   }
-}
-
 static bool
 surface_validate_gen6_image(const struct ilo_dev *dev,
                             const struct ilo_state_surface_image_info *info)
@@ -487,17 +464,19 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
    assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 &&
           info->img->width0 <= info->img->bo_stride);
 
-   if (info->is_cube_map) {
-      assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D);
-
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 78:
-       *
-       *     "For cube maps, Width must be set equal to the Height."
-       */
-      assert(info->img->width0 == info->img->height0);
+   if (info->type != info->img->type) {
+      assert(info->type == GEN6_SURFTYPE_2D &&
+             info->img->type == GEN6_SURFTYPE_CUBE);
    }
 
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+    *
+    *     "For cube maps, Width must be set equal to the Height."
+    */
+   if (info->type == GEN6_SURFTYPE_CUBE)
+      assert(info->img->width0 == info->img->height0);
+
    /*
     * From the Sandy Bridge PRM, volume 4 part 1, page 72:
     *
@@ -532,20 +511,21 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
 }
 
 static void
-get_gen6_max_extent(const struct ilo_dev *dev,
-                    const struct ilo_image *img,
-                    uint16_t *max_w, uint16_t *max_h)
+surface_get_gen6_image_max_extent(const struct ilo_dev *dev,
+                                  const struct ilo_state_surface_image_info *info,
+                                  uint16_t *max_w, uint16_t *max_h)
 {
    const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
       *max_w = max_size;
       *max_h = 1;
       break;
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       *max_w = max_size;
       *max_h = max_size;
       break;
@@ -573,7 +553,7 @@ surface_get_gen6_image_extent(const struct ilo_dev *dev,
    w = info->img->width0;
    h = info->img->height0;
 
-   get_gen6_max_extent(dev, info->img, &max_w, &max_h);
+   surface_get_gen6_image_max_extent(dev, info, &max_w, &max_h);
    assert(w && h && w <= max_w && h <= max_h);
 
    *width = w - 1;
@@ -624,16 +604,17 @@ surface_get_gen6_image_slices(const struct ilo_dev *dev,
     * layers to (86 * 6), about 512.
     */
 
-   switch (get_gen6_surface_type(dev, info->img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512;
 
       assert(info->img->array_size <= max_slice);
       max_slice = info->img->array_size;
 
       d = info->slice_count;
-      if (info->is_cube_map) {
+      if (info->type == GEN6_SURFTYPE_CUBE) {
          if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
             if (!d || d % 6) {
                ilo_warn("invalid cube slice count\n");
@@ -946,7 +927,6 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    uint8_t min_lod, mip_count;
    enum gen_sample_count sample_count;
    uint32_t alignments;
-   enum gen_surface_type type;
    uint32_t dw0, dw2, dw3, dw4, dw5;
 
    ILO_DEV_ASSERT(dev, 6, 6);
@@ -966,10 +946,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    if (info->img->sample_count > 1)
       assert(info->img->interleaved_samples);
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-      get_gen6_surface_type(dev, info->img);
-
-   dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT |
+   dw0 = info->type << GEN6_SURFACE_DW0_TYPE__SHIFT |
          info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
          GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
 
@@ -996,7 +973,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
     *     "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
     *      field must be programmed to 111111b (all faces enabled)."
     */
-   if (info->is_cube_map &&
+   if (info->type == GEN6_SURFTYPE_CUBE &&
        info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
       dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE |
              GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
@@ -1025,7 +1002,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    surf->surface[4] = dw4;
    surf->surface[5] = dw5;
 
-   surf->type = type;
+   surf->type = info->type;
    surf->min_lod = min_lod;
    surf->mip_count = mip_count;
 
@@ -1041,7 +1018,6 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
    uint8_t min_lod, mip_count;
    uint32_t alignments;
    enum gen_sample_count sample_count;
-   enum gen_surface_type type;
    uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7;
 
    ILO_DEV_ASSERT(dev, 7, 8);
@@ -1055,10 +1031,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
        !surface_get_gen6_image_alignments(dev, info, &alignments))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-      get_gen6_surface_type(dev, info->img);
-
-   dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+   dw0 = info->type << GEN7_SURFACE_DW0_TYPE__SHIFT |
          info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT |
          alignments;
 
@@ -1092,7 +1065,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
     *      field must be programmed to 111111b (all faces enabled). This field
     *      is ignored unless the Surface Type is SURFTYPE_CUBE."
     */
-   if (info->is_cube_map &&
+   if (info->type == GEN6_SURFTYPE_CUBE &&
        info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER)
       dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
 
@@ -1156,7 +1129,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
       surf->surface[12] = 0;
    }
 
-   surf->type = type;
+   surf->type = info->type;
    surf->min_lod = min_lod;
    surf->mip_count = mip_count;
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
index 835df69882e..e78c7c97db1 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -72,11 +72,12 @@ struct ilo_state_surface_image_info {
 
    enum ilo_state_surface_access access;
 
+   enum gen_surface_type type;
+
    enum gen_surface_format format;
    bool is_integer;
 
    bool readonly;
-   bool is_cube_map;
    bool is_array;
 };
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
index 7b82f1acf6f..306e6d9aedc 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -60,29 +60,6 @@ zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    return true;
 }
 
-static enum gen_surface_type
-get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (img->target) {
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return GEN6_SURFTYPE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return GEN6_SURFTYPE_2D;
-   case PIPE_TEXTURE_3D:
-      return GEN6_SURFTYPE_3D;
-   default:
-      assert(!"unknown texture target");
-      return GEN6_SURFTYPE_NULL;
-   }
-}
-
 static enum gen_depth_format
 get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img)
 {
@@ -148,50 +125,52 @@ zs_validate_gen6(const struct ilo_dev *dev,
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 315:
     *
-    *      The stencil buffer has a format of S8_UINT, and shares Surface
+    *     "The stencil buffer has a format of S8_UINT, and shares Surface
     *      Type, Height, Width, and Depth, Minimum Array Element, Render
     *      Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth
-    *      Buffer Object Control State fields of the depth buffer.
+    *      Buffer Object Control State fields of the depth buffer."
     */
-   if (info->z_img == info->s_img) {
-      assert(info->z_img->target == info->s_img->target &&
-             info->z_img->width0 == info->s_img->width0 &&
+   if (info->z_img && info->s_img && info->z_img != info->s_img) {
+      assert(info->z_img->type == info->s_img->type &&
              info->z_img->height0 == info->s_img->height0 &&
              info->z_img->depth0 == info->s_img->depth0);
    }
 
+   if (info->type != img->type) {
+      assert(info->type == GEN6_SURFTYPE_2D &&
+             img->type == GEN6_SURFTYPE_CUBE);
+   }
+
    assert(info->level < img->level_count);
    assert(img->bo_stride);
 
-   if (info->is_cube_map) {
-      assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
-
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 323:
-       *
-       *     "For cube maps, Width must be set equal to Height."
-       */
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 323:
+    *
+    *     "For cube maps, Width must be set equal to Height."
+    */
+   if (info->type == GEN6_SURFTYPE_CUBE)
       assert(img->width0 == img->height0);
-   }
 
    return true;
 }
 
 static void
-get_gen6_max_extent(const struct ilo_dev *dev,
-                    const struct ilo_image *img,
-                    uint16_t *max_w, uint16_t *max_h)
+zs_get_gen6_max_extent(const struct ilo_dev *dev,
+                       const struct ilo_state_zs_info *info,
+                       uint16_t *max_w, uint16_t *max_h)
 {
    const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
       *max_w = max_size;
       *max_h = 1;
       break;
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       *max_w = max_size;
       *max_h = max_size;
       break;
@@ -297,7 +276,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev,
       h = align(h, align_h);
    }
 
-   get_gen6_max_extent(dev, img, &max_w, &max_h);
+   zs_get_gen6_max_extent(dev, info, &max_w, &max_h);
    assert(w && h && w <= max_w && h <= max_h);
 
    *width = w - 1;
@@ -326,16 +305,17 @@ zs_get_gen6_depth_slices(const struct ilo_dev *dev,
     *      surfaces. If the volume texture is MIP-mapped, this field specifies
     *      the depth of the base MIP level."
     */
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
 
       assert(img->array_size <= max_slice);
       max_slice = img->array_size;
 
       d = info->slice_count;
-      if (info->is_cube_map) {
+      if (info->type == GEN6_SURFTYPE_CUBE) {
          /*
           * Minumum Array Element and Depth must be 0; Render Target View
           * Extent is ignored.
@@ -415,7 +395,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_state_zs_info *info)
 {
    uint16_t width, height, depth, array_base, view_extent;
-   enum gen_surface_type type;
    enum gen_depth_format format;
    uint32_t dw1, dw2, dw3, dw4;
 
@@ -427,10 +406,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
-                          get_gen6_surface_type(dev, info->s_img);
-
    format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
       GEN6_ZFORMAT_D32_FLOAT;
 
@@ -450,7 +425,7 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
       format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
 
    /* info->z_readonly and info->s_readonly are ignored on Gen6 */
-   dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT |
+   dw1 = info->type << GEN6_DEPTH_DW1_TYPE__SHIFT |
          GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
          format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
 
@@ -488,7 +463,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_dev *dev,
                                  const struct ilo_state_zs_info *info)
 {
-   enum gen_surface_type type;
    enum gen_depth_format format;
    uint16_t width, height, depth;
    uint16_t array_base, view_extent;
@@ -502,14 +476,10 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
-                          get_gen6_surface_type(dev, info->s_img);
-
    format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
       GEN6_ZFORMAT_D32_FLOAT;
 
-   dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT |
+   dw1 = info->type << GEN7_DEPTH_DW1_TYPE__SHIFT |
          format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
 
    if (info->z_img) {
@@ -714,6 +684,7 @@ ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
    struct ilo_state_zs_info info;
 
    memset(&info, 0, sizeof(info));
+   info.type = GEN6_SURFTYPE_NULL;
 
    return ilo_state_zs_init(zs, dev, &info);
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
index 6f32b7e2efe..d78a12ad516 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -48,11 +48,11 @@ struct ilo_state_zs_info {
    const struct ilo_vma *s_vma;
    const struct ilo_vma *hiz_vma;
 
+   enum gen_surface_type type;
+
    /* ignored prior to Gen7 */
    bool z_readonly;
    bool s_readonly;
-
-   bool is_cube_map;
 };
 
 struct ilo_state_zs {
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 0b0f69c30be..9ef53213b17 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -87,6 +87,28 @@ resource_get_cpu_init(const struct pipe_resource *templ)
                           PIPE_BIND_STREAM_OUTPUT)) ? false : true;
 }
 
+static enum gen_surface_type
+get_surface_type(enum pipe_texture_target target)
+{
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return GEN6_SURFTYPE_1D;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_2D_ARRAY:
+      return GEN6_SURFTYPE_2D;
+   case PIPE_TEXTURE_3D:
+      return GEN6_SURFTYPE_3D;
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return GEN6_SURFTYPE_CUBE;
+   default:
+      assert(!"unknown texture target");
+      return GEN6_SURFTYPE_NULL;
+   }
+}
+
 static void
 resource_get_image_info(const struct pipe_resource *templ,
                         const struct ilo_dev *dev,
@@ -95,7 +117,7 @@ resource_get_image_info(const struct pipe_resource *templ,
 {
    memset(info, 0, sizeof(*info));
 
-   info->target = templ->target;
+   info->type = get_surface_type(templ->target);
    info->format = image_format;
 
    info->width = templ->width0;
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 24ab59aa32b..445df1966f9 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -2045,6 +2045,7 @@ ilo_create_sampler_view(struct pipe_context *pipe,
 
       info.vma = &tex->vma;
       info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+      info.type = tex->image.type;
 
       if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
           tex->image.separate_stencil) {
@@ -2054,8 +2055,6 @@ ilo_create_sampler_view(struct pipe_context *pipe,
          info.format = ilo_format_translate_texture(dev, templ->format);
       }
 
-      info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE ||
-                          tex->image.target == PIPE_TEXTURE_CUBE_ARRAY);
       info.is_array = util_resource_is_array_texture(&tex->base);
       info.readonly = true;
 
@@ -2116,6 +2115,10 @@ ilo_create_surface(struct pipe_context *pipe,
          info.aux_vma = &tex->aux_vma;
 
       info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
+
+      info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ?
+         GEN6_SURFTYPE_2D : tex->image.type;
+
       info.format = ilo_format_translate_render(dev, templ->format);
       info.is_array = util_resource_is_array_texture(&tex->base);
 
@@ -2148,6 +2151,9 @@ ilo_create_surface(struct pipe_context *pipe,
       info.slice_count = templ->u.tex.last_layer -
          templ->u.tex.first_layer + 1;
 
+      info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ?
+         GEN6_SURFTYPE_2D : tex->image.type;
+
       ilo_state_zs_init(&surf->u.zs, dev, &info);
    }
 

From 2ee95f6d64aca9e9490c1ac293dd711b5f60a16b Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Thu, 25 Jun 2015 07:43:47 +0800
Subject: [PATCH 0042/1208] ilo: always use the specified image format

Move silent promotion of PIPE_FORMAT_ETC1_RGB8 or combined depth/stencil out
of core.
---
 src/gallium/drivers/ilo/core/ilo_image.c |  80 ++++++------------
 src/gallium/drivers/ilo/core/ilo_image.h |   4 +-
 src/gallium/drivers/ilo/ilo_resource.c   | 100 ++++++++++++++++++++---
 src/gallium/drivers/ilo/ilo_state.c      |   2 +-
 4 files changed, 116 insertions(+), 70 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 9ec6792146f..7da5debcb37 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -671,10 +671,30 @@ img_init_size_and_format(struct ilo_image *img,
                          struct ilo_image_params *params)
 {
    const struct ilo_image_info *info = params->info;
-   enum pipe_format format = info->format;
-   bool require_separate_stencil = false;
 
    img->type = info->type;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 314:
+    *
+    *     "The separate stencil buffer is always enabled, thus the field in
+    *      3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil
+    *      buffer has been removed Surface formats with interleaved depth and
+    *      stencil are no longer supported"
+    */
+   if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && info->bind_zs) {
+      const struct util_format_description *desc =
+         util_format_description(info->format);
+
+      assert(info->format == PIPE_FORMAT_S8_UINT ||
+             !util_format_has_stencil(desc));
+   }
+
+   img->format = info->format;
+   img->block_width = util_format_get_blockwidth(info->format);
+   img->block_height = util_format_get_blockheight(info->format);
+   img->block_size = util_format_get_blocksize(info->format);
+
    img->width0 = info->width;
    img->height0 = info->height;
    img->depth0 = info->depth;
@@ -682,46 +702,6 @@ img_init_size_and_format(struct ilo_image *img,
    img->level_count = info->level_count;
    img->sample_count = info->sample_count;
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-    *
-    *     "This field (Separate Stencil Buffer Enable) must be set to the same
-    *      value (enabled or disabled) as Hierarchical Depth Buffer Enable."
-    *
-    * GEN7+ requires separate stencil buffers.
-    */
-   if (info->bind_zs) {
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-         require_separate_stencil = true;
-      else
-         require_separate_stencil = (img->aux.type == ILO_IMAGE_AUX_HIZ);
-   }
-
-   switch (format) {
-   case PIPE_FORMAT_ETC1_RGB8:
-      format = PIPE_FORMAT_R8G8B8X8_UNORM;
-      break;
-   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-      if (require_separate_stencil) {
-         format = PIPE_FORMAT_Z24X8_UNORM;
-         img->separate_stencil = true;
-      }
-      break;
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      if (require_separate_stencil) {
-         format = PIPE_FORMAT_Z32_FLOAT;
-         img->separate_stencil = true;
-      }
-      break;
-   default:
-      break;
-   }
-
-   img->format = format;
-   img->block_width = util_format_get_blockwidth(format);
-   img->block_height = util_format_get_blockheight(format);
-   img->block_size = util_format_get_blocksize(format);
-
    params->valid_tilings = img_get_valid_tilings(img, params);
    params->compressed = util_format_is_compressed(img->format);
 }
@@ -805,19 +785,7 @@ img_want_hiz(const struct ilo_image *img,
    if (!info->bind_zs)
       return false;
 
-   if (!util_format_has_depth(desc))
-      return false;
-
-   /*
-    * As can be seen in img_calculate_hiz_size(), HiZ may not be enabled
-    * for every level.  This is generally fine except on GEN6, where HiZ and
-    * separate stencil are enabled and disabled at the same time.  When the
-    * format is PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, enabling and disabling HiZ
-    * can result in incompatible formats.
-    */
-   if (ilo_dev_gen(params->dev) == ILO_GEN(6) &&
-       info->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-       info->level_count > 1)
+   if (!util_format_has_depth(desc) || util_format_has_stencil(desc))
       return false;
 
    return true;
@@ -1303,8 +1271,8 @@ img_init(struct ilo_image *img,
 {
    /* there are hard dependencies between every function here */
 
-   img_init_aux(img, params);
    img_init_size_and_format(img, params);
+   img_init_aux(img, params);
    img_init_walk(img, params);
    img_init_tiling(img, params);
    img_init_alignments(img, params);
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 1c4f86c78da..8a2e1438158 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -119,6 +119,8 @@ struct ilo_image_lod {
 struct ilo_image {
    enum gen_surface_type type;
 
+   enum pipe_format format;
+
    /* size, format, etc for programming hardware states */
    unsigned width0;
    unsigned height0;
@@ -126,8 +128,6 @@ struct ilo_image {
    unsigned array_size;
    unsigned level_count;
    unsigned sample_count;
-   enum pipe_format format;
-   bool separate_stencil;
 
    /*
     * width, height, and size of pixel blocks for conversion between pixel
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 9ef53213b17..11833e0c599 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -109,6 +109,45 @@ get_surface_type(enum pipe_texture_target target)
    }
 }
 
+static enum pipe_format
+resource_get_image_format(const struct pipe_resource *templ,
+                          const struct ilo_dev *dev,
+                          bool *separate_stencil_ret)
+{
+   enum pipe_format format = templ->format;
+   bool separate_stencil;
+
+   /* silently promote ETC1 */
+   if (templ->format == PIPE_FORMAT_ETC1_RGB8)
+      format = PIPE_FORMAT_R8G8B8X8_UNORM;
+
+   /* separate stencil buffers */
+   separate_stencil = false;
+   if ((templ->bind & PIPE_BIND_DEPTH_STENCIL) &&
+       util_format_is_depth_and_stencil(templ->format)) {
+      switch (templ->format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         /* Gen6 requires HiZ to be available for all levels */
+         if (ilo_dev_gen(dev) >= ILO_GEN(7) || templ->last_level == 0) {
+            format = PIPE_FORMAT_Z32_FLOAT;
+            separate_stencil = true;
+         }
+         break;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         format = PIPE_FORMAT_Z24X8_UNORM;
+         separate_stencil = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   if (separate_stencil_ret)
+      *separate_stencil_ret = separate_stencil;
+
+   return format;
+}
+
 static void
 resource_get_image_info(const struct pipe_resource *templ,
                         const struct ilo_dev *dev,
@@ -340,10 +379,6 @@ tex_alloc_bos(struct ilo_texture *tex)
    if (!tex->imported && !tex_create_bo(tex))
       return false;
 
-   /* allocate separate stencil resource */
-   if (tex->image.separate_stencil && !tex_create_separate_stencil(tex))
-      return false;
-
    switch (tex->image.aux.type) {
    case ILO_IMAGE_AUX_HIZ:
       if (!tex_create_hiz(tex))
@@ -396,15 +431,19 @@ tex_import_handle(struct ilo_texture *tex,
 
 static bool
 tex_init_image(struct ilo_texture *tex,
-               const struct winsys_handle *handle)
+               const struct winsys_handle *handle,
+               bool *separate_stencil)
 {
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    const struct pipe_resource *templ = &tex->base;
    struct ilo_image *img = &tex->image;
    struct intel_bo *imported_bo = NULL;;
+   enum pipe_format image_format;
    struct ilo_image_info info;
 
-   resource_get_image_info(templ, &is->dev, templ->format, &info);
+   image_format = resource_get_image_format(templ,
+         &is->dev, separate_stencil);
+   resource_get_image_info(templ, &is->dev, image_format, &info);
 
    if (handle) {
       imported_bo = tex_import_handle(tex, handle, &info);
@@ -417,6 +456,31 @@ tex_init_image(struct ilo_texture *tex,
       return false;
    }
 
+   /*
+    * HiZ requires 8x4 alignment and some levels might need HiZ disabled.  It
+    * is generally fine except on Gen6, where HiZ and separate stencil must be
+    * enabled together.  For PIPE_FORMAT_Z24X8_UNORM with separate stencil, we
+    * can live with stencil values being interleaved for levels where HiZ is
+    * disabled.  But it is not the case for PIPE_FORMAT_Z32_FLOAT with
+    * separate stencil.  If HiZ was disabled for a level, we had to change the
+    * format to PIPE_FORMAT_Z32_FLOAT_S8X24_UINT for the level and that format
+    * had a different bpp.  In other words, HiZ has to be available for all
+    * levels.
+    */
+   if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
+       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+       image_format == PIPE_FORMAT_Z32_FLOAT &&
+       img->aux.enables != (1 << templ->last_level)) {
+      image_format = templ->format;
+      info.format = image_format;
+
+      memset(img, 0, sizeof(*img));
+      if (!ilo_image_init(img, &is->dev, &info)) {
+         intel_bo_unref(imported_bo);
+         return false;
+      }
+   }
+
    if (img->bo_height > ilo_max_resource_size / img->bo_stride ||
        !ilo_vma_init(&tex->vma, &is->dev, img->bo_stride * img->bo_height,
           4096)) {
@@ -431,8 +495,8 @@ tex_init_image(struct ilo_texture *tex,
 
    if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
       /* require on-the-fly tiling/untiling or format conversion */
-      if (img->tiling == GEN8_TILING_W || img->separate_stencil ||
-          img->format != templ->format)
+      if (img->tiling == GEN8_TILING_W || *separate_stencil ||
+          image_format != templ->format)
          return false;
    }
 
@@ -448,6 +512,7 @@ tex_create(struct pipe_screen *screen,
            const struct winsys_handle *handle)
 {
    struct ilo_texture *tex;
+   bool separate_stencil;
 
    tex = CALLOC_STRUCT(ilo_texture);
    if (!tex)
@@ -457,12 +522,13 @@ tex_create(struct pipe_screen *screen,
    tex->base.screen = screen;
    pipe_reference_init(&tex->base.reference, 1);
 
-   if (!tex_init_image(tex, handle)) {
+   if (!tex_init_image(tex, handle, &separate_stencil)) {
       FREE(tex);
       return NULL;
    }
 
-   if (!tex_alloc_bos(tex)) {
+   if (!tex_alloc_bos(tex) ||
+       (separate_stencil && !tex_create_separate_stencil(tex))) {
       tex_destroy(tex);
       return NULL;
    }
@@ -572,17 +638,29 @@ ilo_can_create_resource(struct pipe_screen *screen,
                         const struct pipe_resource *templ)
 {
    struct ilo_screen *is = ilo_screen(screen);
+   enum pipe_format image_format;
    struct ilo_image_info info;
    struct ilo_image img;
 
    if (templ->target == PIPE_BUFFER)
       return (templ->width0 <= ilo_max_resource_size);
 
-   resource_get_image_info(templ, &is->dev, templ->format, &info);
+   image_format = resource_get_image_format(templ, &is->dev, NULL);
+   resource_get_image_info(templ, &is->dev, image_format, &info);
 
    memset(&img, 0, sizeof(img));
    ilo_image_init(&img, &ilo_screen(screen)->dev, &info);
 
+   /* as in tex_init_image() */
+   if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
+       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+       image_format == PIPE_FORMAT_Z32_FLOAT &&
+       img.aux.enables != (1 << templ->last_level)) {
+      info.format = templ->format;
+      memset(&img, 0, sizeof(img));
+      ilo_image_init(&img, &ilo_screen(screen)->dev, &info);
+   }
+
    return (img.bo_height <= ilo_max_resource_size / img.bo_stride);
 }
 
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 445df1966f9..20d3c001188 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -2048,7 +2048,7 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       info.type = tex->image.type;
 
       if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-          tex->image.separate_stencil) {
+          tex->separate_s8) {
          info.format = ilo_format_translate_texture(dev,
                PIPE_FORMAT_Z32_FLOAT);
       } else {

From cbdc26aa3f76dc20285caa7e62ca8809cb2fe638 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Thu, 25 Jun 2015 22:27:04 +0800
Subject: [PATCH 0043/1208] ilo: replace pipe_format by gen_surface_format

Replace pipe_format by gen_surface_format in ilo_image.  Change how depth
format is specified in ilo_state_zs.
---
 src/gallium/drivers/ilo/core/ilo_core.h       |  2 -
 src/gallium/drivers/ilo/core/ilo_image.c      | 70 +++++++------
 src/gallium/drivers/ilo/core/ilo_image.h      | 12 ++-
 src/gallium/drivers/ilo/core/ilo_state_zs.c   | 99 ++++++-------------
 src/gallium/drivers/ilo/core/ilo_state_zs.h   | 11 +--
 src/gallium/drivers/ilo/ilo_blitter_blt.c     |  2 +-
 .../drivers/ilo/ilo_blitter_rectlist.c        |  4 +-
 src/gallium/drivers/ilo/ilo_common.h          |  3 +
 src/gallium/drivers/ilo/ilo_format.h          | 35 +++++++
 src/gallium/drivers/ilo/ilo_resource.c        | 49 +++++++--
 src/gallium/drivers/ilo/ilo_resource.h        |  1 +
 src/gallium/drivers/ilo/ilo_state.c           |  8 +-
 src/gallium/drivers/ilo/ilo_transfer.c        | 20 ++--
 13 files changed, 174 insertions(+), 142 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index 0a7f7d9d3fe..d95a80aabd3 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -30,11 +30,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_format.h"
 
 #include "util/u_debug.h"
 #include "util/list.h"
-#include "util/u_format.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 7da5debcb37..7a1100288d3 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -45,8 +45,6 @@ struct ilo_image_params {
    const struct ilo_image_info *info;
    unsigned valid_tilings;
 
-   bool compressed;
-
    unsigned h0, h1;
    unsigned max_x, max_y;
 };
@@ -261,7 +259,7 @@ img_init_lods(struct ilo_image *img,
 
          /* every LOD begins at tile boundaries */
          if (info->level_count > 1) {
-            assert(img->format == PIPE_FORMAT_S8_UINT);
+            assert(img->format == GEN6_FORMAT_R8_UINT);
             cur_x = align(cur_x, 64);
             cur_y = align(cur_y, 64);
          }
@@ -334,7 +332,7 @@ img_init_alignments(struct ilo_image *img,
     *
     *                                  align_i        align_j
     *   compressed formats             block width    block height
-    *   PIPE_FORMAT_S8_UINT            4              2
+    *   GEN6_FORMAT_R8_UINT            4              2
     *   other depth/stencil formats    4              4
     *   4x multisampled                4              4
     *   bpp 96                         4              2
@@ -382,27 +380,27 @@ img_init_alignments(struct ilo_image *img,
     *
     *                                  align_i        align_j
     *  compressed formats              block width    block height
-    *  PIPE_FORMAT_Z16_UNORM           8              4
-    *  PIPE_FORMAT_S8_UINT             8              8
+    *  GEN6_FORMAT_R16_UNORM           8              4
+    *  GEN6_FORMAT_R8_UINT             8              8
     *  other depth/stencil formats     4              4
     *  2x or 4x multisampled           4 or 8         4
     *  tiled Y                         4 or 8         4 (if rt)
-    *  PIPE_FORMAT_R32G32B32_FLOAT     4 or 8         2
+    *  GEN6_FORMAT_R32G32B32_FLOAT     4 or 8         2
     *  others                          4 or 8         2 or 4
     */
 
-   if (params->compressed) {
+   if (info->compressed) {
       /* this happens to be the case */
       img->align_i = img->block_width;
       img->align_j = img->block_height;
    } else if (info->bind_zs) {
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) {
          switch (img->format) {
-         case PIPE_FORMAT_Z16_UNORM:
+         case GEN6_FORMAT_R16_UNORM:
             img->align_i = 8;
             img->align_j = 4;
             break;
-         case PIPE_FORMAT_S8_UINT:
+         case GEN6_FORMAT_R8_UINT:
             img->align_i = 8;
             img->align_j = 8;
             break;
@@ -413,7 +411,7 @@ img_init_alignments(struct ilo_image *img,
          }
       } else {
          switch (img->format) {
-         case PIPE_FORMAT_S8_UINT:
+         case GEN6_FORMAT_R8_UINT:
             img->align_i = 4;
             img->align_j = 2;
             break;
@@ -433,7 +431,7 @@ img_init_alignments(struct ilo_image *img,
 
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
           ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4)
-         assert(img->format != PIPE_FORMAT_R32G32B32_FLOAT);
+         assert(img->format != GEN6_FORMAT_R32G32B32_FLOAT);
 
       img->align_i = 4;
       img->align_j = (valign_4) ? 4 : 2;
@@ -558,7 +556,7 @@ img_init_walk_gen6(struct ilo_image *img,
     */
    img->walk =
       (params->info->type == GEN6_SURFTYPE_3D) ? ILO_IMAGE_WALK_3D :
-      (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD :
+      (img->format == GEN6_FORMAT_R8_UINT) ? ILO_IMAGE_WALK_LOD :
       ILO_IMAGE_WALK_LAYER;
 
    /* GEN6 supports only interleaved samples */
@@ -580,7 +578,6 @@ img_get_valid_tilings(const struct ilo_image *img,
                       const struct ilo_image_params *params)
 {
    const struct ilo_image_info *info = params->info;
-   const enum pipe_format format = img->format;
    unsigned valid_tilings = params->valid_tilings;
 
    /*
@@ -614,8 +611,8 @@ img_get_valid_tilings(const struct ilo_image *img,
     *     "W-Major Tile Format is used for separate stencil."
     */
    if (info->bind_zs) {
-      switch (format) {
-      case PIPE_FORMAT_S8_UINT:
+      switch (info->format) {
+      case GEN6_FORMAT_R8_UINT:
          valid_tilings &= IMAGE_TILING_W;
          break;
       default:
@@ -649,7 +646,7 @@ img_get_valid_tilings(const struct ilo_image *img,
        */
       if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
           ilo_dev_gen(params->dev) <= ILO_GEN(7.5) &&
-          img->format == PIPE_FORMAT_R32G32B32_FLOAT)
+          img->format == GEN6_FORMAT_R32G32B32_FLOAT)
          valid_tilings &= ~IMAGE_TILING_Y;
 
       valid_tilings &= ~IMAGE_TILING_W;
@@ -682,18 +679,13 @@ img_init_size_and_format(struct ilo_image *img,
     *      buffer has been removed Surface formats with interleaved depth and
     *      stencil are no longer supported"
     */
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && info->bind_zs) {
-      const struct util_format_description *desc =
-         util_format_description(info->format);
-
-      assert(info->format == PIPE_FORMAT_S8_UINT ||
-             !util_format_has_stencil(desc));
-   }
+   if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && info->bind_zs)
+      assert(!info->interleaved_stencil);
 
    img->format = info->format;
-   img->block_width = util_format_get_blockwidth(info->format);
-   img->block_height = util_format_get_blockheight(info->format);
-   img->block_size = util_format_get_blocksize(info->format);
+   img->block_width = info->block_width;
+   img->block_height = info->block_height;
+   img->block_size = info->block_size;
 
    img->width0 = info->width;
    img->height0 = info->height;
@@ -703,7 +695,6 @@ img_init_size_and_format(struct ilo_image *img,
    img->sample_count = info->sample_count;
 
    params->valid_tilings = img_get_valid_tilings(img, params);
-   params->compressed = util_format_is_compressed(img->format);
 }
 
 static bool
@@ -730,7 +721,7 @@ img_want_mcs(const struct ilo_image *img,
     *     "This field must be set to 0 for all SINT MSRTs when all RT channels
     *      are not written"
     */
-   if (info->sample_count > 1 && !util_format_is_pure_sint(info->format)) {
+   if (info->sample_count > 1 && !info->is_integer) {
       want_mcs = true;
    } else if (info->sample_count == 1 && !info->aux_disable) {
       /*
@@ -769,8 +760,6 @@ img_want_hiz(const struct ilo_image *img,
              const struct ilo_image_params *params)
 {
    const struct ilo_image_info *info = params->info;
-   const struct util_format_description *desc =
-      util_format_description(info->format);
 
    if (ilo_debug & ILO_DEBUG_NOHIZ)
       return false;
@@ -785,10 +774,17 @@ img_want_hiz(const struct ilo_image *img,
    if (!info->bind_zs)
       return false;
 
-   if (!util_format_has_depth(desc) || util_format_has_stencil(desc))
+   if (info->interleaved_stencil)
       return false;
 
-   return true;
+   switch (info->format) {
+   case GEN6_FORMAT_R32_FLOAT:
+   case GEN6_FORMAT_R24_UNORM_X8_TYPELESS:
+   case GEN6_FORMAT_R16_UNORM:
+      return true;
+   default:
+      return false;
+   }
 }
 
 static void
@@ -836,7 +832,7 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
       if (info->type == GEN6_SURFTYPE_CUBE)
          pad_h += 2;
 
-      if (params->compressed)
+      if (info->compressed)
          align_h = MAX2(align_h, img->align_j * 2);
    }
 
@@ -1325,9 +1321,9 @@ img_init_for_transfer(struct ilo_image *img,
    img->sample_count = 1;
 
    img->format = info->format;
-   img->block_width = util_format_get_blockwidth(info->format);
-   img->block_height = util_format_get_blockheight(info->format);
-   img->block_size = util_format_get_blocksize(info->format);
+   img->block_width = info->block_width;
+   img->block_height = info->block_height;
+   img->block_size = info->block_size;
 
    img->walk = ILO_IMAGE_WALK_LOD;
 
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 8a2e1438158..0fe0d4da75a 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -70,7 +70,14 @@ enum ilo_image_walk_type {
 struct ilo_image_info {
    enum gen_surface_type type;
 
-   enum pipe_format format;
+   enum gen_surface_format format;
+   bool interleaved_stencil;
+   bool is_integer;
+   /* width, height and size of pixel blocks */
+   bool compressed;
+   unsigned block_width;
+   unsigned block_height;
+   unsigned block_size;
 
    /* image size */
    uint16_t width;
@@ -119,7 +126,8 @@ struct ilo_image_lod {
 struct ilo_image {
    enum gen_surface_type type;
 
-   enum pipe_format format;
+   enum gen_surface_format format;
+   bool interleaved_stencil;
 
    /* size, format, etc for programming hardware states */
    unsigned width0;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
index 306e6d9aedc..827632764b2 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -55,47 +55,9 @@ zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = 0;
    zs->depth[4] = 0;
 
-   zs->depth_format = format;
-
    return true;
 }
 
-static enum gen_depth_format
-get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      switch (img->format) {
-      case PIPE_FORMAT_Z32_FLOAT:
-         return GEN6_ZFORMAT_D32_FLOAT;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
-      case PIPE_FORMAT_Z16_UNORM:
-         return GEN6_ZFORMAT_D16_UNORM;
-      default:
-         assert(!"unknown depth format");
-         return GEN6_ZFORMAT_D32_FLOAT;
-      }
-   } else {
-      switch (img->format) {
-      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
-      case PIPE_FORMAT_Z32_FLOAT:
-         return GEN6_ZFORMAT_D32_FLOAT;
-      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
-      case PIPE_FORMAT_Z16_UNORM:
-         return GEN6_ZFORMAT_D16_UNORM;
-      default:
-         assert(!"unknown depth format");
-         return GEN6_ZFORMAT_D32_FLOAT;
-      }
-   }
-}
-
 static bool
 zs_validate_gen6(const struct ilo_dev *dev,
                  const struct ilo_state_zs_info *info)
@@ -141,6 +103,35 @@ zs_validate_gen6(const struct ilo_dev *dev,
              img->type == GEN6_SURFTYPE_CUBE);
    }
 
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (info->format) {
+      case GEN6_ZFORMAT_D32_FLOAT:
+      case GEN6_ZFORMAT_D24_UNORM_X8_UINT:
+      case GEN6_ZFORMAT_D16_UNORM:
+         break;
+      default:
+         assert(!"unknown depth format");
+         break;
+      }
+   } else {
+      /*
+       * From the Ironlake PRM, volume 2 part 1, page 330:
+       *
+       *     "If this field (Separate Stencil Buffer Enable) is disabled, the
+       *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
+       *
+       * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+       *
+       *     "[DevSNB]: This field (Separate Stencil Buffer Enable) must be
+       *      set to the same value (enabled or disabled) as Hierarchical
+       *      Depth Buffer Enable."
+       */
+      if (info->hiz_vma)
+         assert(info->format != GEN6_ZFORMAT_D24_UNORM_S8_UINT);
+      else
+         assert(info->format != GEN6_ZFORMAT_D24_UNORM_X8_UINT);
+   }
+
    assert(info->level < img->level_count);
    assert(img->bo_stride);
 
@@ -395,7 +386,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_state_zs_info *info)
 {
    uint16_t width, height, depth, array_base, view_extent;
-   enum gen_depth_format format;
    uint32_t dw1, dw2, dw3, dw4;
 
    ILO_DEV_ASSERT(dev, 6, 6);
@@ -406,28 +396,10 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
-      GEN6_ZFORMAT_D32_FLOAT;
-
-   /*
-    * From the Ironlake PRM, volume 2 part 1, page 330:
-    *
-    *     "If this field (Separate Stencil Buffer Enable) is disabled, the
-    *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 321:
-    *
-    *     "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set
-    *      to the same value (enabled or disabled) as Hierarchical Depth
-    *      Buffer Enable."
-    */
-   if (!info->hiz_vma && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
-      format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-
    /* info->z_readonly and info->s_readonly are ignored on Gen6 */
    dw1 = info->type << GEN6_DEPTH_DW1_TYPE__SHIFT |
          GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
-         format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+         info->format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
 
    if (info->z_img)
       dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT;
@@ -453,8 +425,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = dw4;
    zs->depth[4] = 0;
 
-   zs->depth_format = format;
-
    return true;
 }
 
@@ -463,7 +433,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_dev *dev,
                                  const struct ilo_state_zs_info *info)
 {
-   enum gen_depth_format format;
    uint16_t width, height, depth;
    uint16_t array_base, view_extent;
    uint32_t dw1, dw2, dw3, dw4, dw6;
@@ -476,11 +445,8 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
-      GEN6_ZFORMAT_D32_FLOAT;
-
    dw1 = info->type << GEN7_DEPTH_DW1_TYPE__SHIFT |
-         format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+         info->format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
 
    if (info->z_img) {
       if (!info->z_readonly)
@@ -516,8 +482,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = dw4;
    zs->depth[4] = dw6;
 
-   zs->depth_format = format;
-
    return true;
 }
 
@@ -685,6 +649,7 @@ ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
 
    memset(&info, 0, sizeof(info));
    info.type = GEN6_SURFTYPE_NULL;
+   info.format = GEN6_ZFORMAT_D32_FLOAT;
 
    return ilo_state_zs_init(zs, dev, &info);
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
index d78a12ad516..6a25a873897 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -49,6 +49,7 @@ struct ilo_state_zs_info {
    const struct ilo_vma *hiz_vma;
 
    enum gen_surface_type type;
+   enum gen_depth_format format;
 
    /* ignored prior to Gen7 */
    bool z_readonly;
@@ -64,9 +65,6 @@ struct ilo_state_zs {
    const struct ilo_vma *s_vma;
    const struct ilo_vma *hiz_vma;
 
-   /* TODO move this to ilo_image */
-   enum gen_depth_format depth_format;
-
    bool z_readonly;
    bool s_readonly;
 };
@@ -84,11 +82,4 @@ bool
 ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
                          const struct ilo_dev *dev);
 
-static inline enum gen_depth_format
-ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs,
-                              const struct ilo_dev *dev)
-{
-   return zs->depth_format;
-}
-
 #endif /* ILO_STATE_ZS_H */
diff --git a/src/gallium/drivers/ilo/ilo_blitter_blt.c b/src/gallium/drivers/ilo/ilo_blitter_blt.c
index 52b4b25d827..66203e86137 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_blt.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_blt.c
@@ -300,7 +300,7 @@ tex_copy_region(struct ilo_blitter *blitter,
                 const struct pipe_box *src_box)
 {
    const struct util_format_description *desc =
-      util_format_description(dst_tex->image.format);
+      util_format_description(dst_tex->image_format);
    const unsigned max_extent = 32767; /* INT16_MAX */
    const uint8_t rop = 0xcc; /* SRCCOPY */
    struct ilo_builder *builder = &blitter->ilo->cp->builder;
diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
index 13c8f500680..86e67084d6e 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
@@ -318,7 +318,7 @@ hiz_can_clear_zs(const struct ilo_blitter *blitter,
     * The truth is when HiZ is enabled, separate stencil is also enabled on
     * all GENs.  The depth buffer format cannot be combined depth/stencil.
     */
-   switch (tex->image.format) {
+   switch (tex->image_format) {
    case PIPE_FORMAT_Z16_UNORM:
       if (ilo_dev_gen(blitter->ilo->dev) == ILO_GEN(6) &&
           tex->base.width0 % 16)
@@ -355,7 +355,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
    if (ilo_dev_gen(blitter->ilo->dev) >= ILO_GEN(8))
       clear_value = fui(depth);
    else
-      clear_value = util_pack_z(tex->image.format, depth);
+      clear_value = util_pack_z(tex->image_format, depth);
 
    ilo_blit_resolve_surface(blitter->ilo, zs,
          ILO_TEXTURE_RENDER_WRITE | ILO_TEXTURE_CLEAR);
diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h
index 9ebbf76e81e..4d6604b29b2 100644
--- a/src/gallium/drivers/ilo/ilo_common.h
+++ b/src/gallium/drivers/ilo/ilo_common.h
@@ -28,6 +28,9 @@
 #ifndef ILO_COMMON_H
 #define ILO_COMMON_H
 
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+
 #include "core/ilo_core.h"
 #include "core/ilo_debug.h"
 #include "core/ilo_dev.h"
diff --git a/src/gallium/drivers/ilo/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h
index 4e955c09c14..0a19c02659e 100644
--- a/src/gallium/drivers/ilo/ilo_format.h
+++ b/src/gallium/drivers/ilo/ilo_format.h
@@ -165,4 +165,39 @@ ilo_format_translate_vertex(const struct ilo_dev *dev,
    return ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
 }
 
+static inline enum gen_depth_format
+ilo_format_translate_depth(const struct ilo_dev *dev,
+                           enum pipe_format format)
+{
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (format) {
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   } else {
+      switch (format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   }
+}
+
 #endif /* ILO_FORMAT_H */
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 11833e0c599..962c710a57a 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -30,6 +30,7 @@
 #include "core/ilo_state_surface.h"
 
 #include "ilo_screen.h"
+#include "ilo_format.h"
 #include "ilo_resource.h"
 
 /*
@@ -148,6 +149,26 @@ resource_get_image_format(const struct pipe_resource *templ,
    return format;
 }
 
+static inline enum gen_surface_format
+pipe_to_surface_format(const struct ilo_dev *dev, enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      return GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS;
+   case PIPE_FORMAT_Z32_FLOAT:
+      return GEN6_FORMAT_R32_FLOAT;
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return GEN6_FORMAT_R24_UNORM_X8_TYPELESS;
+   case PIPE_FORMAT_Z16_UNORM:
+      return GEN6_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_S8_UINT:
+      return GEN6_FORMAT_R8_UINT;
+   default:
+      return ilo_format_translate_color(dev, format);
+   }
+}
+
 static void
 resource_get_image_info(const struct pipe_resource *templ,
                         const struct ilo_dev *dev,
@@ -157,7 +178,14 @@ resource_get_image_info(const struct pipe_resource *templ,
    memset(info, 0, sizeof(*info));
 
    info->type = get_surface_type(templ->target);
-   info->format = image_format;
+
+   info->format = pipe_to_surface_format(dev, image_format);
+   info->interleaved_stencil = util_format_is_depth_and_stencil(image_format);
+   info->is_integer = util_format_is_pure_integer(image_format);
+   info->compressed = util_format_is_compressed(image_format);
+   info->block_width = util_format_get_blockwidth(image_format);
+   info->block_height = util_format_get_blockheight(image_format);
+   info->block_size = util_format_get_blocksize(image_format);
 
    info->width = templ->width0;
    info->height = templ->height0;
@@ -303,7 +331,7 @@ tex_create_separate_stencil(struct ilo_texture *tex)
 
    tex->separate_s8 = ilo_texture(s8);
 
-   assert(tex->separate_s8->image.format == PIPE_FORMAT_S8_UINT);
+   assert(tex->separate_s8->image_format == PIPE_FORMAT_S8_UINT);
 
    return true;
 }
@@ -438,12 +466,11 @@ tex_init_image(struct ilo_texture *tex,
    const struct pipe_resource *templ = &tex->base;
    struct ilo_image *img = &tex->image;
    struct intel_bo *imported_bo = NULL;;
-   enum pipe_format image_format;
    struct ilo_image_info info;
 
-   image_format = resource_get_image_format(templ,
+   tex->image_format = resource_get_image_format(templ,
          &is->dev, separate_stencil);
-   resource_get_image_info(templ, &is->dev, image_format, &info);
+   resource_get_image_info(templ, &is->dev, tex->image_format, &info);
 
    if (handle) {
       imported_bo = tex_import_handle(tex, handle, &info);
@@ -469,10 +496,11 @@ tex_init_image(struct ilo_texture *tex,
     */
    if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
        templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-       image_format == PIPE_FORMAT_Z32_FLOAT &&
+       tex->image_format == PIPE_FORMAT_Z32_FLOAT &&
        img->aux.enables != (1 << templ->last_level)) {
-      image_format = templ->format;
-      info.format = image_format;
+      tex->image_format = templ->format;
+      info.format = pipe_to_surface_format(&is->dev, tex->image_format);
+      info.interleaved_stencil = true;
 
       memset(img, 0, sizeof(*img));
       if (!ilo_image_init(img, &is->dev, &info)) {
@@ -496,7 +524,7 @@ tex_init_image(struct ilo_texture *tex,
    if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
       /* require on-the-fly tiling/untiling or format conversion */
       if (img->tiling == GEN8_TILING_W || *separate_stencil ||
-          image_format != templ->format)
+          tex->image_format != templ->format)
          return false;
    }
 
@@ -656,7 +684,8 @@ ilo_can_create_resource(struct pipe_screen *screen,
        templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
        image_format == PIPE_FORMAT_Z32_FLOAT &&
        img.aux.enables != (1 << templ->last_level)) {
-      info.format = templ->format;
+      info.format = pipe_to_surface_format(&is->dev, templ->format);
+      info.interleaved_stencil = true;
       memset(&img, 0, sizeof(img));
       ilo_image_init(&img, &ilo_screen(screen)->dev, &info);
    }
diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h
index c28c05abcfe..8378af54741 100644
--- a/src/gallium/drivers/ilo/ilo_resource.h
+++ b/src/gallium/drivers/ilo/ilo_resource.h
@@ -92,6 +92,7 @@ struct ilo_texture {
 
    bool imported;
 
+   enum pipe_format image_format;
    struct ilo_image image;
    struct ilo_vma vma;
    struct ilo_vma aux_vma;
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 20d3c001188..c13fa9c2e18 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -1699,10 +1699,11 @@ ilo_set_framebuffer_state(struct pipe_context *pipe,
    if (state->zsbuf) {
       const struct ilo_surface_cso *cso =
          (const struct ilo_surface_cso *) state->zsbuf;
+      const struct ilo_texture *tex = ilo_texture(cso->base.texture);
 
       fb->has_hiz = cso->u.zs.hiz_vma;
       fb->depth_offset_format =
-         ilo_state_zs_get_depth_format(&cso->u.zs, dev);
+         ilo_format_translate_depth(dev, tex->image_format);
    } else {
       fb->has_hiz = false;
       fb->depth_offset_format = GEN6_ZFORMAT_D32_FLOAT;
@@ -2154,6 +2155,11 @@ ilo_create_surface(struct pipe_context *pipe,
       info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ?
          GEN6_SURFTYPE_2D : tex->image.type;
 
+      info.format = ilo_format_translate_depth(dev, tex->image_format);
+      if (ilo_dev_gen(dev) == ILO_GEN(6) && !info.hiz_vma &&
+          tex->image_format == PIPE_FORMAT_Z24X8_UNORM)
+         info.format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+
       ilo_state_zs_init(&surf->u.zs, dev, &info);
    }
 
diff --git a/src/gallium/drivers/ilo/ilo_transfer.c b/src/gallium/drivers/ilo/ilo_transfer.c
index be5aeee8e23..5abd3bebf68 100644
--- a/src/gallium/drivers/ilo/ilo_transfer.c
+++ b/src/gallium/drivers/ilo/ilo_transfer.c
@@ -100,7 +100,7 @@ resource_get_transfer_method(struct pipe_resource *res,
             m = ILO_TRANSFER_MAP_SW_ZS;
             need_convert = true;
          }
-      } else if (tex->image.format != tex->base.format) {
+      } else if (tex->image_format != tex->base.format) {
          m = ILO_TRANSFER_MAP_SW_CONVERT;
          need_convert = true;
       }
@@ -601,7 +601,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row);
 
       if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-         assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM);
+         assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM);
 
          dst_cpp = 4;
          dst_s8_pos = 3;
@@ -609,7 +609,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       }
       else {
          assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT);
+         assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT);
 
          dst_cpp = 8;
          dst_s8_pos = 4;
@@ -655,7 +655,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       tex_staging_sys_unmap_bo(s8_tex);
    }
    else {
-      assert(tex->image.format == PIPE_FORMAT_S8_UINT);
+      assert(tex->image_format == PIPE_FORMAT_S8_UINT);
 
       for (slice = 0; slice < box->depth; slice++) {
          unsigned mem_x, mem_y;
@@ -728,7 +728,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row);
 
       if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-         assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM);
+         assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM);
 
          src_cpp = 4;
          src_s8_pos = 3;
@@ -736,7 +736,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       }
       else {
          assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT);
+         assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT);
 
          src_cpp = 8;
          src_s8_pos = 4;
@@ -782,7 +782,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       tex_staging_sys_unmap_bo(s8_tex);
    }
    else {
-      assert(tex->image.format == PIPE_FORMAT_S8_UINT);
+      assert(tex->image_format == PIPE_FORMAT_S8_UINT);
 
       for (slice = 0; slice < box->depth; slice++) {
          unsigned mem_x, mem_y;
@@ -840,8 +840,8 @@ tex_staging_sys_convert_write(struct ilo_texture *tex,
    else
       dst_slice_stride = 0;
 
-   if (unlikely(tex->image.format == tex->base.format)) {
-      util_copy_box(dst, tex->image.format, tex->image.bo_stride,
+   if (unlikely(tex->image_format == tex->base.format)) {
+      util_copy_box(dst, tex->image_format, tex->image.bo_stride,
             dst_slice_stride, 0, 0, 0, box->width, box->height, box->depth,
             xfer->staging.sys, xfer->base.stride, xfer->base.layer_stride,
             0, 0, 0);
@@ -853,7 +853,7 @@ tex_staging_sys_convert_write(struct ilo_texture *tex,
 
    switch (tex->base.format) {
    case PIPE_FORMAT_ETC1_RGB8:
-      assert(tex->image.format == PIPE_FORMAT_R8G8B8X8_UNORM);
+      assert(tex->image_format == PIPE_FORMAT_R8G8B8X8_UNORM);
 
       for (slice = 0; slice < box->depth; slice++) {
          const void *src =

From 7de85694fa606b112b8badd4f07969aef782efb8 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Fri, 26 Jun 2015 11:38:46 +0800
Subject: [PATCH 0044/1208] ilo: define ILO_IMAGE_MAX_LEVEL_COUNT

Define ILO_IMAGE_MAX_LEVEL_COUNT for ilo_image and remove unnecessary header
includes.
---
 src/gallium/drivers/ilo/core/ilo_core.h  |  4 ----
 src/gallium/drivers/ilo/core/ilo_image.c |  4 ++--
 src/gallium/drivers/ilo/core/ilo_image.h | 11 +++++++++--
 src/gallium/drivers/ilo/ilo_common.h     |  5 +++++
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index d95a80aabd3..da7db90a54b 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -29,13 +29,9 @@
 #define ILO_CORE_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_defines.h"
 
 #include "util/u_debug.h"
-#include "util/list.h"
-#include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_pointer.h"
 
 #endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 7a1100288d3..3209674154b 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -1029,8 +1029,8 @@ img_calculate_hiz_size(struct ilo_image *img,
       break;
    case ILO_IMAGE_WALK_LOD:
       {
-         unsigned lod_tx[PIPE_MAX_TEXTURE_LEVELS];
-         unsigned lod_ty[PIPE_MAX_TEXTURE_LEVELS];
+         unsigned lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT];
+         unsigned lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT];
          unsigned cur_tx, cur_ty;
 
          /* figure out the tile offsets of LODs */
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 0fe0d4da75a..cfe18b9e8d1 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -33,6 +33,13 @@
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
+/*
+ * From the Ivy Bridge PRM, volume 4 part 1, page 75:
+ *
+ *     "(MIP Count / LOD) representing [1,15] MIP levels"
+ */
+#define ILO_IMAGE_MAX_LEVEL_COUNT 15
+
 enum ilo_image_aux_type {
    ILO_IMAGE_AUX_NONE,
    ILO_IMAGE_AUX_HIZ,
@@ -154,7 +161,7 @@ struct ilo_image {
    unsigned align_i;
    unsigned align_j;
 
-   struct ilo_image_lod lods[PIPE_MAX_TEXTURE_LEVELS];
+   struct ilo_image_lod lods[ILO_IMAGE_MAX_LEVEL_COUNT];
 
    /* physical layer height for ILO_IMAGE_WALK_LAYER */
    unsigned walk_layer_height;
@@ -173,7 +180,7 @@ struct ilo_image {
       unsigned enables;
 
       /* LOD offsets for ILO_IMAGE_WALK_LOD */
-      unsigned walk_lod_offsets[PIPE_MAX_TEXTURE_LEVELS];
+      unsigned walk_lod_offsets[ILO_IMAGE_MAX_LEVEL_COUNT];
 
       unsigned walk_layer_height;
       unsigned bo_stride;
diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h
index 4d6604b29b2..3dbe79fb872 100644
--- a/src/gallium/drivers/ilo/ilo_common.h
+++ b/src/gallium/drivers/ilo/ilo_common.h
@@ -29,7 +29,12 @@
 #define ILO_COMMON_H
 
 #include "pipe/p_format.h"
+#include "pipe/p_defines.h"
+
+#include "util/list.h"
 #include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_pointer.h"
 
 #include "core/ilo_core.h"
 #include "core/ilo_debug.h"

From 229450520a23ba211fd9f7b3c9bc80f291229ec1 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 24 Jun 2015 14:06:33 +0100
Subject: [PATCH 0045/1208] mesa: fold duplicated GL/GL_CORE/GLES3 entry in
 get_hash_params.py

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/main/get_hash_params.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 74ff3ba6619..c25e1b6555f 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -351,6 +351,9 @@ descriptor=[
 # GL_ARB_framebuffer_object
   [ "MAX_SAMPLES", "CONTEXT_INT(Const.MaxSamples), extra_ARB_framebuffer_object_EXT_framebuffer_multisample" ],
 
+# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0
+  [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
+
 # GL_ARB_sync
   [ "MAX_SERVER_WAIT_TIMEOUT", "CONTEXT_INT64(Const.MaxServerWaitTimeout), extra_ARB_sync" ],
 
@@ -404,11 +407,6 @@ descriptor=[
   [ "TEXTURE_EXTERNAL_OES", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_OES_EGL_image_external" ],
 ]},
 
-{ "apis": ["GL", "GL_CORE", "GLES3"], "params": [
-# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0
-  [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
-]},
-
 # Enums in OpenGL Core profile and ES 3.1
 { "apis": ["GL_CORE", "GLES3"], "params": [
 # GL_ARB_draw_indirect / GLES 3.1

From 404a90b82786080564fe32716f83ce055b9a934f Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 10 Jun 2015 16:30:56 -0700
Subject: [PATCH 0046/1208] mesa: Enable subdir-objects globally.

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac                                  | 2 +-
 src/Makefile.am                               | 2 --
 src/gallium/auxiliary/Makefile.am             | 2 --
 src/gallium/drivers/freedreno/Makefile.am     | 2 --
 src/gallium/drivers/ilo/Makefile.am           | 2 --
 src/gallium/drivers/nouveau/Makefile.am       | 2 --
 src/gallium/drivers/r300/Makefile.am          | 2 --
 src/gallium/drivers/r600/Makefile.am          | 2 --
 src/gallium/drivers/svga/Makefile.am          | 2 --
 src/gallium/drivers/vc4/Makefile.am           | 2 --
 src/gallium/state_trackers/clover/Makefile.am | 2 --
 src/gallium/state_trackers/xvmc/Makefile.am   | 1 -
 src/gallium/targets/opencl/Makefile.am        | 2 --
 src/gbm/Makefile.am                           | 2 --
 src/glsl/Makefile.am                          | 2 --
 src/gtest/Makefile.am                         | 2 --
 src/mapi/Makefile.am                          | 2 --
 src/mesa/Makefile.am                          | 2 --
 18 files changed, 1 insertion(+), 34 deletions(-)

diff --git a/configure.ac b/configure.ac
index ddc757e1629..af61aa2018c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -44,7 +44,7 @@ AC_INIT([Mesa], [MESA_VERSION],
 AC_CONFIG_AUX_DIR([bin])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE([foreign tar-ustar dist-xz])
+AM_INIT_AUTOMAKE([foreign tar-ustar dist-xz subdir-objects])
 
 dnl We only support native Windows builds (MinGW/MSVC) through SCons.
 case "$host_os" in
diff --git a/src/Makefile.am b/src/Makefile.am
index 5d69abd996d..90bf94737cf 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS = . gtest util mapi/glapi/gen mapi
 
 if NEED_OPENGL_COMMON
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index 89c7a13e913..ab91062ef1b 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 if HAVE_LOADER_GALLIUM
 SUBDIRS := pipe-loader
 endif
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index cbf62c6daae..dff95ba5270 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/ilo/Makefile.am b/src/gallium/drivers/ilo/Makefile.am
index a8785a5e8c4..1f14153748e 100644
--- a/src/gallium/drivers/ilo/Makefile.am
+++ b/src/gallium/drivers/ilo/Makefile.am
@@ -21,8 +21,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am
index d05f0a17ab4..c52d62e54a2 100644
--- a/src/gallium/drivers/nouveau/Makefile.am
+++ b/src/gallium/drivers/nouveau/Makefile.am
@@ -20,8 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/r300/Makefile.am b/src/gallium/drivers/r300/Makefile.am
index dd1a5ede19b..081f332683e 100644
--- a/src/gallium/drivers/r300/Makefile.am
+++ b/src/gallium/drivers/r300/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index dc0d90d759b..8317da727a2 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/svga/Makefile.am b/src/gallium/drivers/svga/Makefile.am
index e0a8cad7208..d46de95e4b4 100644
--- a/src/gallium/drivers/svga/Makefile.am
+++ b/src/gallium/drivers/svga/Makefile.am
@@ -20,8 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 3f62ce21a9f..774463138d0 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/state_trackers/clover/Makefile.am b/src/gallium/state_trackers/clover/Makefile.am
index f46d9ef457d..fd0ccf88cc5 100644
--- a/src/gallium/state_trackers/clover/Makefile.am
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 
 AM_CPPFLAGS = \
diff --git a/src/gallium/state_trackers/xvmc/Makefile.am b/src/gallium/state_trackers/xvmc/Makefile.am
index 047d05b3719..3c7c35c8c37 100644
--- a/src/gallium/state_trackers/xvmc/Makefile.am
+++ b/src/gallium/state_trackers/xvmc/Makefile.am
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 5daf327fb47..70e60e20052 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include $(top_srcdir)/src/gallium/Automake.inc
 
 lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la
diff --git a/src/gbm/Makefile.am b/src/gbm/Makefile.am
index 918fdf7d6ad..da4195df31c 100644
--- a/src/gbm/Makefile.am
+++ b/src/gbm/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = main/gbm.pc
 
diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index fa8c9f5d3ca..98dcb37fc74 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
diff --git a/src/gtest/Makefile.am b/src/gtest/Makefile.am
index 47d392bc705..29d6c6d1998 100644
--- a/src/gtest/Makefile.am
+++ b/src/gtest/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 AM_CFLAGS = $(DEFINES) -I$(top_srcdir)/src/gtest/include
 AM_CXXFLAGS = $(DEFINES) -I$(top_srcdir)/src/gtest/include
 
diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 50c5b2ebba3..160a255af6a 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS =
 TESTS =
 
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 71794b5dada..c86ded979b9 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS = . main/tests
 
 if HAVE_X11_DRIVER

From 3cf90bb183c7f403ded4c069a78eae1fd71f8eab Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Tue, 16 Jun 2015 13:53:40 +0100
Subject: [PATCH 0047/1208] i965/skl: Fix aligning mt->total_width to the block
 size

brw_miptree_layout_2d tries to ensure that mt->total_width is a
multiple of the compressed block size, presumably because it wouldn't
be possible to make an image that has a fraction of a block. However
it was doing this by aligning mt->total_width to align_w. Previously
align_w has been used as a shortcut for getting the block width
because before Gen9 the block width was always equal to the alignment.
Commit 4ab8d59a2 tried to fix these cases to use the block width
instead of the alignment but it missed this case.

I think in practice this probably won't make any difference because
the buffer for the texture will be allocated to be large enough to
contain the entire pitch and libdrm aligns the pitch to the tile width
anyway. However I think the patch is worth having to make the
intention clearer.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 998d8c42770..455984309fa 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -366,9 +366,8 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
 
    mt->total_width = mt->physical_width0;
 
-   if (mt->compressed) {
-       mt->total_width = ALIGN(mt->physical_width0, mt->align_w);
-   }
+   if (mt->compressed)
+       mt->total_width = ALIGN(mt->total_width, bw);
 
    /* May need to adjust width to accommodate the placement of
     * the 2nd mipmap.  This occurs when the alignment

From 052b3d4e2f159038137504f01e9ff2380a67af8b Mon Sep 17 00:00:00 2001
From: Boyan Ding <boyan.j.ding@gmail.com>
Date: Sat, 13 Jun 2015 15:36:27 +0800
Subject: [PATCH 0048/1208] egl_dri2: Remove trailing whitespaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Boyan Ding <boyan.j.ding@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/egl_dri2.c         | 10 +++++-----
 src/egl/drivers/dri2/egl_dri2.h         |  6 +++---
 src/egl/drivers/dri2/platform_wayland.c |  8 ++++----
 src/egl/drivers/dri2/platform_x11.c     |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index a1cbd437f53..223cc5c5fbf 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -139,7 +139,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
 
    dri2_dpy = disp->DriverData;
    _eglInitConfig(&base, disp, id);
-   
+
    i = 0;
    double_buffer = 0;
    bind_to_texture_rgb = 0;
@@ -155,7 +155,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
 	 else
 	    return NULL;
 	 _eglSetConfigKey(&base, EGL_COLOR_BUFFER_TYPE, value);
-	 break;	 
+	 break;
 
       case __DRI_ATTRIB_CONFIG_CAVEAT:
          if (value & __DRI_ATTRIB_NON_CONFORMANT_CONFIG)
@@ -365,7 +365,7 @@ dri2_bind_extensions(struct dri2_egl_display *dri2_dpy,
 	 }
       }
    }
-   
+
    for (j = 0; matches[j].name; j++) {
       field = ((char *) dri2_dpy + matches[j].offset);
       if (*(const __DRIextension **) field == NULL) {
@@ -624,7 +624,7 @@ dri2_create_screen(_EGLDisplay *disp)
    dri2_dpy->own_dri_screen = 1;
 
    extensions = dri2_dpy->core->getExtensions(dri2_dpy->dri_screen);
-   
+
    if (dri2_dpy->dri2) {
       if (!dri2_bind_extensions(dri2_dpy, dri2_core_extensions, extensions))
          goto cleanup_dri_screen;
@@ -1970,7 +1970,7 @@ dri2_create_drm_image_mesa(_EGLDriver *drv, _EGLDisplay *disp,
    if (attrs.DRMBufferUseMESA & EGL_DRM_BUFFER_USE_CURSOR_MESA)
       dri_use |= __DRI_IMAGE_USE_CURSOR;
 
-   dri2_img->dri_image = 
+   dri2_img->dri_image =
       dri2_dpy->image->createImage(dri2_dpy->dri_screen,
 				   attrs.Width, attrs.Height,
                                    format, dri_use, dri2_img);
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 9985c49f984..f0cc6da1867 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -120,9 +120,9 @@ struct dri2_egl_display_vtbl {
    EGLBoolean (*swap_buffers)(_EGLDriver *drv, _EGLDisplay *dpy,
                               _EGLSurface *surf);
 
-   EGLBoolean (*swap_buffers_with_damage)(_EGLDriver *drv, _EGLDisplay *dpy,     
-                                          _EGLSurface *surface,                  
-                                          const EGLint *rects, EGLint n_rects);  
+   EGLBoolean (*swap_buffers_with_damage)(_EGLDriver *drv, _EGLDisplay *dpy,
+                                          _EGLSurface *surface,
+                                          const EGLint *rects, EGLint n_rects);
 
    EGLBoolean (*swap_buffers_region)(_EGLDriver *drv, _EGLDisplay *dpy,
                                      _EGLSurface *surf, EGLint numRects,
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 1c985523862..160fa8ce8d7 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -138,7 +138,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
       _eglError(EGL_BAD_ALLOC, "dri2_create_surface");
       return NULL;
    }
-   
+
    if (!_eglInitSurface(&dri2_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list))
       goto cleanup_surf;
 
@@ -157,7 +157,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    dri2_surf->base.Width =  -1;
    dri2_surf->base.Height = -1;
 
-   dri2_surf->dri_drawable = 
+   dri2_surf->dri_drawable =
       (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
 					    dri2_conf->dri_double_config,
 					    dri2_surf);
@@ -361,7 +361,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
    }
 
    if (dri2_surf->back->dri_image == NULL) {
-      dri2_surf->back->dri_image = 
+      dri2_surf->back->dri_image =
          dri2_dpy->image->createImage(dri2_dpy->dri_screen,
                                       dri2_surf->base.Width,
                                       dri2_surf->base.Height,
@@ -1220,7 +1220,7 @@ dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
    wl_event_queue_destroy(dri2_dpy->wl_queue);
  cleanup_dpy:
    free(dri2_dpy);
-   
+
    return EGL_FALSE;
 }
 
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 56c14288204..0fbf4e40f2f 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -56,7 +56,7 @@ swrastCreateDrawable(struct dri2_egl_display * dri2_dpy,
    uint32_t           mask;
    const uint32_t     function = GXcopy;
    uint32_t           valgc[2];
-   
+
    /* create GC's */
    dri2_surf->gc = xcb_generate_id(dri2_dpy->conn);
    mask = XCB_GC_FUNCTION;

From ad62ec8316a926682958e7ab52639992867c3755 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 26 Jun 2015 15:01:22 -0400
Subject: [PATCH 0049/1208] nv50/ir: propagate modifier to right arg when
 const-folding mad

An immediate has to be the second arg of an ADD operation. However we
were mistakenly propagating the modifier of the non-folded value to the
folded immediate argument.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91117
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index ae739eeda83..ad9bf6f4aa9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -608,9 +608,12 @@ ConstantFolding::expr(Instruction *i,
    case OP_FMA: {
       i->op = OP_ADD;
 
+      /* Move the immediate to the second arg, otherwise the ADD operation
+       * won't be emittable
+       */
       i->setSrc(1, i->getSrc(0));
-      i->src(1).mod = i->src(2).mod;
       i->setSrc(0, i->getSrc(2));
+      i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
       ImmediateValue src0;

From 35d83793047b3de31a706fa2a62a233090ea7cfc Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 18 Jun 2015 13:55:52 -0700
Subject: [PATCH 0050/1208] i965/fs: Fix ir_txs in emit_texture_gen4_simd16().

We were not emitting the LOD, which led to message lengths of 1 instead
of 3.  Setting has_lod makes us emit the LOD, but I had to make changes
to avoid emitting the non-existent coordinate as well.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91022
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 9a4bad6bcf5..4e13199eec0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -247,7 +247,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
                                      uint32_t sampler)
 {
    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
-   bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
+   bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf || op == ir_txs;
 
    if (has_lod && shadow_c.file != BAD_FILE)
       no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
@@ -264,14 +264,15 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
    fs_reg msg_end = offset(message, vector_elements);
 
    /* Messages other than sample and ld require all three components */
-   if (has_lod || shadow_c.file != BAD_FILE) {
+   if (vector_elements > 0 && (has_lod || shadow_c.file != BAD_FILE)) {
       for (int i = vector_elements; i < 3; i++) {
          bld.MOV(offset(message, i), fs_reg(0.0f));
       }
+      msg_end = offset(message, 3);
    }
 
    if (has_lod) {
-      fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
+      fs_reg msg_lod = retype(msg_end, op == ir_txf ?
                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
       bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);

From 7e5064360c03b8dbdd60298b46e1595418c6cea3 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 25 Jun 2015 03:36:23 +0100
Subject: [PATCH 0051/1208] radeonsi: add support for viewport array (v3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This isn't pretty and I'd suggest it the pm4 interface builder
could be tweaked to do this more efficently, but I'd need
guidance on how that would look.

This seems to pass the few piglit tests I threw at it.

v2: handle passing layer/viewport index to fragment shader.
fix crash in blit changes,
add support to io_get_unique_index for layer/viewport index
update docs.
v3: avoid looking up viewport index and layer in es (Marek).

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                                  |  4 +-
 docs/relnotes/10.7.0.html                     |  3 +
 src/gallium/drivers/radeonsi/si_blit.c        |  8 +--
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 27 ++++++--
 src/gallium/drivers/radeonsi/si_state.c       | 66 ++++++++++++-------
 src/gallium/drivers/radeonsi/si_state.h       |  4 +-
 .../drivers/radeonsi/si_state_shaders.c       |  2 -
 8 files changed, 74 insertions(+), 42 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 220bcc8742f..df913bdd8c9 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -128,7 +128,7 @@ GL 4.1, GLSL 4.10:
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_shader_precision                              started (Micah)
   GL_ARB_vertex_attrib_64bit                           DONE (nvc0, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, llvmpipe)
+  GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
 
 
 GL 4.2, GLSL 4.20:
@@ -156,7 +156,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_copy_image                                    DONE (i965) (gallium - in progress, VMware)
   GL_KHR_debug                                         DONE (all drivers)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, llvmpipe)
+  GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index e089889667d..fcc50811b69 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -44,8 +44,11 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
+<li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
+<li>GL_ARB_viewport_array on radeonsi</li>
+<li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 </ul>
 
 <h2>Bug fixes</h2>
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 1f2c4082dbc..6c7b383a4a3 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -63,11 +63,11 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 		util_blitter_save_sample_mask(sctx->blitter,
 					      sctx->queued.named.sample_mask->sample_mask);
 	}
-	if (sctx->queued.named.viewport) {
-		util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport->viewport);
+	if (sctx->queued.named.viewport[0]) {
+		util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport[0]->viewport);
 	}
-	if (sctx->queued.named.scissor) {
-		util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor->scissor);
+	if (sctx->queued.named.scissor[0]) {
+		util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor[0]->scissor);
 	}
 	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 53ae71a8c92..480a3010d31 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -335,7 +335,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 8;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
-		return 1;
+		return 16;
 
 	/* Timer queries, present when the clock frequency is non zero. */
 	case PIPE_CAP_QUERY_TIMESTAMP:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a293ef36fbb..4ca31728dff 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1132,7 +1132,7 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
 				&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
 	LLVMValueRef args[9];
 	LLVMValueRef pos_args[4][9] = { { 0 } };
-	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL;
+	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
 	unsigned semantic_name, semantic_index;
 	unsigned target;
 	unsigned param_count = 0;
@@ -1158,7 +1158,12 @@ handle_semantic:
 			continue;
 		case TGSI_SEMANTIC_LAYER:
 			layer_value = outputs[i].values[0];
-			continue;
+			semantic_name = TGSI_SEMANTIC_GENERIC;
+			goto handle_semantic;
+		case TGSI_SEMANTIC_VIEWPORT_INDEX:
+			viewport_index_value = outputs[i].values[0];
+			semantic_name = TGSI_SEMANTIC_GENERIC;
+			goto handle_semantic;
 		case TGSI_SEMANTIC_POSITION:
 			target = V_008DFC_SQ_EXP_POS;
 			break;
@@ -1224,11 +1229,13 @@ handle_semantic:
 	/* Write the misc vector (point size, edgeflag, layer, viewport). */
 	if (shader->selector->info.writes_psize ||
 	    shader->selector->info.writes_edgeflag ||
+	    shader->selector->info.writes_viewport_index ||
 	    shader->selector->info.writes_layer) {
 		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
 						      shader->selector->info.writes_psize |
 						      (shader->selector->info.writes_edgeflag << 1) |
-						      (shader->selector->info.writes_layer << 2));
+						      (shader->selector->info.writes_layer << 2) |
+						      (shader->selector->info.writes_viewport_index << 3));
 		pos_args[1][1] = uint->zero; /* EXEC mask */
 		pos_args[1][2] = uint->zero; /* last export? */
 		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
@@ -1259,6 +1266,9 @@ handle_semantic:
 
 		if (shader->selector->info.writes_layer)
 			pos_args[1][7] = layer_value;
+
+		if (shader->selector->info.writes_viewport_index)
+			pos_args[1][8] = viewport_index_value;
 	}
 
 	for (i = 0; i < 4; i++)
@@ -1299,10 +1309,15 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
 			si_shader_ctx->radeon_bld.soa.outputs[i];
-		int param_index = get_param_index(info->output_semantic_name[i],
-						  info->output_semantic_index[i],
-						  es->key.vs.gs_used_inputs);
+		int param_index;
 
+		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
+		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
+			continue;
+
+		param_index = get_param_index(info->output_semantic_name[i],
+					      info->output_semantic_index[i],
+					      es->key.vs.gs_used_inputs);
 		if (param_index < 0)
 			continue;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 6c18836d189..752467bcfd7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -489,11 +489,13 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 		S_02881C_USE_VTX_POINT_SIZE(info->writes_psize) |
 		S_02881C_USE_VTX_EDGE_FLAG(info->writes_edgeflag) |
 		S_02881C_USE_VTX_RENDER_TARGET_INDX(info->writes_layer) |
+	        S_02881C_USE_VTX_VIEWPORT_INDX(info->writes_viewport_index) |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipdist_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipdist_mask & 0xF0) != 0) |
 		S_02881C_VS_OUT_MISC_VEC_ENA(info->writes_psize ||
 					    info->writes_edgeflag ||
-					    info->writes_layer) |
+					    info->writes_layer ||
+					     info->writes_viewport_index) |
 		(sctx->queued.named.rasterizer->clip_plane_enable &
 		 clipdist_mask));
 	r600_write_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
@@ -509,20 +511,26 @@ static void si_set_scissor_states(struct pipe_context *ctx,
                                   const struct pipe_scissor_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_state_scissor *scissor = CALLOC_STRUCT(si_state_scissor);
-	struct si_pm4_state *pm4 = &scissor->pm4;
+	struct si_state_scissor *scissor;
+	struct si_pm4_state *pm4;
+	int i;
 
-	if (scissor == NULL)
-		return;
+	for (i = start_slot; i < start_slot + num_scissors; i++) {
+		int idx = i - start_slot;
+		int offset = i * 4 * 2;
 
-	scissor->scissor = *state;
-	si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL,
-		       S_028250_TL_X(state->minx) | S_028250_TL_Y(state->miny) |
-		       S_028250_WINDOW_OFFSET_DISABLE(1));
-	si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR,
-		       S_028254_BR_X(state->maxx) | S_028254_BR_Y(state->maxy));
-
-	si_pm4_set_state(sctx, scissor, scissor);
+		scissor = CALLOC_STRUCT(si_state_scissor);
+		if (scissor == NULL)
+			return;
+		pm4 = &scissor->pm4;
+		scissor->scissor = state[idx];
+		si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset,
+			       S_028250_TL_X(state[idx].minx) | S_028250_TL_Y(state[idx].miny) |
+			       S_028250_WINDOW_OFFSET_DISABLE(1));
+		si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR + offset,
+			       S_028254_BR_X(state[idx].maxx) | S_028254_BR_Y(state[idx].maxy));
+		si_pm4_set_state(sctx, scissor[i], scissor);
+	}
 }
 
 static void si_set_viewport_states(struct pipe_context *ctx,
@@ -531,21 +539,29 @@ static void si_set_viewport_states(struct pipe_context *ctx,
                                    const struct pipe_viewport_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_state_viewport *viewport = CALLOC_STRUCT(si_state_viewport);
-	struct si_pm4_state *pm4 = &viewport->pm4;
+	struct si_state_viewport *viewport;
+	struct si_pm4_state *pm4;
+	int i;
 
-	if (viewport == NULL)
-		return;
+	for (i = start_slot; i < start_slot + num_viewports; i++) {
+		int idx = i - start_slot;
+		int offset = i * 4 * 6;
 
-	viewport->viewport = *state;
-	si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0]));
-	si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0]));
-	si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1]));
-	si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1]));
-	si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2]));
-	si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2]));
+		viewport = CALLOC_STRUCT(si_state_viewport);
+		if (!viewport)
+			return;
+		pm4 = &viewport->pm4;
 
-	si_pm4_set_state(sctx, viewport, viewport);
+		viewport->viewport = state[idx];
+		si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0 + offset, fui(state[idx].scale[0]));
+		si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0 + offset, fui(state[idx].translate[0]));
+		si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0 + offset, fui(state[idx].scale[1]));
+		si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0 + offset, fui(state[idx].translate[1]));
+		si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0 + offset, fui(state[idx].scale[2]));
+		si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0 + offset, fui(state[idx].translate[2]));
+
+		si_pm4_set_state(sctx, viewport[i], viewport);
+	}
 }
 
 /*
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 5e68b162137..d1f2dff2c3f 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -92,8 +92,8 @@ union si_state {
 		struct si_pm4_state		*blend_color;
 		struct si_pm4_state		*clip;
 		struct si_state_sample_mask	*sample_mask;
-		struct si_state_scissor		*scissor;
-		struct si_state_viewport	*viewport;
+		struct si_state_scissor		*scissor[16];
+		struct si_state_viewport	*viewport[16];
 		struct si_state_rasterizer	*rasterizer;
 		struct si_state_dsa		*dsa;
 		struct si_pm4_state		*fb_rs;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 208c8523ef1..48128fa44e1 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -187,8 +187,6 @@ static void si_shader_vs(struct si_shader *shader)
 		case TGSI_SEMANTIC_POSITION:
 		case TGSI_SEMANTIC_PSIZE:
 		case TGSI_SEMANTIC_EDGEFLAG:
-		case TGSI_SEMANTIC_VIEWPORT_INDEX:
-		case TGSI_SEMANTIC_LAYER:
 			break;
 		default:
 			nparams++;

From 556dd4af76ca0be9b0698139c06e6d12d52e8ff3 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 25 Jun 2015 03:55:54 +0100
Subject: [PATCH 0052/1208] radeonsi: add support for geometry shader
 invocations.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                                    | 2 +-
 src/gallium/drivers/radeonsi/si_shader.c        | 5 +++++
 src/gallium/drivers/radeonsi/si_shader.h        | 1 +
 src/gallium/drivers/radeonsi/si_state.c         | 1 -
 src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +++++++
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index df913bdd8c9..81014a5f9b5 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -104,7 +104,7 @@ GL 4.0, GLSL 4.00:
   - Fused multiply-add                                 DONE ()
   - Packing/bitfield/conversion functions              DONE (r600, radeonsi, softpipe)
   - Enhanced textureGather                             DONE (r600, radeonsi, softpipe)
-  - Geometry shader instancing                         DONE (r600, llvmpipe, softpipe)
+  - Geometry shader instancing                         DONE (r600, radeonsi, llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
   - Interpolation functions                            DONE (r600)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4ca31728dff..4d97b58aec8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -630,6 +630,11 @@ static void declare_system_value(
 				     SI_PARAM_BASE_VERTEX);
 		break;
 
+	case TGSI_SEMANTIC_INVOCATIONID:
+		value = LLVMGetParam(radeon_bld->main_fn,
+				     SI_PARAM_GS_INSTANCE_ID);
+		break;
+
 	case TGSI_SEMANTIC_SAMPLEID:
 		value = get_sample_id(radeon_bld);
 		break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 51055afe36a..b4339ae2b36 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -115,6 +115,7 @@ struct si_shader_selector {
 
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
+	unsigned	gs_num_invocations;
 	uint64_t	gs_used_inputs; /* mask of "get_unique_index" bits */
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 752467bcfd7..0dd08a248f4 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3078,7 +3078,6 @@ void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0);
 	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0);
 	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0);
-	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, 0);
 
 	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
 	si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 48128fa44e1..eef3baad164 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -76,6 +76,7 @@ static void si_shader_gs(struct si_shader *shader)
 	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2);
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
 	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
@@ -118,6 +119,10 @@ static void si_shader_gs(struct si_shader *shader)
 
 	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);
 
+	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
+		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
+		       S_028B90_ENABLE(gs_num_invocations > 0));
+
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 	si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
@@ -490,6 +495,8 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
 		sel->gs_max_out_vertices =
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+		sel->gs_num_invocations =
+			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
 
 		for (i = 0; i < sel->info.num_inputs; i++) {
 			unsigned name = sel->info.input_semantic_name[i];

From 2a210b797eacd27a556af9c5e0edca940f9486c5 Mon Sep 17 00:00:00 2001
From: Mike Stroyan <mike@lunarg.com>
Date: Fri, 26 Jun 2015 15:15:46 -0600
Subject: [PATCH 0053/1208] meta: Only change and restore viewport 0 in mesa
 meta mode

The meta code was setting a default depth range for all viewports
and 'restoring' all viewports to depth range values saved from viewport 0.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/common/meta.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 214a68a9129..9a75019d059 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -728,7 +728,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       save->DepthNear = ctx->ViewportArray[0].Near;
       save->DepthFar = ctx->ViewportArray[0].Far;
       /* set depth range to default */
-      _mesa_DepthRange(0.0, 1.0);
+      _mesa_set_depth_range(ctx, 0, 0.0, 1.0);
    }
 
    if (state & MESA_META_CLAMP_FRAGMENT_COLOR) {
@@ -1129,7 +1129,7 @@ _mesa_meta_end(struct gl_context *ctx)
          _mesa_set_viewport(ctx, 0, save->ViewportX, save->ViewportY,
                             save->ViewportW, save->ViewportH);
       }
-      _mesa_DepthRange(save->DepthNear, save->DepthFar);
+      _mesa_set_depth_range(ctx, 0, save->DepthNear, save->DepthFar);
    }
 
    if (state & MESA_META_CLAMP_FRAGMENT_COLOR &&

From a98600b0ebdfc8481c168aae6c5670071e22fc29 Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Fri, 5 Jun 2015 15:36:52 +0200
Subject: [PATCH 0054/1208] nouveau: Use dup fd as key in drm-winsys hash table
 to fix ZaphodHeads.

The dup'ed fd owned by the nouveau_screen for a device node
must also be used as key for the winsys hash table, instead
of using the original fd passed in for a screen, to make
multi-x-screen ZaphodHeads configurations work on nouveau.

The original fd's lifetime differs from that of the nouveau_screen stored
in the hash. The hash key is the fd, and in order to compare hash entries
we fstat them, so the fd must be around for as long as the screen is.

This is an extension of the fix in commit a59f2bb1 (nouveau: dup fd
before passing it to device).

Cc: "10.3 10.4 10.5 10.6" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index 063524655b6..5a4c256539d 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -120,7 +120,11 @@ nouveau_drm_screen_create(int fd)
 	if (!screen)
 		goto err;
 
-	util_hash_table_set(fd_tab, intptr_to_pointer(fd), screen);
+	/* Use dupfd in hash table, to avoid errors if the original fd gets
+	 * closed by its owner. The hash key needs to live at least as long as
+	 * the screen.
+	 */
+	util_hash_table_set(fd_tab, intptr_to_pointer(dupfd), screen);
 	screen->refcount = 1;
 	pipe_mutex_unlock(nouveau_screen_mutex);
 	return &screen->base;

From b4b4406e1e8dcf577551087cc6eb068e5303efdf Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 24 Jun 2015 21:11:27 +0200
Subject: [PATCH 0055/1208] gallium/hud: prevent NULL pointer dereference with
 pipe_query functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HUD doesn't check if query_create() fails and it calls other
pipe_query functions with NULL pointer instead of a valid query object.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/hud/hud_driver_query.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_driver_query.c b/src/gallium/auxiliary/hud/hud_driver_query.c
index 603aba7e8cd..ee71678e894 100644
--- a/src/gallium/auxiliary/hud/hud_driver_query.c
+++ b/src/gallium/auxiliary/hud/hud_driver_query.c
@@ -62,7 +62,8 @@ query_new_value(struct hud_graph *gr)
    uint64_t now = os_time_get();
 
    if (info->last_time) {
-      pipe->end_query(pipe, info->query[info->head]);
+      if (info->query[info->head])
+         pipe->end_query(pipe, info->query[info->head]);
 
       /* read query results */
       while (1) {
@@ -70,7 +71,7 @@ query_new_value(struct hud_graph *gr)
          union pipe_query_result result;
          uint64_t *res64 = (uint64_t *)&result;
 
-         if (pipe->get_query_result(pipe, query, FALSE, &result)) {
+         if (query && pipe->get_query_result(pipe, query, FALSE, &result)) {
             info->results_cumulative += res64[info->result_index];
             info->num_results++;
 
@@ -88,7 +89,8 @@ query_new_value(struct hud_graph *gr)
                        "gallium_hud: all queries are busy after %i frames, "
                        "can't add another query\n",
                        NUM_QUERIES);
-               pipe->destroy_query(pipe, info->query[info->head]);
+               if (info->query[info->head])
+                  pipe->destroy_query(pipe, info->query[info->head]);
                info->query[info->head] =
                      pipe->create_query(pipe, info->query_type, 0);
             }
@@ -113,15 +115,15 @@ query_new_value(struct hud_graph *gr)
          info->results_cumulative = 0;
          info->num_results = 0;
       }
-
-      pipe->begin_query(pipe, info->query[info->head]);
    }
    else {
       /* initialize */
       info->last_time = now;
       info->query[info->head] = pipe->create_query(pipe, info->query_type, 0);
-      pipe->begin_query(pipe, info->query[info->head]);
    }
+
+   if (info->query[info->head])
+      pipe->begin_query(pipe, info->query[info->head]);
 }
 
 static void

From 17e8fca626c908dcbedabf57ce175113840e65c2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 29 May 2015 22:40:07 -0700
Subject: [PATCH 0056/1208] i965: Write at least some data in SIMD8 URB write
 messages.

According to the "URB SIMD8 Write > Write Data Payload" documentation,
"The write data payload can be between 1 and 8 message phases long."

Apparently, the simulator considers it an error if you issue an URB
SIMD8 message with only a header and no actual data to write.

v2: Try to put in a better PRM citation, now that the Broadwell docs
    actually exist (requested by Jordan).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 4e13199eec0..0cbaf17f1e4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1799,16 +1799,23 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
       compute_clip_distance(clip_planes);
 
    /* If we don't have any valid slots to write, just do a minimal urb write
-    * send to terminate the shader. */
+    * send to terminate the shader.  This includes 1 slot of undefined data,
+    * because it's invalid to write 0 data:
+    *
+    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+    * Write Data Payload:
+    *
+    *    "The write data payload can be between 1 and 8 message phases long."
+    */
    if (vue_map->slots_valid == 0) {
-
-      fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
       bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
                                                 BRW_REGISTER_TYPE_UD)));
 
       fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
       inst->eot = true;
-      inst->mlen = 1;
+      inst->mlen = 2;
       inst->offset = 1;
       return;
    }

From 19a0ba130fd0d0f3b86181a8d05cf5391420360d Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 26 Jun 2015 15:05:13 -0700
Subject: [PATCH 0057/1208] i965/vs: Move compute_clip_distance() out of
 emit_urb_writes().

Legacy user clipping (using gl_Position or gl_ClipVertex) is handled by
turning those into the modern gl_ClipDistance equivalents.

This is unnecessary in Core Profile: if user clipping is enabled, but
the shader doesn't write the corresponding gl_ClipDistance entry,
results are undefined.  Hence, it is also unnecessary for geometry
shaders.

This patch moves the call up to run_vs().  This is equivalent for VS,
but removes the need to pass clip distances into emit_urb_writes().

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp         |  4 +++-
 src/mesa/drivers/dri/i965/brw_fs.h           |  2 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 16 +++++++++++-----
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 4292aa6b9fb..8658554e96b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3816,7 +3816,9 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
    if (failed)
       return false;
 
-   emit_urb_writes(clip_planes);
+   compute_clip_distance(clip_planes);
+
+   emit_urb_writes();
 
    if (shader_time_index >= 0)
       emit_shader_time_end();
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 243baf688de..d08d438a40e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -271,7 +271,7 @@ public:
                                  fs_reg src0_alpha, unsigned components,
                                  unsigned exec_size, bool use_2nd_half = false);
    void emit_fb_writes();
-   void emit_urb_writes(gl_clip_plane *clip_planes);
+   void emit_urb_writes();
    void emit_cs_terminate();
 
    void emit_barrier();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 0cbaf17f1e4..34bf32d7ab3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1731,6 +1731,12 @@ fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
    }
 }
 
+/**
+ * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances.
+ *
+ * This does nothing if the shader uses gl_ClipDistance or user clipping is
+ * disabled altogether.
+ */
 void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 {
    struct brw_vue_prog_data *vue_prog_data =
@@ -1738,6 +1744,10 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
    const struct brw_vue_prog_key *key =
       (const struct brw_vue_prog_key *) this->key;
 
+   /* Bail unless some sort of legacy clipping is enabled */
+   if (!key->userclip_active || prog->UsesClipDistanceOut)
+      return;
+
    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
     *
     *     "If a linked set of shaders forming the vertex stage contains no
@@ -1781,7 +1791,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 }
 
 void
-fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
+fs_visitor::emit_urb_writes()
 {
    int slot, urb_offset, length;
    struct brw_vs_prog_data *vs_prog_data =
@@ -1794,10 +1804,6 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
    bool flush;
    fs_reg sources[8];
 
-   /* Lower legacy ff and ClipVertex clipping to clip distances */
-   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
-      compute_clip_distance(clip_planes);
-
    /* If we don't have any valid slots to write, just do a minimal urb write
     * send to terminate the shader.  This includes 1 slot of undefined data,
     * because it's invalid to write 0 data:

From b5622313ea2e070cc0c20c7cdccd844d383713d0 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 28 Jun 2015 22:30:27 -0400
Subject: [PATCH 0058/1208] nv40: enable base vertex

Still appears to have issues with negative indices less than -1M, but
that's a corner case of a corner case.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h | 2 ++
 src/gallium/drivers/nouveau/nv30/nv30_context.h   | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_vbo.c       | 6 ++----
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
index 447f4b3b7ae..95468e580dd 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
@@ -1459,6 +1459,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define NV40_3D_VTX_CACHE_INVALIDATE				0x00001714
 
+#define NV40_3D_VB_ELEMENT_BASE					0x0000173c
+
 #define NV30_3D_VTXFMT(i0)				       (0x00001740 + 0x4*(i0))
 #define NV30_3D_VTXFMT__ESIZE					0x00000004
 #define NV30_3D_VTXFMT__LEN					0x00000010
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 592cdbe24f9..7181336b562 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -51,6 +51,7 @@ struct nv30_context {
       unsigned rt_enable;
       unsigned scissor_off;
       unsigned num_vtxelts;
+      int index_bias;
       boolean  prim_restart;
       struct nv30_fragprog *fragprog;
    } state;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index d4e384b21d2..faa8812528a 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -461,13 +461,11 @@ nv30_draw_elements(struct nv30_context *nv30, boolean shorten,
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    unsigned prim = nv30_prim_gl(mode);
 
-#if 0 /*XXX*/
-   if (index_bias != nv30->state.index_bias) {
-      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_BASE), 1);
+   if (eng3d->oclass >= NV40_3D_CLASS && index_bias != nv30->state.index_bias) {
+      BEGIN_NV04(push, NV40_3D(VB_ELEMENT_BASE), 1);
       PUSH_DATA (push, index_bias);
       nv30->state.index_bias = index_bias;
    }
-#endif
 
    if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 &&
        nv30->idxbuf.buffer) {

From 61912036d1cb67e52b1cc191bdff8ebded439e8c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 00:23:55 -0400
Subject: [PATCH 0059/1208] nv30: avoid leaking blit fp/vp

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_context.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index 617b0887810..ef035e58f3c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -165,6 +165,12 @@ nv30_context_destroy(struct pipe_context *pipe)
    if (nv30->draw)
       draw_destroy(nv30->draw);
 
+   if (nv30->blit_vp)
+      nouveau_heap_free(&nv30->blit_vp);
+
+   if (nv30->blit_fp)
+      pipe_resource_reference(&nv30->blit_fp, NULL);
+
    if (nv30->screen->base.pushbuf->user_priv == &nv30->bufctx)
       nv30->screen->base.pushbuf->user_priv = NULL;
 

From cae701fc8ed0faeaaaafd1cf57f6143031edcab2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 22:17:09 -0700
Subject: [PATCH 0060/1208] Revert "i965: Delete linked GLSL IR when using
 NIR."

This reverts commit 104c8fc2c2aa5621261f80aa6b4f76c3163078f1.
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 5653d6ba1e4..32c40131434 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -387,11 +387,8 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       brw_add_texrect_params(prog);
 
-      if (options->NirOptions) {
+      if (options->NirOptions)
          prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage);
-         ralloc_free(shader->ir);
-         shader->ir = NULL;
-      }
 
       _mesa_reference_program(ctx, &prog, NULL);
    }

From 6218c68bece0cea671f2940a651119a87ab8b24e Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 22:17:16 -0700
Subject: [PATCH 0061/1208] Revert "glsl: clone inputs and outputs during
 linking"

This reverts commit c2ff3485b3d48749ea9dcad07bc1a691627dc3e5.

Ilia and I noticed a memory leak caused by this patch: at least with
fixed-function programs, we clone things using ProgramResourceList as
the context before reralloc makes it non-NULL.

I believe Tapani found other bugs with these patches, so I'm just going
to revert them for now and let him pursue them further.
---
 src/glsl/linker.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 5da9cadcb08..4a726d4e2e7 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2637,9 +2637,7 @@ add_interface_variables(struct gl_shader_program *shProg,
          continue;
       };
 
-      /* Clone ir_variable data so that backend is able to free memory. */
-      if (!add_program_resource(shProg, programInterface,
-                                var->clone(shProg->ProgramResourceList, NULL),
+      if (!add_program_resource(shProg, programInterface, var,
                                 build_stageref(shProg, var->name) | mask))
          return false;
    }

From 07158c508ac9b933d60dd3e2cd1e748601c44b68 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 08:23:14 +0100
Subject: [PATCH 0062/1208] Add release notes for the 10.6.1 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit a871e80fc6237fa029d6970f7e9b414fd097bd98)
---
 docs/relnotes/10.6.1.html | 103 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 docs/relnotes/10.6.1.html

diff --git a/docs/relnotes/10.6.1.html b/docs/relnotes/10.6.1.html
new file mode 100644
index 00000000000..03b5b086eb1
--- /dev/null
+++ b/docs/relnotes/10.6.1.html
@@ -0,0 +1,103 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.1 Release Notes / June 29, 2015</h1>
+
+<p>
+Mesa 10.6.1 is a bug fix release which fixes bugs found since the 10.6.0 release.
+</p>
+<p>
+Mesa 10.6.1 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90347">Bug 90347</a> - [NVE0+] Failure to insert texbar under some circumstances (causing bad colors in Terasology)</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (4):</p>
+<ul>
+  <li>mesa: Handle integer formats in need_rgb_to_luminance_conversion()</li>
+  <li>mesa: Use helper function need_rgb_to_luminance_conversion()</li>
+  <li>mesa: Turn need_rgb_to_luminance_conversion() in to a global function</li>
+  <li>meta: Abort meta path if ReadPixels need rgb to luminance conversion</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/gen9: Implement Push Constant Buffer workaround</li>
+</ul>
+
+<p>Boyan Ding (2):</p>
+<ul>
+  <li>egl/x11: Set version of swrastLoader to 2</li>
+  <li>egl/x11: Remove duplicate call to dri2_x11_add_configs_for_visuals</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.6.0 release</li>
+  <li>configure: warn about shared_glapi &amp; xlib-glx only when both are set</li>
+  <li>configure: error out when building backend-less libEGL</li>
+  <li>configure: error out when building libEGL without shared-glapi</li>
+  <li>gbm: do not (over)link against libglapi.so</li>
+  <li>Update version to 10.6.1</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>gbm: dlopen libglapi so gbm_create_device works</li>
+</ul>
+
+<p>Ilia Mirkin (9):</p>
+<ul>
+  <li>nvc0/ir: fix collection of first uses for texture barrier insertion</li>
+  <li>nv50,nvc0: clamp uniform size to 64k</li>
+  <li>nvc0/ir: can't have a join on a load with an indirect source</li>
+  <li>glsl: handle conversions to double when comparing param matches</li>
+  <li>glsl: add version checks to conditionals for builtin variable enablement</li>
+  <li>mesa: add GL_PROGRAM_PIPELINE support in KHR_debug calls</li>
+  <li>glsl: binding point is a texture unit, which is a combined space</li>
+  <li>nvc0: always put all tfb bufs into bufctx</li>
+  <li>nv50,nvc0: make sure to pushbuf_refn before putting bo into pushbuf_data</li>
+</ul>
+
+
+</div>
+</body>
+</html>

From 24df6cd0f7723e163d75ed3eb0b7e22adc3ffd7f Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 09:00:24 +0100
Subject: [PATCH 0063/1208] docs: Add sha256 checksums for the 10.6.1 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 6ff3ae8deb1d99037f2f8e5890b09bd984059cf0)
---
 docs/relnotes/10.6.1.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/relnotes/10.6.1.html b/docs/relnotes/10.6.1.html
index 03b5b086eb1..f197b0f3a42 100644
--- a/docs/relnotes/10.6.1.html
+++ b/docs/relnotes/10.6.1.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+b4cccd4d0eabcc2bca00c3175d3ad88fdda57ffdb883a7998525b873a21fe607  mesa-10.6.1.tar.gz
+6c80a2b647e57c85dc36e609d9aed17f878f0d8e0cf9ace86d14cf604101e1eb  mesa-10.6.1.tar.xz
 </pre>
 
 

From dd9ceb0219f6ca7864940ee1961f1b1890d27cea Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 09:03:19 +0100
Subject: [PATCH 0064/1208] docs: add news item and link release notes for mesa
 10.6.1

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 docs/index.html    | 6 ++++++
 docs/relnotes.html | 1 +
 2 files changed, 7 insertions(+)

diff --git a/docs/index.html b/docs/index.html
index 80c6e03e3f1..29e62791131 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>June 29, 2015</h2>
+<p>
+<a href="relnotes/10.6.1.html">Mesa 10.6.1</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>June 20, 2015</h2>
 <p>
 <a href="relnotes/10.5.8.html">Mesa 10.5.8</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 5fd80025a39..3e2d13c84ed 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.1.html">10.6.1 release notes</a>
 <li><a href="relnotes/10.5.8.html">10.5.8 release notes</a>
 <li><a href="relnotes/10.6.0.html">10.6.0 release notes</a>
 <li><a href="relnotes/10.5.7.html">10.5.7 release notes</a>

From c0ca6c30eaf7f488f154c462a01a8945cb4a3103 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Fri, 26 Jun 2015 17:54:15 +0100
Subject: [PATCH 0065/1208] i965: Don't try to print the GLSL IR if it has been
 freed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 104c8fc2c2aa5621261f8 the GLSL IR will be freed if NIR is
being used. This was causing it to segfault if INTEL_DEBUG=wm is set.
This patch just makes it avoid dumping the GLSL IR in that case.

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/drivers/dri/i965/brw_program.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 2327af77ad3..85e271d2351 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -574,10 +574,13 @@ brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog)
 {
    if (shader_prog) {
-      fprintf(stderr,
-              "GLSL IR for native %s shader %d:\n", stage, shader_prog->Name);
-      _mesa_print_ir(stderr, shader->ir, NULL);
-      fprintf(stderr, "\n\n");
+      if (shader->ir) {
+         fprintf(stderr,
+                 "GLSL IR for native %s shader %d:\n",
+                 stage, shader_prog->Name);
+         _mesa_print_ir(stderr, shader->ir, NULL);
+         fprintf(stderr, "\n\n");
+      }
    } else {
       fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
               stage, prog->Id, stage);

From 249a9df7fce0a6bebc70852ab583c5324208bf06 Mon Sep 17 00:00:00 2001
From: Grigori Goronzy <greg@chown.ath.cx>
Date: Thu, 28 May 2015 12:40:29 +0200
Subject: [PATCH 0066/1208] gallium: add PIPE_COMPUTE_CAP_SUBGROUP_SIZE

We need this to implement OpenCL's
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/docs/source/screen.rst            |  2 ++
 src/gallium/drivers/ilo/ilo_screen.c          |  8 ++++++++
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  4 ++++
 src/gallium/drivers/radeon/r600_pipe_common.c |  6 ++++++
 src/gallium/drivers/radeon/r600_pipe_common.h | 20 +++++++++++++++++++
 src/gallium/include/pipe/p_defines.h          |  3 ++-
 6 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 8f64817fe5f..74636207d06 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -384,6 +384,8 @@ pipe_screen::get_compute_param.
   Value type: ``uint32_t``
 * ``PIPE_COMPUTE_CAP_IMAGES_SUPPORTED``: Whether images are supported
   non-zero means yes, zero means no. Value type: ``uint32_t``
+* ``PIPE_COMPUTE_CAP_SUBGROUP_SIZE``: The size of a basic execution unit in
+  threads. Also known as wavefront size, warp size or SIMD width.
 
 .. _pipe_bind:
 
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 94105559b80..faebb9279b3 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -193,6 +193,7 @@ ilo_get_compute_param(struct pipe_screen *screen,
       uint32_t max_clock_frequency;
       uint32_t max_compute_units;
       uint32_t images_supported;
+      uint32_t subgroup_size;
    } val;
    const void *ptr;
    int size;
@@ -284,6 +285,13 @@ ilo_get_compute_param(struct pipe_screen *screen,
       ptr = &val.images_supported;
       size = sizeof(val.images_supported);
       break;
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      /* best case is actually SIMD32 */
+      val.subgroup_size = 16;
+
+      ptr = &val.subgroup_size;
+      size = sizeof(val.subgroup_size);
+      break;
    default:
       ptr = NULL;
       size = 0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 56c230e42fc..4c53106289c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -341,6 +341,7 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
                               enum pipe_compute_cap param, void *data)
 {
    uint64_t *data64 = (uint64_t *)data;
+   uint32_t *data32 = (uint32_t *)data;
    const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
 
    switch (param) {
@@ -372,6 +373,9 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
    case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
       data64[0] = 4096;
       return 8;
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      data32[0] = 32;
+      return 4;
    default:
       return 0;
    }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 3def4446882..775cf53ba88 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -636,6 +636,12 @@ static int r600_get_compute_param(struct pipe_screen *screen,
 		return sizeof(uint32_t);
 	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
 		break; /* unused */
+	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+		if (ret) {
+			uint32_t *subgroup_size = ret;
+			*subgroup_size = r600_wavefront_size(rscreen->family);
+		}
+		return sizeof(uint32_t);
 	}
 
         fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 6ce81d33ddd..51fd016229c 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -570,6 +570,26 @@ static inline unsigned r600_tex_aniso_filter(unsigned filter)
 	 /* else */        return 4;
 }
 
+static inline unsigned r600_wavefront_size(enum radeon_family family)
+{
+	switch (family) {
+	case CHIP_RV610:
+	case CHIP_RS780:
+	case CHIP_RV620:
+	case CHIP_RS880:
+		return 16;
+	case CHIP_RV630:
+	case CHIP_RV635:
+	case CHIP_RV730:
+	case CHIP_RV710:
+	case CHIP_PALM:
+	case CHIP_CEDAR:
+		return 32;
+	default:
+		return 64;
+	}
+}
+
 #define COMPUTE_DBG(rscreen, fmt, args...) \
 	do { \
 		if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 88b7b7699c1..153897af754 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -700,7 +700,8 @@ enum pipe_compute_cap
    PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
    PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY,
    PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS,
-   PIPE_COMPUTE_CAP_IMAGES_SUPPORTED
+   PIPE_COMPUTE_CAP_IMAGES_SUPPORTED,
+   PIPE_COMPUTE_CAP_SUBGROUP_SIZE
 };
 
 /**

From d15b32ebded278243eb648bb9ecd4c5f5d6d0569 Mon Sep 17 00:00:00 2001
From: Grigori Goronzy <greg@chown.ath.cx>
Date: Thu, 28 May 2015 13:01:51 +0200
Subject: [PATCH 0067/1208] clover: implement
 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE

Work-group size should always be aligned to subgroup size; this is a
basic requirement, otherwise some work-items will be no-operation.

It might make sense to refine the value according to a kernel's
resource usage, but that's a possible optimization for the future.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/state_trackers/clover/api/kernel.cpp  | 2 +-
 src/gallium/state_trackers/clover/core/device.cpp | 5 +++++
 src/gallium/state_trackers/clover/core/device.hpp | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index 05cc392a914..857a152b554 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -169,7 +169,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
       break;
 
    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-      buf.as_scalar<size_t>() = 1;
+      buf.as_scalar<size_t>() = dev.subgroup_size();
       break;
 
    case CL_KERNEL_PRIVATE_MEM_SIZE:
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 42b45b7f2b8..c42d1d26004 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -185,6 +185,11 @@ device::max_block_size() const {
    return { v.begin(), v.end() };
 }
 
+cl_uint
+device::subgroup_size() const {
+   return get_compute_param<uint32_t>(pipe, PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
+}
+
 std::string
 device::device_name() const {
    return pipe->get_name(pipe);
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index de5fc6bb9c4..285784744f3 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -67,6 +67,7 @@ namespace clover {
       bool has_doubles() const;
 
       std::vector<size_t> max_block_size() const;
+      cl_uint subgroup_size() const;
       std::string device_name() const;
       std::string vendor_name() const;
       enum pipe_shader_ir ir_format() const;

From 73d2b5af526676fd3f34243cdc155b3e1341b988 Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:04 +0200
Subject: [PATCH 0068/1208] mesa/main: Get rid of outdated GDB-hack

All of these enums are now in use around in the code, so there's no need
to explicitly use them here any more.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/context.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 79fa01849e0..265f98aea46 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -337,31 +337,6 @@ _mesa_destroy_visual( struct gl_config *vis )
 /*@{*/
 
 
-/**
- * This is lame.  gdb only seems to recognize enum types that are
- * actually used somewhere.  We want to be able to print/use enum
- * values such as TEXTURE_2D_INDEX in gdb.  But we don't actually use
- * the gl_texture_index type anywhere.  Thus, this lame function.
- */
-static void
-dummy_enum_func(void)
-{
-   gl_buffer_index bi = BUFFER_FRONT_LEFT;
-   gl_face_index fi = FACE_POS_X;
-   gl_frag_result fr = FRAG_RESULT_DEPTH;
-   gl_texture_index ti = TEXTURE_2D_ARRAY_INDEX;
-   gl_vert_attrib va = VERT_ATTRIB_POS;
-   gl_varying_slot vs = VARYING_SLOT_POS;
-
-   (void) bi;
-   (void) fi;
-   (void) fr;
-   (void) ti;
-   (void) va;
-   (void) vs;
-}
-
-
 /**
  * One-time initialization mutex lock.
  *
@@ -434,8 +409,6 @@ one_time_init( struct gl_context *ctx )
     * #ifdef tests here.
     */
    atexit(_mesa_destroy_shader_compiler);
-
-   dummy_enum_func();
 }
 
 

From ba5e1612c892282b930e278b5b98f1578cbe7dbb Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:05 +0200
Subject: [PATCH 0069/1208] dri: don't touch the shader compiler

This function is for deleting per-screen resources, and the shader
compiler resources are not of such nature. Besides, dri shouldn't
need to even know about the presence of a shader compiler.

These resources will already be released when mesa gets unloaded,
and that should be sufficient.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/drivers/dri/common/dri_util.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index e7ababe0b67..ae4592c739d 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -46,7 +46,6 @@
 #include "dri_util.h"
 #include "utils.h"
 #include "xmlpool.h"
-#include "../glsl/glsl_parser_extras.h"
 #include "main/mtypes.h"
 #include "main/version.h"
 #include "main/errors.h"
@@ -238,8 +237,6 @@ static void driDestroyScreen(__DRIscreen *psp)
 	 * stream open to the X-server anymore.
 	 */
 
-       _mesa_destroy_shader_compiler();
-
 	psp->driver->DestroyScreen(psp);
 
 	driDestroyOptionCache(&psp->optionCache);

From 195ab79ddecbdbf1f1714c233df278bff46c13e8 Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:06 +0200
Subject: [PATCH 0070/1208] mesa/main: only call _mesa_destroy_shader_compiler
 once on exit

There's no point in calling _mesa_destroy_shader_compiler multiple
times on exit; the resources will only be released once anyway.

So let's move the atexit-call into the part that is only called
once.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/context.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 265f98aea46..c4af8ea16db 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -382,6 +382,8 @@ one_time_init( struct gl_context *ctx )
          _mesa_ubyte_to_float_color_tab[i] = (float) i / 255.0F;
       }
 
+      atexit(_mesa_destroy_shader_compiler);
+
 #if defined(DEBUG) && defined(__DATE__) && defined(__TIME__)
       if (MESA_VERBOSE != 0) {
 	 _mesa_debug(ctx, "Mesa %s DEBUG build %s %s\n",
@@ -404,11 +406,6 @@ one_time_init( struct gl_context *ctx )
    api_init_mask |= 1 << ctx->API;
 
    mtx_unlock(&OneTimeLock);
-
-   /* Hopefully atexit() is widely available.  If not, we may need some
-    * #ifdef tests here.
-    */
-   atexit(_mesa_destroy_shader_compiler);
 }
 
 

From de3e323be1bdc40a2a7d724d0f3db7a81a93bbbb Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:07 +0200
Subject: [PATCH 0071/1208] glsl: No need to lock in _mesa_glsl_release_types

This function only gets called while mesa is unloading, so there's
no potential of racing or multiple calls at the same time. So let's
just get rid of the locking.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/glsl_types.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index f675e90cb0d..c6223808bb8 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -324,8 +324,10 @@ const glsl_type *glsl_type::get_scalar_type() const
 void
 _mesa_glsl_release_types(void)
 {
-   mtx_lock(&glsl_type::mutex);
-
+   /* Should only be called during atexit (either when unloading shared
+    * object, or if process terminates), so no mutex-locking should be
+    * necessary.
+    */
    if (glsl_type::array_types != NULL) {
       hash_table_dtor(glsl_type::array_types);
       glsl_type::array_types = NULL;
@@ -335,8 +337,6 @@ _mesa_glsl_release_types(void)
       hash_table_dtor(glsl_type::record_types);
       glsl_type::record_types = NULL;
    }
-
-   mtx_unlock(&glsl_type::mutex);
 }
 
 

From c61bc6ed844b39e600cc64e3e552c7bf1894d7ba Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:08 +0200
Subject: [PATCH 0072/1208] util: port _mesa_strto[df] to C

_mesa_strtod and _mesa_strtof are only used from the GLSL compiler and
the ARB_[vertex|fragment]_program code, meaning that the locale doesn't
need to be initialized before the first OpenGL context gets initialized.

So let's use explicit initialization from the one-time init code instead
of depending on a C++ compiler to initialize at image-load time.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/glcpp/glcpp.c            |  3 +++
 src/glsl/main.cpp                 |  3 +++
 src/mesa/main/context.c           |  3 +++
 src/util/Makefile.sources         |  2 +-
 src/util/{strtod.cpp => strtod.c} | 14 ++++++++------
 src/util/strtod.h                 |  3 +++
 6 files changed, 21 insertions(+), 7 deletions(-)
 rename src/util/{strtod.cpp => strtod.c} (89%)

diff --git a/src/glsl/glcpp/glcpp.c b/src/glsl/glcpp/glcpp.c
index 5144516a69c..c62f4efec9d 100644
--- a/src/glsl/glcpp/glcpp.c
+++ b/src/glsl/glcpp/glcpp.c
@@ -29,6 +29,7 @@
 #include "glcpp.h"
 #include "main/mtypes.h"
 #include "main/shaderobj.h"
+#include "util/strtod.h"
 
 extern int glcpp_parser_debug;
 
@@ -168,6 +169,8 @@ main (int argc, char *argv[])
 	if (shader == NULL)
 	   return 1;
 
+	_mesa_locale_init();
+
 	ret = glcpp_preprocess(ctx, &shader, &info_log, NULL, &gl_ctx);
 
 	printf("%s", shader);
diff --git a/src/glsl/main.cpp b/src/glsl/main.cpp
index 23412980dce..58651df10a0 100644
--- a/src/glsl/main.cpp
+++ b/src/glsl/main.cpp
@@ -38,6 +38,7 @@
 #include "program/hash_table.h"
 #include "loop_analysis.h"
 #include "standalone_scaffolding.h"
+#include "util/strtod.h"
 
 static int glsl_version = 330;
 
@@ -46,6 +47,8 @@ initialize_context(struct gl_context *ctx, gl_api api)
 {
    initialize_context_to_defaults(ctx, api);
 
+   _mesa_locale_init();
+
    /* The standalone compiler needs to claim support for almost
     * everything in order to compile the built-in functions.
     */
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index c4af8ea16db..e68de68d645 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -120,6 +120,7 @@
 #include "shaderobj.h"
 #include "shaderimage.h"
 #include "util/simple_list.h"
+#include "util/strtod.h"
 #include "state.h"
 #include "stencil.h"
 #include "texcompress_s3tc.h"
@@ -374,6 +375,8 @@ one_time_init( struct gl_context *ctx )
       assert( sizeof(GLint) == 4 );
       assert( sizeof(GLuint) == 4 );
 
+      _mesa_locale_init();
+
       _mesa_one_time_init_extension_overrides();
 
       _mesa_get_cpu_features();
diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources
index dc559391823..82df3bcb00a 100644
--- a/src/util/Makefile.sources
+++ b/src/util/Makefile.sources
@@ -19,7 +19,7 @@ MESA_UTIL_FILES :=	\
 	set.c \
 	set.h \
 	simple_list.h \
-	strtod.cpp \
+	strtod.c \
 	strtod.h \
 	texcompress_rgtc_tmp.h \
 	u_atomic.h
diff --git a/src/util/strtod.cpp b/src/util/strtod.c
similarity index 89%
rename from src/util/strtod.cpp
rename to src/util/strtod.c
index 2b4dd982a80..a4a60e0404a 100644
--- a/src/util/strtod.cpp
+++ b/src/util/strtod.c
@@ -30,18 +30,20 @@
 #include <locale.h>
 #ifdef HAVE_XLOCALE_H
 #include <xlocale.h>
+static locale_t loc;
 #endif
 #endif
 
 #include "strtod.h"
 
 
+void
+_mesa_locale_init(void)
+{
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-static struct locale_initializer {
-   locale_initializer() { loc = newlocale(LC_CTYPE_MASK, "C", NULL); }
-   locale_t loc;
-} loc_init;
+   loc = newlocale(LC_CTYPE_MASK, "C", NULL);
 #endif
+}
 
 /**
  * Wrapper around strtod which uses the "C" locale so the decimal
@@ -51,7 +53,7 @@ double
 _mesa_strtod(const char *s, char **end)
 {
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-   return strtod_l(s, end, loc_init.loc);
+   return strtod_l(s, end, loc);
 #else
    return strtod(s, end);
 #endif
@@ -66,7 +68,7 @@ float
 _mesa_strtof(const char *s, char **end)
 {
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-   return strtof_l(s, end, loc_init.loc);
+   return strtof_l(s, end, loc);
 #elif defined(HAVE_STRTOF)
    return strtof(s, end);
 #else
diff --git a/src/util/strtod.h b/src/util/strtod.h
index 02c25ddb78f..b7e2beb5f30 100644
--- a/src/util/strtod.h
+++ b/src/util/strtod.h
@@ -31,6 +31,9 @@
 extern "C" {
 #endif
 
+extern void
+_mesa_locale_init(void);
+
 extern double
 _mesa_strtod(const char *s, char **end);
 

From e566e5203aaba98109a67766cf28991de3358490 Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Sun, 28 Jun 2015 14:51:09 +0200
Subject: [PATCH 0073/1208] mesa/main: free locale at exit

In order to save a small leak if mesa is continously loaded and
unloaded, let's free the locale when the shared object is unloaded.

Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/context.c | 12 +++++++++++-
 src/util/strtod.c       |  8 ++++++++
 src/util/strtod.h       |  3 +++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index e68de68d645..fdef41287f7 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -346,6 +346,16 @@ _mesa_destroy_visual( struct gl_config *vis )
 mtx_t OneTimeLock = _MTX_INITIALIZER_NP;
 
 
+/**
+ * Calls all the various one-time-fini functions in Mesa
+ */
+
+static void
+one_time_fini(void)
+{
+   _mesa_destroy_shader_compiler();
+   _mesa_locale_fini();
+}
 
 /**
  * Calls all the various one-time-init functions in Mesa.
@@ -385,7 +395,7 @@ one_time_init( struct gl_context *ctx )
          _mesa_ubyte_to_float_color_tab[i] = (float) i / 255.0F;
       }
 
-      atexit(_mesa_destroy_shader_compiler);
+      atexit(one_time_fini);
 
 #if defined(DEBUG) && defined(__DATE__) && defined(__TIME__)
       if (MESA_VERBOSE != 0) {
diff --git a/src/util/strtod.c b/src/util/strtod.c
index a4a60e0404a..ea7d395e2da 100644
--- a/src/util/strtod.c
+++ b/src/util/strtod.c
@@ -45,6 +45,14 @@ _mesa_locale_init(void)
 #endif
 }
 
+void
+_mesa_locale_fini(void)
+{
+#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+   freelocale(loc);
+#endif
+}
+
 /**
  * Wrapper around strtod which uses the "C" locale so the decimal
  * point is always '.'
diff --git a/src/util/strtod.h b/src/util/strtod.h
index b7e2beb5f30..60e15cfa0eb 100644
--- a/src/util/strtod.h
+++ b/src/util/strtod.h
@@ -34,6 +34,9 @@ extern "C" {
 extern void
 _mesa_locale_init(void);
 
+extern void
+_mesa_locale_fini(void);
+
 extern double
 _mesa_strtod(const char *s, char **end);
 

From 06f76b7fa68db1ac74ecca015412f71b3a5e9f9c Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Thu, 4 Jun 2015 16:57:02 -0700
Subject: [PATCH 0074/1208] i965: Make a helper function
 intel_miptree_set_alignment()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 455984309fa..e35cb645c24 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -761,16 +761,13 @@ intel_miptree_set_total_width_height(struct brw_context *brw,
        mt->total_width, mt->total_height, mt->cpp);
 }
 
-void
-brw_miptree_layout(struct brw_context *brw,
-                   struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
-                   uint32_t layout_flags)
+static void
+intel_miptree_set_alignment(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt,
+                            uint32_t layout_flags)
 {
    bool gen6_hiz_or_stencil = false;
 
-   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
-
    if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
       const GLenum base_format = _mesa_get_format_base_format(mt->format);
       gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
@@ -805,7 +802,17 @@ brw_miptree_layout(struct brw_context *brw,
          intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
       mt->align_h = intel_vertical_texture_alignment_unit(brw, mt);
    }
+}
 
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
+                   uint32_t layout_flags)
+{
+   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
+
+   intel_miptree_set_alignment(brw, mt, layout_flags);
    intel_miptree_set_total_width_height(brw, mt);
 
    if (!mt->total_width || !mt->total_height) {

From c9dbdc08b9de016ab3b076feac3df4c81009996e Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 14 Apr 2015 22:06:49 -0700
Subject: [PATCH 0075/1208] i965/gen9: Plugin the code for selecting YF/YS
 tiling on skl+

Buffers with Yf/Ys tiling end up using meta upload / download
paths or the blitter for cases where they used tiled_memcpy paths
in case of Y tiling. This has exposed some bugs in meta path. To
avoid any piglit regressions on SKL this patch keeps the Yf/Ys
tiling disabled at the moment.

V3: Make brw_miptree_choose_tr_mode() actually choose TRMODE. (Ben)
    Few cosmetic changes.
V4: Get rid of brw_miptree_choose_tr_mode().
    Take care of all tile resource modes {Yf, Ys, none} for all
    generations at one place.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 100 +++++++++++++++++----
 1 file changed, 81 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index e35cb645c24..45bbf7e7d0f 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -810,27 +810,89 @@ brw_miptree_layout(struct brw_context *brw,
                    enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags)
 {
-   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
-
-   intel_miptree_set_alignment(brw, mt, layout_flags);
-   intel_miptree_set_total_width_height(brw, mt);
-
-   if (!mt->total_width || !mt->total_height) {
-      intel_miptree_release(&mt);
-      return;
-   }
-
-   /* On Gen9+ the alignment values are expressed in multiples of the block
-    * size
+   const unsigned bpp = mt->cpp * 8;
+   /* Enable YF/YS tiling only for color surfaces because depth and
+    * stencil surfaces are not supported in blitter using fast copy
+    * blit and meta PBO upload, download paths. No other paths
+    * currently support Yf/Ys tiled surfaces.
+    * FINISHME:  Remove this restriction once we have a tiled_memcpy()
+    * path to do depth/stencil data upload/download to Yf/Ys tiled
+    * surfaces.
     */
-   if (brw->gen >= 9) {
-      unsigned int i, j;
-      _mesa_get_format_block_size(mt->format, &i, &j);
-      mt->align_w /= i;
-      mt->align_h /= j;
-   }
+   const bool is_tr_mode_yf_ys_allowed =
+      brw->gen >= 9 &&
+      !(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
+      !mt->compressed &&
+      _mesa_is_format_color_format(mt->format) &&
+      (requested == INTEL_MIPTREE_TILING_Y ||
+       requested == INTEL_MIPTREE_TILING_ANY) &&
+      (bpp && is_power_of_two(bpp)) &&
+      /* FIXME: To avoid piglit regressions keep the Yf/Ys tiling
+       * disabled at the moment.
+       */
+      false;
+
+   /* Lower index (Yf) is the higher priority mode */
+   const uint32_t tr_mode[3] = {INTEL_MIPTREE_TRMODE_YF,
+                                INTEL_MIPTREE_TRMODE_YS,
+                                INTEL_MIPTREE_TRMODE_NONE};
+   int i = is_tr_mode_yf_ys_allowed ? 0 : ARRAY_SIZE(tr_mode) - 1;
+
+   while (i < ARRAY_SIZE(tr_mode)) {
+      if (brw->gen < 9)
+         assert(tr_mode[i] == INTEL_MIPTREE_TRMODE_NONE);
+      else
+         assert(tr_mode[i] == INTEL_MIPTREE_TRMODE_YF ||
+                tr_mode[i] == INTEL_MIPTREE_TRMODE_YS ||
+                tr_mode[i] == INTEL_MIPTREE_TRMODE_NONE);
+
+      mt->tr_mode = tr_mode[i];
+      intel_miptree_set_alignment(brw, mt, layout_flags);
+      intel_miptree_set_total_width_height(brw, mt);
+
+      if (!mt->total_width || !mt->total_height) {
+         intel_miptree_release(&mt);
+         break;
+      }
+
+      /* On Gen9+ the alignment values are expressed in multiples of the
+       * block size.
+       */
+      if (brw->gen >= 9) {
+         unsigned int i, j;
+         _mesa_get_format_block_size(mt->format, &i, &j);
+         mt->align_w /= i;
+         mt->align_h /= j;
+      }
+
+      /* If there is already a BO, we cannot effect tiling modes */
+      if (layout_flags & MIPTREE_LAYOUT_FOR_BO)
+         break;
 
-   if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
       mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
+      if (is_tr_mode_yf_ys_allowed) {
+         unsigned int level = 0;
+
+         if (mt->tiling == I915_TILING_Y ||
+             mt->tiling == (I915_TILING_Y | I915_TILING_X) ||
+             mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE) {
+            /* FIXME: Don't allow YS tiling at the moment. Using 64KB tiling
+             * for small textures might result in to memory wastage. Revisit
+             * this condition when we have more information about the specific
+             * cases where using YS over YF will be useful.
+             */
+            if (mt->tr_mode != INTEL_MIPTREE_TRMODE_YS)
+               break;
+         }
+         /* Failed to use selected tr_mode. Free up the memory allocated
+          * for miptree levels in intel_miptree_total_width_height().
+          */
+         for (level = mt->first_level; level <= mt->last_level; level++) {
+            free(mt->level[level].slice);
+            mt->level[level].slice = NULL;
+         }
+      }
+      i++;
+   }
 }
 

From 385cd3e0bed8113659f2db8976b677b090acc9d8 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 5 Jun 2015 10:41:24 -0700
Subject: [PATCH 0076/1208] i965: Make a helper function
 intel_miptree_release_levels()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 45bbf7e7d0f..fc7454b7ef7 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -804,6 +804,17 @@ intel_miptree_set_alignment(struct brw_context *brw,
    }
 }
 
+static void
+intel_miptree_release_levels(struct intel_mipmap_tree *mt)
+{
+   unsigned int level = 0;
+
+   for (level = mt->first_level; level <= mt->last_level; level++) {
+      free(mt->level[level].slice);
+      mt->level[level].slice = NULL;
+   }
+}
+
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
@@ -871,8 +882,6 @@ brw_miptree_layout(struct brw_context *brw,
 
       mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
       if (is_tr_mode_yf_ys_allowed) {
-         unsigned int level = 0;
-
          if (mt->tiling == I915_TILING_Y ||
              mt->tiling == (I915_TILING_Y | I915_TILING_X) ||
              mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE) {
@@ -887,10 +896,7 @@ brw_miptree_layout(struct brw_context *brw,
          /* Failed to use selected tr_mode. Free up the memory allocated
           * for miptree levels in intel_miptree_total_width_height().
           */
-         for (level = mt->first_level; level <= mt->last_level; level++) {
-            free(mt->level[level].slice);
-            mt->level[level].slice = NULL;
-         }
+         intel_miptree_release_levels(mt);
       }
       i++;
    }

From a1afd59662449803fa4a40a79bdf0db16ffcbcf5 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 5 Jun 2015 10:56:40 -0700
Subject: [PATCH 0077/1208] i965: Make a helper function
 intel_miptree_can_use_tr_mode()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 30 ++++++++++++++--------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index fc7454b7ef7..389834f012a 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -815,6 +815,23 @@ intel_miptree_release_levels(struct intel_mipmap_tree *mt)
    }
 }
 
+static bool
+intel_miptree_can_use_tr_mode(const struct intel_mipmap_tree *mt)
+{
+   if (mt->tiling == I915_TILING_Y ||
+       mt->tiling == (I915_TILING_Y | I915_TILING_X) ||
+       mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE) {
+      /* FIXME: Don't allow YS tiling at the moment. Using 64KB tiling
+       * for small textures might result in to memory wastage. Revisit
+       * this condition when we have more information about the specific
+       * cases where using YS over YF will be useful.
+       */
+      if (mt->tr_mode != INTEL_MIPTREE_TRMODE_YS)
+         return true;
+   }
+   return false;
+}
+
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
@@ -882,17 +899,8 @@ brw_miptree_layout(struct brw_context *brw,
 
       mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
       if (is_tr_mode_yf_ys_allowed) {
-         if (mt->tiling == I915_TILING_Y ||
-             mt->tiling == (I915_TILING_Y | I915_TILING_X) ||
-             mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE) {
-            /* FIXME: Don't allow YS tiling at the moment. Using 64KB tiling
-             * for small textures might result in to memory wastage. Revisit
-             * this condition when we have more information about the specific
-             * cases where using YS over YF will be useful.
-             */
-            if (mt->tr_mode != INTEL_MIPTREE_TRMODE_YS)
-               break;
-         }
+         if (intel_miptree_can_use_tr_mode(mt))
+            break;
          /* Failed to use selected tr_mode. Free up the memory allocated
           * for miptree levels in intel_miptree_total_width_height().
           */

From 69ee316c1daf93b4a53b1c02301ffe9df9598d28 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 14 Apr 2015 22:06:48 -0700
Subject: [PATCH 0078/1208] i965/gen9: Allocate YF/YS tiled buffer objects

In case of I915_TILING_{X,Y} we need to pass tiling format to libdrm
using drm_intel_bo_alloc_tiled(). But, In case of YF/YS tiled buffers
libdrm need not know about the tiling format because these buffers
don't have hardware support to be tiled or detiled through a fenced
region. libdrm still need to know buffer alignment value for its use
in kernel when resolving the relocation.

Using drm_intel_bo_alloc_for_render() for YF/YS tiled buffers
satisfy both the above conditions.

V2: Delete min/max buffer size restrictions not valid for i965+.
    Remove redundant align to tile size statements.
    Remove some redundant code now when there are no min/max buffer size.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 65 ++++++++++++++++++-
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 31386b99656..fb896a92263 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -558,6 +558,53 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
    }
 }
 
+/* This function computes Yf/Ys tiled bo size, alignment and pitch. */
+static uint64_t
+intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt, unsigned *alignment,
+                        uint64_t *pitch)
+{
+   const uint32_t bpp = mt->cpp * 8;
+   const uint32_t aspect_ratio = (bpp == 16 || bpp == 64) ? 2 : 1;
+   uint32_t tile_width, tile_height;
+   uint64_t stride, size, aligned_y;
+
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
+
+   switch (bpp) {
+   case 8:
+      tile_height = 64;
+      break;
+   case 16:
+   case 32:
+      tile_height = 32;
+      break;
+   case 64:
+   case 128:
+      tile_height = 16;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      tile_height *= 4;
+
+   aligned_y = ALIGN(mt->total_height, tile_height);
+   stride = mt->total_width * mt->cpp;
+   tile_width = tile_height * mt->cpp * aspect_ratio;
+   stride = ALIGN(stride, tile_width);
+   size = stride * aligned_y;
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YF) {
+      assert(size % 4096 == 0);
+      *alignment = 4096;
+   } else {
+      assert(size % (64 * 1024) == 0);
+      *alignment = 64 * 1024;
+   }
+   *pitch = stride;
+   return size;
+}
 
 struct intel_mipmap_tree *
 intel_miptree_create(struct brw_context *brw,
@@ -616,10 +663,22 @@ intel_miptree_create(struct brw_context *brw,
       alloc_flags |= BO_ALLOC_FOR_RENDER;
 
    unsigned long pitch;
-   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", total_width,
-                                     total_height, mt->cpp, &mt->tiling,
-                                     &pitch, alloc_flags);
    mt->etc_format = etc_format;
+
+   if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      unsigned alignment = 0;
+      unsigned long size;
+      size = intel_get_yf_ys_bo_size(mt, &alignment, &pitch);
+      assert(size);
+      mt->bo = drm_intel_bo_alloc_for_render(brw->bufmgr, "miptree",
+                                             size, alignment);
+   } else {
+      mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
+                                        total_width, total_height, mt->cpp,
+                                        &mt->tiling, &pitch,
+                                        alloc_flags);
+   }
+
    mt->pitch = pitch;
 
    /* If the BO is too large to fit in the aperture, we need to use the

From 7f282d05a11e0c29bddc1fac8c7028c7e823234f Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 5 Jun 2015 19:18:19 -0700
Subject: [PATCH 0079/1208] mesa: Add a new helper function
 _mesa_regions_overlap()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/blit.c | 26 ++++++++++++++++++++++++++
 src/mesa/main/blit.h |  6 ++++++
 2 files changed, 32 insertions(+)

diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index db8fee5a414..4765198d63a 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -37,6 +37,7 @@
 #include "framebuffer.h"
 #include "glformats.h"
 #include "mtypes.h"
+#include "macros.h"
 #include "state.h"
 
 
@@ -58,6 +59,31 @@ find_attachment(const struct gl_framebuffer *fb,
 }
 
 
+/**
+ * \return true if two regions overlap, false otherwise
+ */
+bool
+_mesa_regions_overlap(int srcX0, int srcY0,
+                      int srcX1, int srcY1,
+                      int dstX0, int dstY0,
+                      int dstX1, int dstY1)
+{
+   if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1))
+      return false; /* dst completely right of src */
+
+   if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1))
+      return false; /* dst completely left of src */
+
+   if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1))
+      return false; /* dst completely above src */
+
+   if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1))
+      return false; /* dst completely below src */
+
+   return true; /* some overlap */
+}
+
+
 /**
  * Helper function for checking if the datatypes of color buffers are
  * compatible for glBlitFramebuffer.  From the 3.1 spec, page 198:
diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h
index 54b946e3192..88dd4a9ec8d 100644
--- a/src/mesa/main/blit.h
+++ b/src/mesa/main/blit.h
@@ -28,6 +28,12 @@
 
 #include "glheader.h"
 
+extern bool
+_mesa_regions_overlap(int srcX0, int srcY0,
+                      int srcX1, int srcY1,
+                      int dstX0, int dstY0,
+                      int dstX1, int dstY1);
+
 extern void
 _mesa_blit_framebuffer(struct gl_context *ctx,
                        struct gl_framebuffer *readFb,

From 2a397c7958089f766aa0d3c66016742fdf7494dd Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 5 Jun 2015 19:23:46 -0700
Subject: [PATCH 0080/1208] mesa/st: Use global function
 _mesa_regions_overlap()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.c | 30 +++--------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index a6a98c83aa6..e736d4b5083 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -33,6 +33,7 @@
 #include "main/imports.h"
 #include "main/image.h"
 #include "main/bufferobj.h"
+#include "main/blit.h"
 #include "main/format_pack.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
@@ -1312,31 +1313,6 @@ st_get_color_read_renderbuffer(struct gl_context *ctx)
 }
 
 
-/**
- * \return TRUE if two regions overlap, FALSE otherwise
- */
-static boolean
-regions_overlap(int srcX0, int srcY0,
-                int srcX1, int srcY1,
-                int dstX0, int dstY0,
-                int dstX1, int dstY1)
-{
-   if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1))
-      return FALSE; /* src completely left of dst */
-
-   if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1))
-      return FALSE; /* dst completely left of src */
-
-   if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1))
-      return FALSE; /* src completely above dst */
-
-   if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1))
-      return FALSE; /* dst completely above src */
-
-   return TRUE; /* some overlap */
-}
-
-
 /**
  * Try to do a glCopyPixels for simple cases with a blit by calling
  * pipe->blit().
@@ -1420,8 +1396,8 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
       }
 
       if (rbRead != rbDraw ||
-          !regions_overlap(readX, readY, readX + readW, readY + readH,
-                           drawX, drawY, drawX + drawW, drawY + drawH)) {
+          !_mesa_regions_overlap(readX, readY, readX + readW, readY + readH,
+                                 drawX, drawY, drawX + drawW, drawY + drawH)) {
          struct pipe_blit_info blit;
 
          memset(&blit, 0, sizeof(blit));

From ca21c9ab28df24ef015ead28df1dcccd90387df6 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 9 Jun 2015 15:18:13 -0700
Subject: [PATCH 0081/1208] mesa/swrast: Use global function
 _mesa_regions_overlap()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/swrast/s_copypix.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c
index 68c83e44e12..8fde0c29540 100644
--- a/src/mesa/swrast/s_copypix.c
+++ b/src/mesa/swrast/s_copypix.c
@@ -27,6 +27,7 @@
 #include "main/context.h"
 #include "main/condrender.h"
 #include "main/macros.h"
+#include "main/blit.h"
 #include "main/pixeltransfer.h"
 #include "main/imports.h"
 
@@ -52,19 +53,8 @@ regions_overlap(GLint srcx, GLint srcy,
                 GLfloat zoomX, GLfloat zoomY)
 {
    if (zoomX == 1.0 && zoomY == 1.0) {
-      /* no zoom */
-      if (srcx >= dstx + width || (srcx + width <= dstx)) {
-         return GL_FALSE;
-      }
-      else if (srcy < dsty) { /* this is OK */
-         return GL_FALSE;
-      }
-      else if (srcy > dsty + height) {
-         return GL_FALSE;
-      }
-      else {
-         return GL_TRUE;
-      }
+      return _mesa_regions_overlap(srcx, srcy, srcx + width, srcy + height,
+                                   dstx, dsty, dstx + width, dsty + height);
    }
    else {
       /* add one pixel of slop when zooming, just to be safe */

From 412c8c8e7eaeec2763bb21a30626544b5a711cb2 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 14 Apr 2015 22:06:49 -0700
Subject: [PATCH 0082/1208] i965/gen9: Add XY_FAST_COPY_BLT support to
 intelEmitCopyBlit()

This patch enables using XY_FAST_COPY_BLT only for Yf/Ys tiled buffers.
It can be later turned on for other tiling patterns (X,Y) too.

V3: Flush in between sequential fast copy blits.
    Fix src/dst alignment requirements.
    Make can_fast_copy_blit() helper.
    Use ffs(), is_power_of_two()
    Move overlap computation inside intel_miptree_blit().

V4: Use _mesa_regions_overlap() function.
    Add check for src_buffer == dst_buffer.
    Simplify horizontal and vertical alignment computations.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_blit.c       | 302 ++++++++++++++++---
 src/mesa/drivers/dri/i965/intel_blit.h       |  28 +-
 src/mesa/drivers/dri/i965/intel_copy_image.c |   2 +
 src/mesa/drivers/dri/i965/intel_reg.h        |  16 +
 4 files changed, 286 insertions(+), 62 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 9fac63d56a1..c773cbca974 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -27,6 +27,7 @@
 
 
 #include "main/mtypes.h"
+#include "main/blit.h"
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/colormac.h"
@@ -43,6 +44,23 @@
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
+#define SET_TILING_XY_FAST_COPY_BLT(tiling, tr_mode, type)           \
+({                                                                   \
+   switch (tiling) {                                                 \
+   case I915_TILING_X:                                               \
+      CMD |= type ## _TILED_X;                                       \
+      break;                                                         \
+   case I915_TILING_Y:                                               \
+      if (tr_mode == INTEL_MIPTREE_TRMODE_YS)                        \
+         CMD |= type ## _TILED_64K;                                  \
+      else                                                           \
+         CMD |= type ## _TILED_Y;                                    \
+      break;                                                         \
+   default:                                                          \
+      unreachable("not reached");                                    \
+   }                                                                 \
+})
+
 static void
 intel_miptree_set_alpha_to_one(struct brw_context *brw,
                                struct intel_mipmap_tree *mt,
@@ -75,6 +93,10 @@ static uint32_t
 br13_for_cpp(int cpp)
 {
    switch (cpp) {
+   case 16:
+      return BR13_32323232;
+   case 8:
+      return BR13_16161616;
    case 4:
       return BR13_8888;
    case 2:
@@ -86,6 +108,64 @@ br13_for_cpp(int cpp)
    }
 }
 
+static uint32_t
+get_tr_horizontal_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
+   /* Alignment tables for YF/YS tiled surfaces. */
+   const uint32_t align_2d_yf[] = {64, 64, 32, 32, 16};
+   const uint32_t bpp = cpp * 8;
+   const uint32_t shift = is_src ? 17 : 10;
+   uint32_t align;
+   int i = 0;
+
+   if (tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return 0;
+
+   /* Compute array index. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   i = ffs(bpp / 8) - 1;
+
+   align = tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+           align_2d_yf[i] :
+           4 * align_2d_yf[i];
+
+   assert(is_power_of_two(align));
+
+   /* XY_FAST_COPY_BLT doesn't support horizontal alignment of 16. */
+   if (align == 16)
+      align = 32;
+
+   return (ffs(align) - 6) << shift;
+}
+
+static uint32_t
+get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
+   /* Vertical alignment tables for YF/YS tiled surfaces. */
+   const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
+   const uint32_t bpp = cpp * 8;
+   const uint32_t shift = is_src ? 15 : 8;
+   uint32_t align;
+   int i = 0;
+
+   if (tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return 0;
+
+   /* Compute array index. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   i = ffs(bpp / 8) - 1;
+
+   align = tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+           align_2d_yf[i] :
+           4 * align_2d_yf[i];
+
+   assert(is_power_of_two(align));
+
+   /* XY_FAST_COPY_BLT doesn't support vertical alignments of 16 and 32. */
+   if (align == 16 || align == 32)
+      align = 64;
+
+   return (ffs(align) - 7) << shift;
+}
+
 /**
  * Emits the packet for switching the blitter from X to Y tiled or back.
  *
@@ -278,9 +358,11 @@ intel_miptree_blit(struct brw_context *brw,
                           src_pitch,
                           src_mt->bo, src_mt->offset,
                           src_mt->tiling,
+                          src_mt->tr_mode,
                           dst_mt->pitch,
                           dst_mt->bo, dst_mt->offset,
                           dst_mt->tiling,
+                          dst_mt->tr_mode,
                           src_x, src_y,
                           dst_x, dst_y,
                           width, height,
@@ -313,6 +395,67 @@ alignment_valid(struct brw_context *brw, unsigned offset, uint32_t tiling)
    return true;
 }
 
+static bool
+can_fast_copy_blit(struct brw_context *brw,
+		   drm_intel_bo *src_buffer,
+                   int16_t src_x, int16_t src_y,
+                   uintptr_t src_offset, uint32_t src_pitch,
+                   uint32_t src_tiling, uint32_t src_tr_mode,
+		   drm_intel_bo *dst_buffer,
+                   int16_t dst_x, int16_t dst_y,
+                   uintptr_t dst_offset, uint32_t dst_pitch,
+                   uint32_t dst_tiling, uint32_t dst_tr_mode,
+                   int16_t w, int16_t h, uint32_t cpp)
+{
+   const bool dst_tiling_none = dst_tiling == I915_TILING_NONE;
+   const bool src_tiling_none = src_tiling == I915_TILING_NONE;
+
+   if (brw->gen < 9)
+      return false;
+
+   if (src_buffer->handle == dst_buffer->handle &&
+       _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h,
+                             dst_x, dst_y, dst_x + w, dst_y + h))
+      return false;
+
+   /* Enable fast copy blit only if the surfaces are Yf/Ys tiled.
+    * FIXME: Based on performance data, remove this condition later to
+    * enable for all types of surfaces.
+    */
+   if (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
+       dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return false;
+
+   /* For all surface types buffers must be cacheline-aligned. */
+   if ((dst_offset | src_offset) & 63)
+      return false;
+
+   /* Color depth greater than 128 bits not supported. */
+   if (cpp > 16)
+      return false;
+
+   /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
+    * of the destination pitch must be zero.
+    */
+   if ((src_pitch >> 15 & 1) != 0 || (dst_pitch >> 15 & 1) != 0)
+      return false;
+
+   /* For Linear surfaces, the pitch has to be an OWord (16byte) multiple. */
+   if ((src_tiling_none && src_pitch % 16 != 0) ||
+       (dst_tiling_none && dst_pitch % 16 != 0))
+      return false;
+
+   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
+    * (X direction width of the Tile). This means the pitch value will
+    * always be Cache Line aligned (64byte multiple).
+    */
+   if ((!dst_tiling_none && dst_pitch % 64 != 0) ||
+       (!src_tiling_none && src_pitch % 64 != 0))
+      return false;
+
+   return true;
+}
+
 /* Copy BitBlt
  */
 bool
@@ -322,10 +465,12 @@ intelEmitCopyBlit(struct brw_context *brw,
 		  drm_intel_bo *src_buffer,
 		  GLuint src_offset,
 		  uint32_t src_tiling,
+		  uint32_t src_tr_mode,
 		  GLshort dst_pitch,
 		  drm_intel_bo *dst_buffer,
 		  GLuint dst_offset,
 		  uint32_t dst_tiling,
+		  uint32_t dst_tr_mode,
 		  GLshort src_x, GLshort src_y,
 		  GLshort dst_x, GLshort dst_y,
 		  GLshort w, GLshort h,
@@ -337,18 +482,11 @@ intelEmitCopyBlit(struct brw_context *brw,
    drm_intel_bo *aper_array[3];
    bool dst_y_tiled = dst_tiling == I915_TILING_Y;
    bool src_y_tiled = src_tiling == I915_TILING_Y;
-
-   if (!alignment_valid(brw, dst_offset, dst_tiling))
-      return false;
-   if (!alignment_valid(brw, src_offset, src_tiling))
-      return false;
+   bool use_fast_copy_blit = false;
 
    if ((dst_y_tiled || src_y_tiled) && brw->gen < 6)
       return false;
 
-   assert(!dst_y_tiled || (dst_pitch % 128) == 0);
-   assert(!src_y_tiled || (src_pitch % 128) == 0);
-
    /* do space check before going any further */
    do {
        aper_array[0] = brw->batch.bo;
@@ -373,52 +511,114 @@ intelEmitCopyBlit(struct brw_context *brw,
        src_buffer, src_pitch, src_offset, src_x, src_y,
        dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
 
-   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
-    * the low bits.  Offsets must be naturally aligned.
-    */
-   if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
-       dst_pitch % 4 != 0 || dst_offset % cpp != 0)
-      return false;
+   use_fast_copy_blit = can_fast_copy_blit(brw,
+                                           src_buffer,
+                                           src_x, src_y,
+                                           src_offset, src_pitch,
+                                           src_tiling, src_tr_mode,
+                                           dst_buffer,
+                                           dst_x, dst_y,
+                                           dst_offset, dst_pitch,
+                                           dst_tiling, dst_tr_mode,
+                                           w, h, cpp);
+   assert(use_fast_copy_blit ||
+          (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
+           dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE));
 
-   /* For big formats (such as floating point), do the copy using 16 or 32bpp
-    * and multiply the coordinates.
-    */
-   if (cpp > 4) {
-      if (cpp % 4 == 2) {
-         dst_x *= cpp / 2;
-         dst_x2 *= cpp / 2;
-         src_x *= cpp / 2;
-         cpp = 2;
-      } else {
-         assert(cpp % 4 == 0);
-         dst_x *= cpp / 4;
-         dst_x2 *= cpp / 4;
-         src_x *= cpp / 4;
-         cpp = 4;
+   if (use_fast_copy_blit) {
+      /* When two sequential fast copy blits have different source surfaces,
+       * but their destinations refer to the same destination surfaces and
+       * therefore destinations overlap it is imperative that a flush be
+       * inserted between the two blits.
+       *
+       * FIXME: Figure out a way to avoid flushing when not required.
+       */
+      brw_emit_mi_flush(brw);
+
+      assert(cpp <= 16);
+      BR13 = br13_for_cpp(cpp);
+
+      if (src_tr_mode == INTEL_MIPTREE_TRMODE_YF)
+         BR13 |= XY_FAST_SRC_TRMODE_YF;
+
+      if (dst_tr_mode == INTEL_MIPTREE_TRMODE_YF)
+         BR13 |= XY_FAST_DST_TRMODE_YF;
+
+      CMD = XY_FAST_COPY_BLT_CMD;
+
+      if (dst_tiling != I915_TILING_NONE) {
+         SET_TILING_XY_FAST_COPY_BLT(dst_tiling, dst_tr_mode, XY_FAST_DST);
+         /* Pitch value should be specified as a number of Dwords. */
+         dst_pitch /= 4;
+      }
+      if (src_tiling != I915_TILING_NONE) {
+         SET_TILING_XY_FAST_COPY_BLT(src_tiling, src_tr_mode, XY_FAST_SRC);
+         /* Pitch value should be specified as a number of Dwords. */
+         src_pitch /= 4;
       }
-   }
 
-   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;
+      CMD |= get_tr_horizontal_align(src_tr_mode, cpp, true /* is_src */);
+      CMD |= get_tr_vertical_align(src_tr_mode, cpp, true /* is_src */);
 
-   switch (cpp) {
-   case 1:
-   case 2:
-      CMD = XY_SRC_COPY_BLT_CMD;
-      break;
-   case 4:
-      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return false;
-   }
+      CMD |= get_tr_horizontal_align(dst_tr_mode, cpp, false /* is_src */);
+      CMD |= get_tr_vertical_align(dst_tr_mode, cpp, false /* is_src */);
 
-   if (dst_tiling != I915_TILING_NONE) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-   if (src_tiling != I915_TILING_NONE) {
-      CMD |= XY_SRC_TILED;
-      src_pitch /= 4;
+   } else {
+      assert(!dst_y_tiled || (dst_pitch % 128) == 0);
+      assert(!src_y_tiled || (src_pitch % 128) == 0);
+
+      /* For big formats (such as floating point), do the copy using 16 or
+       * 32bpp and multiply the coordinates.
+       */
+      if (cpp > 4) {
+         if (cpp % 4 == 2) {
+            dst_x *= cpp / 2;
+            dst_x2 *= cpp / 2;
+            src_x *= cpp / 2;
+            cpp = 2;
+         } else {
+            assert(cpp % 4 == 0);
+            dst_x *= cpp / 4;
+            dst_x2 *= cpp / 4;
+            src_x *= cpp / 4;
+            cpp = 4;
+         }
+      }
+
+      if (!alignment_valid(brw, dst_offset, dst_tiling))
+         return false;
+      if (!alignment_valid(brw, src_offset, src_tiling))
+         return false;
+
+      /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
+       * the low bits.  Offsets must be naturally aligned.
+       */
+      if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
+          dst_pitch % 4 != 0 || dst_offset % cpp != 0)
+         return false;
+
+      assert(cpp <= 4);
+      BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;
+      switch (cpp) {
+      case 1:
+      case 2:
+         CMD = XY_SRC_COPY_BLT_CMD;
+         break;
+      case 4:
+         CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+         break;
+      default:
+         return false;
+      }
+
+      if (dst_tiling != I915_TILING_NONE) {
+         CMD |= XY_DST_TILED;
+         dst_pitch /= 4;
+      }
+      if (src_tiling != I915_TILING_NONE) {
+         CMD |= XY_SRC_TILED;
+         src_pitch /= 4;
+      }
    }
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
@@ -576,7 +776,9 @@ intel_emit_linear_blit(struct brw_context *brw,
    dst_x = dst_offset % 64;
    ok = intelEmitCopyBlit(brw, 1,
 			  pitch, src_bo, src_offset - src_x, I915_TILING_NONE,
+                          INTEL_MIPTREE_TRMODE_NONE,
 			  pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE,
+                          INTEL_MIPTREE_TRMODE_NONE,
 			  src_x, 0, /* src x/y */
 			  dst_x, 0, /* dst x/y */
 			  pitch, height, /* w, h */
@@ -595,7 +797,9 @@ intel_emit_linear_blit(struct brw_context *brw,
    if (size != 0) {
       ok = intelEmitCopyBlit(brw, 1,
 			     pitch, src_bo, src_offset - src_x, I915_TILING_NONE,
+                             INTEL_MIPTREE_TRMODE_NONE,
 			     pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE,
+                             INTEL_MIPTREE_TRMODE_NONE,
 			     src_x, 0, /* src x/y */
 			     dst_x, 0, /* dst x/y */
 			     size, 1, /* w, h */
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index 2287c379c4e..c3d19a5a20e 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -32,19 +32,21 @@
 
 bool
 intelEmitCopyBlit(struct brw_context *brw,
-                              GLuint cpp,
-                              GLshort src_pitch,
-                              drm_intel_bo *src_buffer,
-                              GLuint src_offset,
-			      uint32_t src_tiling,
-                              GLshort dst_pitch,
-                              drm_intel_bo *dst_buffer,
-                              GLuint dst_offset,
-			      uint32_t dst_tiling,
-                              GLshort srcx, GLshort srcy,
-                              GLshort dstx, GLshort dsty,
-                              GLshort w, GLshort h,
-			      GLenum logicop );
+                  GLuint cpp,
+                  GLshort src_pitch,
+                  drm_intel_bo *src_buffer,
+                  GLuint src_offset,
+                  uint32_t src_tiling,
+                  uint32_t src_tr_mode,
+                  GLshort dst_pitch,
+                  drm_intel_bo *dst_buffer,
+                  GLuint dst_offset,
+                  uint32_t dst_tiling,
+                  uint32_t dst_tr_mode,
+                  GLshort srcx, GLshort srcy,
+                  GLshort dstx, GLshort dsty,
+                  GLshort w, GLshort h,
+                  GLenum logicop);
 
 bool intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst);
 
diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index f4c7eff2904..3706704bf1a 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -126,9 +126,11 @@ copy_image_with_blitter(struct brw_context *brw,
                             src_mt->pitch,
                             src_mt->bo, src_mt->offset,
                             src_mt->tiling,
+                            src_mt->tr_mode,
                             dst_mt->pitch,
                             dst_mt->bo, dst_mt->offset,
                             dst_mt->tiling,
+                            dst_mt->tr_mode,
                             src_x, src_y,
                             dst_x, dst_y,
                             src_width, src_height,
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index bd14e189da3..4223e11e78c 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -102,6 +102,8 @@
 
 #define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22))
 
+#define XY_FAST_COPY_BLT_CMD             (CMD_2D | (0x42 << 22))
+
 #define XY_TEXT_IMMEDIATE_BLIT_CMD	(CMD_2D | (0x31 << 22))
 # define XY_TEXT_BYTE_PACKED		(1 << 16)
 
@@ -111,10 +113,24 @@
 #define XY_SRC_TILED		(1 << 15)
 #define XY_DST_TILED		(1 << 11)
 
+/* BR00 */
+#define XY_FAST_SRC_TILED_64K        (3 << 20)
+#define XY_FAST_SRC_TILED_Y          (2 << 20)
+#define XY_FAST_SRC_TILED_X          (1 << 20)
+
+#define XY_FAST_DST_TILED_64K        (3 << 13)
+#define XY_FAST_DST_TILED_Y          (2 << 13)
+#define XY_FAST_DST_TILED_X          (1 << 13)
+
 /* BR13 */
 #define BR13_8			(0x0 << 24)
 #define BR13_565		(0x1 << 24)
 #define BR13_8888		(0x3 << 24)
+#define BR13_16161616		(0x4 << 24)
+#define BR13_32323232		(0x5 << 24)
+
+#define XY_FAST_SRC_TRMODE_YF        (1 << 31)
+#define XY_FAST_DST_TRMODE_YF        (1 << 30)
 
 /* Pipeline Statistics Counter Registers */
 #define IA_VERTICES_COUNT               0x2310

From 3df5aaaa158bfb878e9e5ce467dd654466942880 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 27 May 2015 19:28:34 -0700
Subject: [PATCH 0083/1208] i965/skl: Extract the blit command setup in to a
 helper

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_blit.c | 93 +++++++++++++++++---------
 1 file changed, 61 insertions(+), 32 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index c773cbca974..bc390535c86 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -456,6 +456,51 @@ can_fast_copy_blit(struct brw_context *brw,
    return true;
 }
 
+static uint32_t
+xy_blit_cmd(uint32_t src_tiling, uint32_t src_tr_mode,
+            uint32_t dst_tiling, uint32_t dst_tr_mode,
+            uint32_t cpp, bool use_fast_copy_blit)
+{
+   uint32_t CMD = 0;
+
+   if (use_fast_copy_blit) {
+      CMD = XY_FAST_COPY_BLT_CMD;
+
+      if (dst_tiling != I915_TILING_NONE)
+         SET_TILING_XY_FAST_COPY_BLT(dst_tiling, dst_tr_mode, XY_FAST_DST);
+
+      if (src_tiling != I915_TILING_NONE)
+         SET_TILING_XY_FAST_COPY_BLT(src_tiling, src_tr_mode, XY_FAST_SRC);
+
+      CMD |= get_tr_horizontal_align(src_tr_mode, cpp, true /* is_src */);
+      CMD |= get_tr_vertical_align(src_tr_mode, cpp, true /* is_src */);
+
+      CMD |= get_tr_horizontal_align(dst_tr_mode, cpp, false /* is_src */);
+      CMD |= get_tr_vertical_align(dst_tr_mode, cpp, false /* is_src */);
+
+   } else {
+      assert(cpp <= 4);
+      switch (cpp) {
+      case 1:
+      case 2:
+         CMD = XY_SRC_COPY_BLT_CMD;
+         break;
+      case 4:
+         CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      if (dst_tiling != I915_TILING_NONE)
+         CMD |= XY_DST_TILED;
+
+      if (src_tiling != I915_TILING_NONE)
+         CMD |= XY_SRC_TILED;
+   }
+   return CMD;
+}
+
 /* Copy BitBlt
  */
 bool
@@ -544,24 +589,18 @@ intelEmitCopyBlit(struct brw_context *brw,
       if (dst_tr_mode == INTEL_MIPTREE_TRMODE_YF)
          BR13 |= XY_FAST_DST_TRMODE_YF;
 
-      CMD = XY_FAST_COPY_BLT_CMD;
+      CMD = xy_blit_cmd(src_tiling, src_tr_mode,
+                        dst_tiling, dst_tr_mode,
+                        cpp, use_fast_copy_blit);
 
-      if (dst_tiling != I915_TILING_NONE) {
-         SET_TILING_XY_FAST_COPY_BLT(dst_tiling, dst_tr_mode, XY_FAST_DST);
-         /* Pitch value should be specified as a number of Dwords. */
+      /* For tiled source and destination, pitch value should be specified
+       * as a number of Dwords.
+       */
+      if (dst_tiling != I915_TILING_NONE)
          dst_pitch /= 4;
-      }
-      if (src_tiling != I915_TILING_NONE) {
-         SET_TILING_XY_FAST_COPY_BLT(src_tiling, src_tr_mode, XY_FAST_SRC);
-         /* Pitch value should be specified as a number of Dwords. */
+
+      if (src_tiling != I915_TILING_NONE)
          src_pitch /= 4;
-      }
-
-      CMD |= get_tr_horizontal_align(src_tr_mode, cpp, true /* is_src */);
-      CMD |= get_tr_vertical_align(src_tr_mode, cpp, true /* is_src */);
-
-      CMD |= get_tr_horizontal_align(dst_tr_mode, cpp, false /* is_src */);
-      CMD |= get_tr_vertical_align(dst_tr_mode, cpp, false /* is_src */);
 
    } else {
       assert(!dst_y_tiled || (dst_pitch % 128) == 0);
@@ -599,26 +638,16 @@ intelEmitCopyBlit(struct brw_context *brw,
 
       assert(cpp <= 4);
       BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;
-      switch (cpp) {
-      case 1:
-      case 2:
-         CMD = XY_SRC_COPY_BLT_CMD;
-         break;
-      case 4:
-         CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-         break;
-      default:
-         return false;
-      }
 
-      if (dst_tiling != I915_TILING_NONE) {
-         CMD |= XY_DST_TILED;
+      CMD = xy_blit_cmd(src_tiling, src_tr_mode,
+                        dst_tiling, dst_tr_mode,
+                        cpp, use_fast_copy_blit);
+
+      if (dst_tiling != I915_TILING_NONE)
          dst_pitch /= 4;
-      }
-      if (src_tiling != I915_TILING_NONE) {
-         CMD |= XY_SRC_TILED;
+
+      if (src_tiling != I915_TILING_NONE)
          src_pitch /= 4;
-      }
    }
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {

From 54afb10f0e0a3b72a977c239c0aee04ea5dec967 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 22:04:50 -0400
Subject: [PATCH 0084/1208] nv30: provide a minimum map buffer alignment

Otherwise we return 0, which is out of spec. Return 64 like all the
other nouveau drivers.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_screen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 2e38a1978ae..4b3ed7dc157 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -69,6 +69,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return PIPE_ENDIAN_LITTLE;
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
       return 16;
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
    case PIPE_CAP_MAX_VIEWPORTS:
       return 1;
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
@@ -135,7 +137,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
    case PIPE_CAP_START_INSTANCE:
    case PIPE_CAP_TEXTURE_MULTISAMPLE:
-   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
    case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
    case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_QUERY_PIPELINE_STATISTICS:

From b875198f1f0b7c90bcb22511c0050b06d8a33ac4 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 02:16:23 -0400
Subject: [PATCH 0085/1208] nv30: modernize fp upload logic

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nv30/nv30_fragprog.c      | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index 7f227868f73..dbf36fd53b5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -37,22 +37,26 @@ nv30_fragprog_upload(struct nv30_context *nv30)
    struct nouveau_context *nv = &nv30->base;
    struct nv30_fragprog *fp = nv30->fragprog.program;
    struct pipe_context *pipe = &nv30->base.pipe;
-   struct pipe_transfer *transfer;
-   uint32_t *map;
-   int i; (void)i;
 
-   if (unlikely(!fp->buffer)) {
+   if (unlikely(!fp->buffer))
       fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4);
-   }
 
-   map = pipe_buffer_map(pipe, fp->buffer, PIPE_TRANSFER_WRITE, &transfer);
 #ifndef PIPE_ARCH_BIG_ENDIAN
-   memcpy(map, fp->insn, fp->insn_len * 4);
+   pipe_buffer_write(pipe, fp->buffer, 0, fp->insn_len * 4, fp->insn);
 #else
-   for (i = 0; i < fp->insn_len; i++)
-      *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16);
+   {
+      struct pipe_transfer *transfer;
+      uint32_t *map;
+      int i;
+
+      map = pipe_buffer_map(pipe, fp->buffer,
+                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
+                            &transfer);
+      for (i = 0; i < fp->insn_len; i++)
+         *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16);
+      pipe_buffer_unmap(pipe, transfer);
+   }
 #endif
-   pipe_buffer_unmap(pipe, transfer);
 
    if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM)
       nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM);

From bad107f2ec24b16118f4d99c54b853277b1a966d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 02:38:38 -0400
Subject: [PATCH 0086/1208] nv30: reset fragprog bufctx at bind time

A clear will do a partial validate, which will in turn reference all the
buffers in the bufctx again. However the fragprog last validated might
have already been deleted. So reset the bufctx when updating state.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_fragprog.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index dbf36fd53b5..54f91bbd48b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -165,8 +165,15 @@ static void
 nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso)
 {
    struct nv30_context *nv30 = nv30_context(pipe);
+   struct nv30_fragprog *fp = hwcso;
 
-   nv30->fragprog.program = hwcso;
+   /* reset the bucftx so that we don't keep a dangling reference to the fp
+    * code
+    */
+   if (fp != nv30->state.fragprog)
+      PUSH_RESET(nv30->base.pushbuf, BUFCTX_FRAGPROG);
+
+   nv30->fragprog.program = fp;
    nv30->dirty |= NV30_NEW_FRAGPROG;
 }
 

From dacf9efd6326bed1166750680bfaa4e173315eba Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 21:58:11 -0400
Subject: [PATCH 0087/1208] nv30: allow vertex state creation with 0 elements

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_vbo.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index faa8812528a..adea1dcb77c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -202,6 +202,9 @@ nv30_vbo_validate(struct nv30_context *nv30)
       return;
 
    redefine = MAX2(vertex->num_elements, nv30->state.num_vtxelts);
+   if (redefine == 0)
+      return;
+
    BEGIN_NV04(push, NV30_3D(VTXFMT(0)), redefine);
 
    for (i = 0; i < vertex->num_elements; i++) {
@@ -254,8 +257,6 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
     struct translate_key transkey;
     unsigned i;
 
-    assert(num_elements);
-
     so = MALLOC(sizeof(*so) + sizeof(*so->element) * num_elements);
     if (!so)
         return NULL;

From 089e7c378838e7972d2c0588bb84a316fb929a59 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 29 Jun 2015 21:58:54 -0400
Subject: [PATCH 0088/1208] nv30: align transfer stride to 64, required by
 blit, sifm transfer impls

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_miptree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
index 1a4b8929c0f..846dcebc3a5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
@@ -242,8 +242,8 @@ nv30_miptree_transfer_map(struct pipe_context *pipe, struct pipe_resource *pt,
    tx->base.level = level;
    tx->base.usage = usage;
    tx->base.box = *box;
-   tx->base.stride = util_format_get_nblocksx(pt->format, box->width) *
-                     util_format_get_blocksize(pt->format);
+   tx->base.stride = align(util_format_get_nblocksx(pt->format, box->width) *
+                           util_format_get_blocksize(pt->format), 64);
    tx->base.layer_stride = util_format_get_nblocksy(pt->format, box->height) *
                            tx->base.stride;
 

From e22e0de0d7c3a412bdd53c6d53825b7646624e3d Mon Sep 17 00:00:00 2001
From: Alexander von Gluck IV <kallisti5@unixzen.com>
Date: Mon, 29 Jun 2015 23:29:44 -0500
Subject: [PATCH 0089/1208] egl/haiku: fix Mesa build under Haiku

Performing a goto crosses the initialization of 'BWindow* win'
breaking the build. We also fix a missing semicolon.
---
 src/egl/drivers/haiku/egl_haiku.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/egl/drivers/haiku/egl_haiku.cpp b/src/egl/drivers/haiku/egl_haiku.cpp
index 3d00e47c8e6..ef74f657b14 100644
--- a/src/egl/drivers/haiku/egl_haiku.cpp
+++ b/src/egl/drivers/haiku/egl_haiku.cpp
@@ -92,8 +92,11 @@ haiku_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
 		return NULL;
 	}
 
-	if (!_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT, conf, attrib_list))
-		goto cleanup_surface;
+	if (!_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT,
+		conf, attrib_list)) {
+		free(surface);
+		return NULL;
+	}
 
 	(&surface->surf)->SwapInterval = 1;
 
@@ -110,10 +113,6 @@ haiku_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
 	TRACE("Showing window\n");
 	win->Show();
 	return &surface->surf;
-
-cleanup_surface:
-	free(surface);
-	return NULL;
 }
 
 
@@ -139,7 +138,7 @@ haiku_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 	if (_eglPutSurface(surf)) {
 		// XXX: detach haiku_egl_surface::gl from the native window and destroy it
 		free(surf);
-        }
+	}
 	return EGL_TRUE;
 }
 
@@ -153,7 +152,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 	conf = (struct haiku_egl_config*) calloc(1, sizeof (*conf));
 	if (!conf) {
 		_eglError(EGL_BAD_ALLOC, "haiku_add_configs_for_visuals");
-		return NULL;
+		return EGL_FALSE;
 	}
 
 	_eglInitConfig(&conf->base, dpy, 1);
@@ -165,7 +164,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 	_eglSetConfigKey(&conf->base, EGL_LUMINANCE_SIZE, 0);
 	_eglSetConfigKey(&conf->base, EGL_ALPHA_SIZE, 8);
 	_eglSetConfigKey(&conf->base, EGL_COLOR_BUFFER_TYPE, EGL_RGB_BUFFER);
-	EGLint r = (_eglGetConfigKey(&conf->base, EGL_RED_SIZE) 
+	EGLint r = (_eglGetConfigKey(&conf->base, EGL_RED_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_GREEN_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_BLUE_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_ALPHA_SIZE));
@@ -195,7 +194,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 		goto cleanup;
 	}
 	TRACE("Validated config\n");
-   
+
 	_eglLinkConfig(&conf->base);
 	if (!_eglGetArraySize(dpy->Configs)) {
 		_eglLog(_EGL_WARNING, "Haiku: failed to create any config");
@@ -210,6 +209,7 @@ cleanup:
 	return EGL_FALSE;
 }
 
+
 extern "C"
 EGLBoolean
 init_haiku(_EGLDriver *drv, _EGLDisplay *dpy)
@@ -221,7 +221,7 @@ init_haiku(_EGLDriver *drv, _EGLDisplay *dpy)
 		return EGL_FALSE;
 
 	dpy->Version = 14;
-   
+
 	TRACE("Initialization finished\n");
 
 	return EGL_TRUE;
@@ -271,7 +271,7 @@ haiku_destroy_context(_EGLDriver* drv, _EGLDisplay *disp, _EGLContext* ctx)
 	if (_eglPutContext(ctx)) {
 		// XXX: teardown the context ?
 		free(context);
-		ctx = NULL
+		ctx = NULL;
 	}
 	return EGL_TRUE;
 }
@@ -280,7 +280,7 @@ haiku_destroy_context(_EGLDriver* drv, _EGLDisplay *disp, _EGLContext* ctx)
 extern "C"
 EGLBoolean
 haiku_make_current(_EGLDriver* drv, _EGLDisplay* dpy, _EGLSurface *dsurf,
-		  _EGLSurface *rsurf, _EGLContext *ctx)
+	_EGLSurface *rsurf, _EGLContext *ctx)
 {
 	CALLED();
 
@@ -314,7 +314,7 @@ extern "C"
 void
 haiku_unload(_EGLDriver* drv)
 {
-	
+
 }
 
 

From 21b7c58b8a0cbf18c9ed90c260f01d00fefe0db2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 23 Jun 2015 23:57:31 -0700
Subject: [PATCH 0090/1208] i965: Don't use GCC extension for ?: with only two
 operands.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From the "apparently I don't know C" files...GCC apparently supports:

    x ?: y

which is equivalent to

    x ? x : y

except that it doesn't cause side-effects to occur twice.  See:
https://gcc.gnu.org/onlinedocs/gcc/Conditionals.html#Conditionals

This was confusing and looked like a typo.  It doesn't really buy us
anything, so just write the obvious code in normal C.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/intel_fbo.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 9e6a7116630..05e3f8b7ae2 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -551,10 +551,12 @@ intel_renderbuffer_update_wrapper(struct brw_context *brw,
 
    irb->mt_layer = layer_multiplier * layer;
 
-   if (layered) {
-      irb->layer_count = image->TexObject->NumLayers ?: mt->level[level].depth / layer_multiplier;
-   } else {
+   if (!layered) {
       irb->layer_count = 1;
+   } else if (image->TexObject->NumLayers > 0) {
+      irb->layer_count = image->TexObject->NumLayers;
+   } else {
+      irb->layer_count = mt->level[level].depth / layer_multiplier;
    }
 
    intel_miptree_reference(&irb->mt, mt);

From d5f1253b0c4637ad996fd0da45095165006d61d3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 30 Jun 2015 02:46:26 -0400
Subject: [PATCH 0091/1208] nv50/ir: fix emission of address reg in 3rd source

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91056
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 1bfc8e32e84..6de8f45047a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -499,10 +499,14 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i)
    setSrc(i, 2, 2);
 
    if (i->getIndirect(0, 0)) {
-      assert(!i->getIndirect(1, 0));
+      assert(!i->srcExists(1) || !i->getIndirect(1, 0));
+      assert(!i->srcExists(2) || !i->getIndirect(2, 0));
       setAReg16(i, 0);
-   } else {
+   } else if (i->srcExists(1) && i->getIndirect(1, 0)) {
+      assert(!i->srcExists(2) || !i->getIndirect(2, 0));
       setAReg16(i, 1);
+   } else {
+      setAReg16(i, 2);
    }
 }
 

From edb8383c98ee23385731d0fc23a6b6673528a8ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 9 Jun 2015 13:28:44 +0300
Subject: [PATCH 0092/1208] glsl: Allow dynamic sampler array indexing with
 GLSL ES < 3.00
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dynamic indexing of sampler arrays is prohibited by GLSL ES 3.00.
Earlier versions allow 'constant-index-expression' indexing, where
index can contain a loop induction variable.

Patch allows dynamic indexing for sampler arrays when GLSL ES < 3.00.
This change makes 'sampler-array-index.frag' parser test in Piglit
pass + fishgl.com works when running Chrome on OpenGL ES 2.0 backend

v2: small change and some more commit message (Tapani)
v3: refactor checks to make it more readable (Ian Romanick)
v4: change warning comment in GLSL ES case (Curro)

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Signed-off-by: Kalyan Kondapally <kalyan.kondapally@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=84225
---
 src/glsl/ast_array_index.cpp | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 752d86f72fd..2c79002ebcb 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -226,24 +226,24 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
        * dynamically uniform expression is undefined.
        */
       if (array->type->without_array()->is_sampler()) {
-	 if (!state->is_version(130, 100)) {
-	    if (state->es_shader) {
-	       _mesa_glsl_warning(&loc, state,
-				  "sampler arrays indexed with non-constant "
-				  "expressions is optional in %s",
-				  state->get_version_string());
-	    } else {
-	       _mesa_glsl_warning(&loc, state,
-				  "sampler arrays indexed with non-constant "
-				  "expressions will be forbidden in GLSL 1.30 "
-				  "and later");
-	    }
-	 } else if (!state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
-	    _mesa_glsl_error(&loc, state,
-			     "sampler arrays indexed with non-constant "
-			     "expressions is forbidden in GLSL 1.30 and "
-			     "later");
-	 }
+         if (!state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
+            if (state->is_version(130, 300))
+               _mesa_glsl_error(&loc, state,
+                                "sampler arrays indexed with non-constant "
+                                "expressions are forbidden in GLSL %s "
+                                "and later",
+                                state->es_shader ? "ES 3.00" : "1.30");
+            else if (state->es_shader)
+               _mesa_glsl_warning(&loc, state,
+                                  "sampler arrays indexed with non-constant "
+                                  "expressions will be forbidden in GLSL "
+                                  "3.00 and later");
+            else
+               _mesa_glsl_warning(&loc, state,
+                                  "sampler arrays indexed with non-constant "
+                                  "expressions will be forbidden in GLSL "
+                                  "1.30 and later");
+         }
       }
    }
 

From e4512e1581cf90f56d13cfa6a809832ef3517283 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 9 Jun 2015 13:33:39 +0300
Subject: [PATCH 0093/1208] mesa/glsl: new compiler option
 EmitNoIndirectSampler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch provides new compiler option for backend to force unroll loops
that have non-constant expression indexing on sampler arrays.

This makes sure that we can never end up with a shader that uses loop
induction variable as sampler array index but does not unroll because
of having too much instructions. This would not work without dynamic
indexing support.

v2: change option name as EmitNoIndirectSampler

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/glsl/loop_unroll.cpp | 12 ++++++++++++
 src/mesa/main/mtypes.h   |  1 +
 2 files changed, 13 insertions(+)

diff --git a/src/glsl/loop_unroll.cpp b/src/glsl/loop_unroll.cpp
index 635e1dd99cd..7a00fb8fea8 100644
--- a/src/glsl/loop_unroll.cpp
+++ b/src/glsl/loop_unroll.cpp
@@ -100,6 +100,18 @@ public:
 
    virtual ir_visitor_status visit_enter(ir_dereference_array *ir)
    {
+      /* Force unroll in case of dynamic indexing with sampler arrays
+       * when EmitNoIndirectSampler is set.
+       */
+      if (options->EmitNoIndirectSampler) {
+         if ((ir->array->type->is_array() &&
+              ir->array->type->contains_sampler()) &&
+             !ir->array_index->constant_expression_value()) {
+            unsupported_variable_indexing = true;
+            return visit_continue;
+         }
+      }
+
       /* Check for arrays variably-indexed by a loop induction variable.
        * Unrolling the loop may convert that access into constant-indexing.
        *
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 983b9dc307b..7b55677de30 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2881,6 +2881,7 @@ struct gl_shader_compiler_options
    GLboolean EmitNoIndirectOutput;  /**< No indirect addressing of outputs */
    GLboolean EmitNoIndirectTemp;    /**< No indirect addressing of temps */
    GLboolean EmitNoIndirectUniform; /**< No indirect addressing of constants */
+   GLboolean EmitNoIndirectSampler; /**< No indirect addressing of samplers */
    /*@}*/
 
    GLuint MaxIfDepth;               /**< Maximum nested IF blocks */

From 8852e26e93af1fc4b72bf9d57e847f53e1a1371b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Wed, 24 Jun 2015 13:22:43 +0300
Subject: [PATCH 0094/1208] i965: use EmitNoIndirectSampler for gen < 7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 32c40131434..3e3d78b9ad7 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -113,6 +113,10 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 	 (i == MESA_SHADER_FRAGMENT);
       compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
       compiler->glsl_compiler_options[i].LowerClipDistance = true;
+
+      /* !ARB_gpu_shader5 */
+      if (devinfo->gen < 7)
+         compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
    }
 
    compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;

From 2dc2b12ed15abb84c7e2b3c2726dcc1b735abcda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 29 Jun 2015 09:53:45 +0300
Subject: [PATCH 0095/1208] i915: use EmitNoIndirectSampler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/drivers/dri/i915/i915_context.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 42ea54e087d..57b033c07ea 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -255,6 +255,8 @@ i915CreateContext(int api,
     * FINISHME: vertex shaders?
     */
    ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitCondCodes = true;
+   ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectSampler =
+      true;
 
    struct gl_shader_compiler_options *const fs_options =
       & ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT];
@@ -266,6 +268,7 @@ i915CreateContext(int api,
    fs_options->EmitNoIndirectOutput = true;
    fs_options->EmitNoIndirectUniform = true;
    fs_options->EmitNoIndirectTemp = true;
+   fs_options->EmitNoIndirectSampler = true;
 
    ctx->Const.MaxDrawBuffers = 1;
    ctx->Const.QueryCounterBits.SamplesPassed = 0;

From f17c8c287f3581fccb52714fbd4b2ea09a58e3d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 29 Jun 2015 09:48:52 +0300
Subject: [PATCH 0096/1208] mesa/st: use EmitNoIndirectSampler if
 !ARB_gpu_shader5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/state_tracker/st_context.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index ed9ed0f1b6c..62a0fbee3bb 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -287,6 +287,11 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* For vertex shaders, make sure not to emit saturate when SM 3.0 is not supported */
    ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoSat = !st->has_shader_model3;
 
+   if (!ctx->Extensions.ARB_gpu_shader5) {
+      for (i = 0; i < MESA_SHADER_STAGES; i++)
+         ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true;
+   }
+
    _mesa_compute_version(ctx);
 
    if (ctx->Version == 0) {

From 9350ea6979c48772e1fb55d4f1c7c5a3cfa987b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 19 May 2015 15:01:49 +0300
Subject: [PATCH 0097/1208] glsl: validate sampler array indexing for
 'constant-index-expression'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Desktop GLSL < 130 and GLSL ES < 300 allow sampler array indexing where
index can contain a loop induction variable. This extra check will warn
during linking if some of the indexes could not be turned in to constant
expressions.

v2: warning instead of error for backends that did not enable
    EmitNoIndirectSampler option (have dynamic indexing)

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "10.5" and "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/glsl/linker.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 4a726d4e2e7..74c2f2d4c92 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -346,6 +346,39 @@ private:
    bool uses_non_zero_stream;
 };
 
+/* Class that finds array derefs and check if indexes are dynamic. */
+class dynamic_sampler_array_indexing_visitor : public ir_hierarchical_visitor
+{
+public:
+   dynamic_sampler_array_indexing_visitor() :
+      dynamic_sampler_array_indexing(false)
+   {
+   }
+
+   ir_visitor_status visit_enter(ir_dereference_array *ir)
+   {
+      if (!ir->variable_referenced())
+         return visit_continue;
+
+      if (!ir->variable_referenced()->type->contains_sampler())
+         return visit_continue;
+
+      if (!ir->array_index->constant_expression_value()) {
+         dynamic_sampler_array_indexing = true;
+         return visit_stop;
+      }
+      return visit_continue;
+   }
+
+   bool uses_dynamic_sampler_array_indexing()
+   {
+      return dynamic_sampler_array_indexing;
+   }
+
+private:
+   bool dynamic_sampler_array_indexing;
+};
+
 } /* anonymous namespace */
 
 void
@@ -2743,6 +2776,40 @@ build_program_resource_list(struct gl_context *ctx,
     */
 }
 
+/**
+ * This check is done to make sure we allow only constant expression
+ * indexing and "constant-index-expression" (indexing with an expression
+ * that includes loop induction variable).
+ */
+static bool
+validate_sampler_array_indexing(struct gl_context *ctx,
+                                struct gl_shader_program *prog)
+{
+   dynamic_sampler_array_indexing_visitor v;
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (prog->_LinkedShaders[i] == NULL)
+	 continue;
+
+      bool no_dynamic_indexing =
+         ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler;
+
+      /* Search for array derefs in shader. */
+      v.run(prog->_LinkedShaders[i]->ir);
+      if (v.uses_dynamic_sampler_array_indexing()) {
+         const char *msg = "sampler arrays indexed with non-constant "
+                           "expressions is forbidden in GLSL %s %u";
+         /* Backend has indicated that it has no dynamic indexing support. */
+         if (no_dynamic_indexing) {
+            linker_error(prog, msg, prog->IsES ? "ES" : "", prog->Version);
+            return false;
+         } else {
+            linker_warning(prog, msg, prog->IsES ? "ES" : "", prog->Version);
+         }
+      }
+   }
+   return true;
+}
+
 
 void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
@@ -2961,6 +3028,16 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       lower_const_arrays_to_uniforms(prog->_LinkedShaders[i]->ir);
    }
 
+   /* Validation for special cases where we allow sampler array indexing
+    * with loop induction variable. This check emits a warning or error
+    * depending if backend can handle dynamic indexing.
+    */
+   if ((!prog->IsES && prog->Version < 130) ||
+       (prog->IsES && prog->Version < 300)) {
+      if (!validate_sampler_array_indexing(ctx, prog))
+         goto done;
+   }
+
    /* Check and validate stream emissions in geometry shaders */
    validate_geometry_shader_emissions(ctx, prog);
 

From 01b5f1336330f1c0f937fb08a444efc593b43435 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 24 Jun 2015 18:57:22 -0400
Subject: [PATCH 0098/1208] freedreno/ir3: fix constlen in case of
 load_uniform_indirect

We can't rely on what we get from the assembler if we have indirect
addressing of constant file, since the assembler doesn't know the array
index.  This got lost in the transition to NIR.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 48b1d8f3606..53b8a6fb101 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1372,6 +1372,11 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			dst[i] = create_uniform_indirect(ctx, n,
 					get_addr(ctx, src[0]));
 		}
+		/* NOTE: if relative addressing is used, we set constlen in
+		 * the compiler (to worst-case value) since we don't know in
+		 * the assembler what the max addr reg value can be:
+		 */
+		ctx->so->constlen = ctx->s->num_uniforms;
 		break;
 	case nir_intrinsic_load_ubo:
 	case nir_intrinsic_load_ubo_indirect:

From bb2c4b68f78f0105088c11408f8902fb22802125 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 10:52:34 -0400
Subject: [PATCH 0099/1208] freedreno/ir3: fixes for indirect writes

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c              | 1 -
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 9 ++++++++-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c           | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index a166b67d7cf..6f6dad59793 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -669,7 +669,6 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
 	return ir3_instr_create2(block, category, opc, 4);
 }
 
-/* only used by old compiler: */
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 {
 	struct ir3_instruction *new_instr = instr_create(instr->block,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 53b8a6fb101..3b36114a5ba 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1307,7 +1307,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			 * store_output_indirect? or move this into
 			 * create_indirect_store()?
 			 */
-			for (int j = i; j < arr->length; j += 4) {
+			for (int j = i; j < arr->length; j += intr->num_components) {
 				struct ir3_instruction *split;
 
 				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
@@ -1318,6 +1318,13 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 				arr->arr[j] = split;
 			}
 		}
+		/* fixup fanout/split neighbors: */
+		for (int i = 0; i < arr->length; i++) {
+			arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
+					arr->arr[i+1] : NULL;
+			arr->arr[i]->cp.left = (i > 0) ?
+					arr->arr[i-1] : NULL;
+		}
 		break;
 	}
 	default:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index e5aba859fab..0436e01ab2c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -291,8 +291,6 @@ is_temp(struct ir3_register *reg)
 {
 	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 		return false;
-	if (reg->flags & IR3_REG_RELATIV) // TODO
-		return false;
 	if ((reg->num == regid(REG_A0, 0)) ||
 			(reg->num == regid(REG_P0, 0)))
 		return false;
@@ -312,6 +310,10 @@ static struct ir3_instruction *
 get_definer(struct ir3_instruction *instr, int *sz, int *off)
 {
 	struct ir3_instruction *d = NULL;
+
+	if (instr->fanin)
+		return get_definer(instr->fanin, sz, off);
+
 	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
 		/* What about the case where collect is subset of array, we
 		 * need to find the distance between where actual array starts

From 1370fde8af1b0b5c5e6204c0dea6ebffb85dce0a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 14:32:08 -0400
Subject: [PATCH 0100/1208] freedreno/ir3: fix crash in RA

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 0436e01ab2c..ee610c7d01e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -403,6 +403,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		 * the phi, so we don't need to chase definers
 		 */
 		struct ir3_register *src;
+		struct ir3_instruction *dd = d;
 
 		/* note: don't use foreach_ssa_src as this gets called once
 		 * while assigning regs (which clears SSA flag)
@@ -410,9 +411,11 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		foreach_src(src, d) {
 			if (!src->instr)
 				continue;
-			if (src->instr->ip < d->ip)
-				d = src->instr;
+			if (src->instr->ip < dd->ip)
+				dd = src->instr;
 		}
+
+		d = dd;
 	}
 
 	if (is_meta(d) && (d->opc == OPC_META_FO)) {

From 0a8c8fa770db4cc4ef3db89a5dae1d136361495d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 13:38:03 -0400
Subject: [PATCH 0101/1208] freedreno/ir3: fix crash in fail path

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c  | 3 +++
 src/gallium/drivers/freedreno/a4xx/fd4_draw.c  | 3 +++
 src/gallium/drivers/freedreno/ir3/ir3_shader.c | 9 ++++++---
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b5838b58eb2..29f4bae93fa 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -60,6 +60,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 
+	if (!(fd3_emit_get_vp(emit) && fd3_emit_get_fp(emit)))
+		return;
+
 	fd3_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index de5a306af60..d070f5fd6b7 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	const struct pipe_draw_info *info = emit->info;
 
+	if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit)))
+		return;
+
 	fd4_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index b5b038100cc..c22b7f8d169 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -46,7 +46,8 @@ delete_variant(struct ir3_shader_variant *v)
 {
 	if (v->ir)
 		ir3_destroy(v->ir);
-	fd_bo_del(v->bo);
+	if (v->bo)
+		fd_bo_del(v->bo);
 	free(v);
 }
 
@@ -228,8 +229,10 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 
 	/* compile new variant if it doesn't exist already: */
 	v = create_variant(shader, key);
-	v->next = shader->variants;
-	shader->variants = v;
+	if (v) {
+		v->next = shader->variants;
+		shader->variants = v;
+	}
 
 	return v;
 }

From 3244195f48affec1d3c2eb5d0e267c75b046db9f Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 13:55:49 -0400
Subject: [PATCH 0102/1208] freedreno/a4xx: fix for sparse-samplers

Some piglit tests, like arb_fragment_program-sparse-samplers, result in
having a null samp#0 but valid samp#1.

TODO: a3xx probably needs similar fix

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 4b6eb646aa7..6cd2cd730f5 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -223,15 +223,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
 					fd4_pipe_sampler_view(tex->textures[i]) :
 					&dummy_view;
-			struct fd_resource *rsc = fd_resource(view->base.texture);
 			unsigned start = view->base.u.tex.first_level;
-			uint32_t offset = fd_resource_offset(rsc, start, 0);
 
 			OUT_RING(ring, view->texconst0);
 			OUT_RING(ring, view->texconst1);
 			OUT_RING(ring, view->texconst2);
 			OUT_RING(ring, view->texconst3);
-			OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+			if (view->base.texture) {
+				struct fd_resource *rsc = fd_resource(view->base.texture);
+				uint32_t offset = fd_resource_offset(rsc, start, 0);
+				OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+			} else {
+				OUT_RING(ring, 0x00000000);
+			}
 			OUT_RING(ring, 0x00000000);
 			OUT_RING(ring, 0x00000000);
 			OUT_RING(ring, 0x00000000);

From db5105b4b35e064f3934154b45de15422a1bdb0a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 29 Jun 2015 10:21:08 -0400
Subject: [PATCH 0103/1208] freedreno/ir3: add ir3_shader_disasm()

Split out most of dump_info() from ir3_cmdline compiler into a function
that can be used both by cmdline compiler and also for the disasm debug
option.  This way, for FD_MESA_DEBUG=disasm we also get to see intput/
output registers, etc.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/ir3_cmdline.c       | 116 +---------------
 .../drivers/freedreno/ir3/ir3_shader.c        | 127 +++++++++++++++++-
 .../drivers/freedreno/ir3/ir3_shader.h        |   1 +
 3 files changed, 124 insertions(+), 120 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ad9d2719d59..2b89fb442b4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -43,127 +43,15 @@
 #include "instr-a3xx.h"
 #include "ir3.h"
 
-static void dump_reg(const char *name, uint32_t r)
-{
-	if (r != regid(63,0))
-		debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
-}
-
-static void dump_semantic(struct ir3_shader_variant *so,
-		unsigned sem, const char *name)
-{
-	uint32_t regid;
-	regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0));
-	dump_reg(name, regid);
-}
-
 static void dump_info(struct ir3_shader_variant *so, const char *str)
 {
 	uint32_t *bin;
 	const char *type = (so->type == SHADER_VERTEX) ? "VERT" : "FRAG";
-
-	// for debug, dump some before/after info:
 	// TODO make gpu_id configurable on cmdline
 	bin = ir3_shader_assemble(so, 320);
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		struct ir3 *ir = so->ir;
-		struct ir3_register *reg;
-		uint8_t regid;
-		unsigned i;
-
-		debug_printf("; %s: %s\n", type, str);
-
-		for (i = 0; i < ir->ninputs; i++) {
-			if (!ir->inputs[i]) {
-				debug_printf("; in%d unused\n", i);
-				continue;
-			}
-			reg = ir->inputs[i]->regs[0];
-			regid = reg->num;
-			debug_printf("@in(%sr%d.%c)\tin%d\n",
-					(reg->flags & IR3_REG_HALF) ? "h" : "",
-					(regid >> 2), "xyzw"[regid & 0x3], i);
-		}
-
-		for (i = 0; i < ir->noutputs; i++) {
-			if (!ir->outputs[i]) {
-				debug_printf("; out%d unused\n", i);
-				continue;
-			}
-			/* kill shows up as a virtual output.. skip it! */
-			if (is_kill(ir->outputs[i]))
-				continue;
-			reg = ir->outputs[i]->regs[0];
-			regid = reg->num;
-			debug_printf("@out(%sr%d.%c)\tout%d\n",
-					(reg->flags & IR3_REG_HALF) ? "h" : "",
-					(regid >> 2), "xyzw"[regid & 0x3], i);
-		}
-
-		for (i = 0; i < so->immediates_count; i++) {
-			debug_printf("@const(c%d.x)\t", so->first_immediate + i);
-			debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
-					so->immediates[i].val[0],
-					so->immediates[i].val[1],
-					so->immediates[i].val[2],
-					so->immediates[i].val[3]);
-		}
-
-		disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
-
-		debug_printf("; %s: outputs:", type);
-		for (i = 0; i < so->outputs_count; i++) {
-			uint8_t regid = so->outputs[i].regid;
-			ir3_semantic sem = so->outputs[i].semantic;
-			debug_printf(" r%d.%c (%u:%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					sem2name(sem), sem2idx(sem));
-		}
-		debug_printf("\n");
-		debug_printf("; %s: inputs:", type);
-		for (i = 0; i < so->inputs_count; i++) {
-			uint8_t regid = so->inputs[i].regid;
-			ir3_semantic sem = so->inputs[i].semantic;
-			debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					sem2name(sem), sem2idx(sem),
-					so->inputs[i].compmask,
-					so->inputs[i].inloc,
-					so->inputs[i].bary);
-		}
-		debug_printf("\n");
-	}
-
-	/* print generic shader info: */
-	debug_printf("; %s: %u instructions, %d half, %d full\n", type,
-			so->info.instrs_count,
-			so->info.max_half_reg + 1,
-			so->info.max_reg + 1);
-
-	/* print shader type specific info: */
-	switch (so->type) {
-	case SHADER_VERTEX:
-		dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos");
-		dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize");
-		break;
-	case SHADER_FRAGMENT:
-		dump_reg("pos (bary)", so->pos_regid);
-		dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz");
-		dump_semantic(so, TGSI_SEMANTIC_COLOR, "color");
-		/* these two are hard-coded since we don't know how to
-		 * program them to anything but all 0's...
-		 */
-		if (so->frag_coord)
-			debug_printf("; fragcoord: r0.x\n");
-		if (so->frag_face)
-			debug_printf("; fragface: hr0.x\n");
-		break;
-	case SHADER_COMPUTE:
-		break;
-	}
+	debug_printf("; %s: %s\n", type, str);
+	ir3_shader_disasm(so, bin);
 	free(bin);
-
-	debug_printf("\n");
 }
 
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index c22b7f8d169..bfcc2ca8a53 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -140,6 +140,13 @@ assemble_variant(struct ir3_shader_variant *v)
 
 	memcpy(fd_bo_map(v->bo), bin, sz);
 
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		struct ir3_shader_key key = v->key;
+		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
+		ir3_shader_disasm(v, bin);
+	}
+
 	free(bin);
 
 	/* no need to keep the ir around beyond this point: */
@@ -179,12 +186,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 		goto fail;
 	}
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
-			key.binning_pass, key.color_two_side, key.half_precision);
-		disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
-	}
-
 	return v;
 
 fail:
@@ -262,3 +263,117 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
 	shader->tokens = tgsi_dup_tokens(tokens);
 	return shader;
 }
+
+static void dump_reg(const char *name, uint32_t r)
+{
+	if (r != regid(63,0))
+		debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
+}
+
+static void dump_semantic(struct ir3_shader_variant *so,
+		unsigned sem, const char *name)
+{
+	uint32_t regid;
+	regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0));
+	dump_reg(name, regid);
+}
+
+void
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
+{
+	struct ir3 *ir = so->ir;
+	struct ir3_register *reg;
+	const char *type = (so->type == SHADER_VERTEX) ? "VERT" : "FRAG";
+	uint8_t regid;
+	unsigned i;
+
+	for (i = 0; i < ir->ninputs; i++) {
+		if (!ir->inputs[i]) {
+			debug_printf("; in%d unused\n", i);
+			continue;
+		}
+		reg = ir->inputs[i]->regs[0];
+		regid = reg->num;
+		debug_printf("@in(%sr%d.%c)\tin%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+
+	for (i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i]) {
+			debug_printf("; out%d unused\n", i);
+			continue;
+		}
+		/* kill shows up as a virtual output.. skip it! */
+		if (is_kill(ir->outputs[i]))
+			continue;
+		reg = ir->outputs[i]->regs[0];
+		regid = reg->num;
+		debug_printf("@out(%sr%d.%c)\tout%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+
+	for (i = 0; i < so->immediates_count; i++) {
+		debug_printf("@const(c%d.x)\t", so->first_immediate + i);
+		debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+				so->immediates[i].val[0],
+				so->immediates[i].val[1],
+				so->immediates[i].val[2],
+				so->immediates[i].val[3]);
+	}
+
+	disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
+
+	debug_printf("; %s: outputs:", type);
+	for (i = 0; i < so->outputs_count; i++) {
+		uint8_t regid = so->outputs[i].regid;
+		ir3_semantic sem = so->outputs[i].semantic;
+		debug_printf(" r%d.%c (%u:%u)",
+				(regid >> 2), "xyzw"[regid & 0x3],
+				sem2name(sem), sem2idx(sem));
+	}
+	debug_printf("\n");
+	debug_printf("; %s: inputs:", type);
+	for (i = 0; i < so->inputs_count; i++) {
+		uint8_t regid = so->inputs[i].regid;
+		ir3_semantic sem = so->inputs[i].semantic;
+		debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)",
+				(regid >> 2), "xyzw"[regid & 0x3],
+				sem2name(sem), sem2idx(sem),
+				so->inputs[i].compmask,
+				so->inputs[i].inloc,
+				so->inputs[i].bary);
+	}
+	debug_printf("\n");
+
+	/* print generic shader info: */
+	debug_printf("; %s: %u instructions, %d half, %d full\n", type,
+			so->info.instrs_count,
+			so->info.max_half_reg + 1,
+			so->info.max_reg + 1);
+
+	/* print shader type specific info: */
+	switch (so->type) {
+	case SHADER_VERTEX:
+		dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos");
+		dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize");
+		break;
+	case SHADER_FRAGMENT:
+		dump_reg("pos (bary)", so->pos_regid);
+		dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz");
+		dump_semantic(so, TGSI_SEMANTIC_COLOR, "color");
+		/* these two are hard-coded since we don't know how to
+		 * program them to anything but all 0's...
+		 */
+		if (so->frag_coord)
+			debug_printf("; fragcoord: r0.x\n");
+		if (so->frag_face)
+			debug_printf("; fragface: hr0.x\n");
+		break;
+	case SHADER_COMPUTE:
+		break;
+	}
+
+	debug_printf("\n");
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 9f1b0769180..3cf3167e1cc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -212,6 +212,7 @@ struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
 
 /*
  * Helper/util:

From 906da495272b1be4c278f5f7402594e3c52521c1 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 28 Jun 2015 11:13:58 -0400
Subject: [PATCH 0104/1208] freedreno/ir3: fix RA issue with fanin

The fanin source could be grouped, for example with shaders like:

    VERT
    DCL IN[0]
    DCL IN[1]
    DCL OUT[0], POSITION
    DCL OUT[1], GENERIC[9]
    DCL SAMP[0]
    DCL SVIEW[0], 2D, FLOAT
    DCL TEMP[0], LOCAL
      0: MOV TEMP[0].xy, IN[1].xyyy
      1: MOV TEMP[0].w, IN[1].wwww
      2: TXF TEMP[0], TEMP[0], SAMP[0], 2D
      3: MOV OUT[1], TEMP[0]
      4: MOV OUT[0], IN[0]
      5: END

The second arg to the isaml is IN[1].w, so we need to look at the fanin
source to get the correct offset.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index ee610c7d01e..9f6ff12a119 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -320,19 +320,25 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		 * and fanin..  that probably doesn't happen currently.
 		 */
 		struct ir3_register *src;
+		int dsz, doff;
 
 		/* note: don't use foreach_ssa_src as this gets called once
 		 * while assigning regs (which clears SSA flag)
 		 */
-		foreach_src(src, instr) {
+		foreach_src_n(src, n, instr) {
+			struct ir3_instruction *dd;
 			if (!src->instr)
 				continue;
-			if ((!d) || (src->instr->ip < d->ip))
-				d = src->instr;
+
+			dd = get_definer(src->instr, &dsz, &doff);
+
+			if ((!d) || (dd->ip < d->ip)) {
+				d = dd;
+				*sz = dsz;
+				*off = doff - n;
+			}
 		}
 
-		*sz = instr->regs_count - 1;
-		*off = 0;
 
 	} else if (instr->cp.right || instr->cp.left) {
 		/* covers also the meta:fo case, which ends up w/ single
@@ -447,6 +453,10 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		struct ir3_instruction *defn;
 		int cls, sz, off;
 
+#ifdef DEBUG
+		instr->name = ~0;
+#endif
+
 		ctx->instr_cnt++;
 
 		if (instr->regs_count == 0)

From 00b6b41482985ba4a81fbb479a47c06ec83f3797 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 29 Jun 2015 14:49:08 -0400
Subject: [PATCH 0105/1208] freedreno/ir3: cache defining instruction

It is silly to traverse back to find first instruction that writes part
of a larger "virtual" register many times per instruction (plus per use
as a src to later instructions).  Cache this information so we only
figure it out once.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c    |   7 +-
 src/gallium/drivers/freedreno/ir3/ir3.h    |   2 +-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 151 ++++++++++++---------
 3 files changed, 91 insertions(+), 69 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 6f6dad59793..1da6cf0477e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -722,15 +722,16 @@ ir3_clear_mark(struct ir3 *ir)
 }
 
 /* note: this will destroy instr->depth, don't do it until after sched! */
-void
+unsigned
 ir3_count_instructions(struct ir3 *ir)
 {
-	unsigned ip = 0;
+	unsigned cnt = 0;
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			instr->ip = ip++;
+			instr->ip = cnt++;
 		}
 		block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
 		block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
 	}
+	return cnt;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 9c35a763d58..bc0144568a5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -431,7 +431,7 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 void ir3_block_clear_mark(struct ir3_block *block);
 void ir3_clear_mark(struct ir3 *shader);
 
-void ir3_count_instructions(struct ir3 *ir);
+unsigned ir3_count_instructions(struct ir3 *ir);
 
 static inline int ir3_instr_regno(struct ir3_instruction *instr,
 		struct ir3_register *reg)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 9f6ff12a119..de48ecfe280 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -241,6 +241,21 @@ ir3_ra_alloc_reg_set(void *memctx)
 	return set;
 }
 
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+	BITSET_WORD *def;        /* variables defined before used in block */
+	BITSET_WORD *use;        /* variables used before defined in block */
+	BITSET_WORD *livein;     /* which defs reach entry point of block */
+	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction) */
+struct ir3_ra_instr_data {
+	/* cached instruction 'definer' info: */
+	struct ir3_instruction *defn;
+	int off, sz, cls;
+};
+
 /* register-assign context, per-shader */
 struct ir3_ra_ctx {
 	struct ir3 *ir;
@@ -254,14 +269,7 @@ struct ir3_ra_ctx {
 	unsigned class_base[total_class_count];
 	unsigned instr_cnt;
 	unsigned *def, *use;     /* def/use table */
-};
-
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
-	BITSET_WORD *def;        /* variables defined before used in block */
-	BITSET_WORD *use;        /* variables used before defined in block */
-	BITSET_WORD *livein;     /* which defs reach entry point of block */
-	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+	struct ir3_ra_instr_data *instrd;
 };
 
 static bool
@@ -307,12 +315,20 @@ writes_gpr(struct ir3_instruction *instr)
 }
 
 static struct ir3_instruction *
-get_definer(struct ir3_instruction *instr, int *sz, int *off)
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+		int *sz, int *off)
 {
+	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 	struct ir3_instruction *d = NULL;
 
 	if (instr->fanin)
-		return get_definer(instr->fanin, sz, off);
+		return get_definer(ctx, instr->fanin, sz, off);
+
+	if (id->defn) {
+		*sz = id->sz;
+		*off = id->off;
+		return id->defn;
+	}
 
 	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
 		/* What about the case where collect is subset of array, we
@@ -330,7 +346,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 			if (!src->instr)
 				continue;
 
-			dd = get_definer(src->instr, &dsz, &doff);
+			dd = get_definer(ctx, src->instr, &dsz, &doff);
 
 			if ((!d) || (dd->ip < d->ip)) {
 				d = dd;
@@ -339,7 +355,6 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 			}
 		}
 
-
 	} else if (instr->cp.right || instr->cp.left) {
 		/* covers also the meta:fo case, which ends up w/ single
 		 * scalar instructions for each component:
@@ -394,7 +409,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
-		dd = get_definer(phi, &dsz, &doff);
+		dd = get_definer(ctx, phi, &dsz, &doff);
 
 		*sz = MAX2(*sz, dsz);
 		*off = doff;
@@ -428,7 +443,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
-		dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
 
 		/* by definition, should come before: */
 		debug_assert(dd->ip < d->ip);
@@ -440,9 +455,25 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		d = dd;
 	}
 
+	id->defn = d;
+	id->sz = *sz;
+	id->off = *off;
+
 	return d;
 }
 
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+		if (instr->regs_count == 0)
+			continue;
+		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+		id->cls = size_to_class(id->sz, is_half(id->defn));
+	}
+}
+
 /* give each instruction a name (and ip), and count up the # of names
  * of each class
  */
@@ -450,8 +481,7 @@ static void
 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_instruction *defn;
-		int cls, sz, off;
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
 #ifdef DEBUG
 		instr->name = ~0;
@@ -465,9 +495,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (!writes_gpr(instr))
 			continue;
 
-		defn = get_definer(instr, &sz, &off);
-
-		if (defn != instr)
+		if (id->defn != instr)
 			continue;
 
 		/* arrays which don't fit in one of the pre-defined class
@@ -475,9 +503,8 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		 *
 		 * TODO but we still need to allocate names for them, don't we??
 		 */
-		cls = size_to_class(sz, is_half(defn));
-		if (cls >= 0) {
-			instr->name = ctx->class_alloc_count[cls]++;
+		if (id->cls >= 0) {
+			instr->name = ctx->class_alloc_count[id->cls]++;
 			ctx->alloc_count++;
 		}
 	}
@@ -486,8 +513,16 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static void
 ra_init(struct ir3_ra_ctx *ctx)
 {
+	unsigned n;
+
 	ir3_clear_mark(ctx->ir);
-	ir3_count_instructions(ctx->ir);
+	n = ir3_count_instructions(ctx->ir);
+
+	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_find_definers(ctx, block);
+	}
 
 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 		ra_block_name_instructions(ctx, block);
@@ -503,6 +538,7 @@ ra_init(struct ir3_ra_ctx *ctx)
 	}
 
 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+	ralloc_steal(ctx->g, ctx->instrd);
 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 }
@@ -570,39 +606,36 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		 */
 
 		if (writes_gpr(instr)) {
-			struct ir3_instruction *defn;
-			int cls, sz, off;
+			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-			defn = get_definer(instr, &sz, &off);
-			if (defn == instr) {
+			if (id->defn == instr) {
 				/* arrays which don't fit in one of the pre-defined class
 				 * sizes are pre-colored:
 				 */
-				cls = size_to_class(sz, is_half(defn));
-				if (cls >= 0) {
-					unsigned name = ra_name(ctx, cls, defn);
+				if (id->cls >= 0) {
+					unsigned name = ra_name(ctx, id->cls, id->defn);
 
-					ctx->def[name] = defn->ip;
-					ctx->use[name] = defn->ip;
+					ctx->def[name] = id->defn->ip;
+					ctx->use[name] = id->defn->ip;
 
 					/* since we are in SSA at this point: */
 					debug_assert(!BITSET_TEST(bd->use, name));
 
 					BITSET_SET(bd->def, name);
 
-					if (is_half(defn)) {
+					if (is_half(id->defn)) {
 						ra_set_node_class(ctx->g, name,
-								ctx->set->half_classes[cls - class_count]);
+								ctx->set->half_classes[id->cls - class_count]);
 					} else {
 						ra_set_node_class(ctx->g, name,
-								ctx->set->classes[cls]);
+								ctx->set->classes[id->cls]);
 					}
 
 					/* extend the live range for phi srcs, which may come
 					 * from the bottom of the loop
 					 */
-					if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
-						struct ir3_instruction *phi = defn->regs[0]->instr;
+					if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+						struct ir3_instruction *phi = id->defn->regs[0]->instr;
 						foreach_ssa_src(src, phi) {
 							/* if src is after phi, then we need to extend
 							 * the liverange to the end of src's block:
@@ -621,13 +654,10 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		foreach_ssa_src(src, instr) {
 			if (writes_gpr(src)) {
-				struct ir3_instruction *srcdefn;
-				int cls, sz, off;
+				struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
 
-				srcdefn = get_definer(src, &sz, &off);
-				cls = size_to_class(sz, is_half(srcdefn));
-				if (cls >= 0) {
-					unsigned name = ra_name(ctx, cls, srcdefn);
+				if (id->cls >= 0) {
+					unsigned name = ra_name(ctx, id->cls, id->defn);
 					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
 					if (!BITSET_TEST(bd->def, name))
 						BITSET_SET(bd->use, name);
@@ -719,13 +749,10 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* need to fix things up to keep outputs live: */
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		struct ir3_instruction *instr = ir->outputs[i];
-		struct ir3_instruction *defn;
-		int cls, sz, off;
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-		defn = get_definer(instr, &sz, &off);
-		cls = size_to_class(sz, is_half(defn));
-		if (cls >= 0) {
-			unsigned name = ra_name(ctx, cls, defn);
+		if (id->cls >= 0) {
+			unsigned name = ra_name(ctx, id->cls, id->defn);
 			ctx->use[name] = ctx->instr_cnt;
 		}
 	}
@@ -795,15 +822,12 @@ static void
 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *defn;
-	int cls, sz, off;
+	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-	defn = get_definer(instr, &sz, &off);
-	cls = size_to_class(sz, is_half(defn));
-	if (cls >= 0) {
-		unsigned name = ra_name(ctx, cls, defn);
+	if (id->cls >= 0) {
+		unsigned name = ra_name(ctx, id->cls, id->defn);
 		unsigned r = ra_get_node_reg(ctx->g, name);
-		unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
+		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
 		if (reg->flags & IR3_REG_RELATIV)
 			num += reg->offset;
@@ -811,7 +835,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		reg->num = num;
 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 
-		if (is_half(defn))
+		if (is_half(id->defn))
 			reg->flags |= IR3_REG_HALF;
 	}
 }
@@ -866,19 +890,16 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 		for (j = 0; i < ir->ninputs; i++) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			if (instr) {
-				struct ir3_instruction *defn;
-				int cls, sz, off;
+				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-				defn = get_definer(instr, &sz, &off);
-				if (defn == instr) {
+				if (id->defn == instr) {
 					unsigned name, reg;
 
-					cls = size_to_class(sz, is_half(defn));
-					name = ra_name(ctx, cls, defn);
-					reg = ctx->set->gpr_to_ra_reg[cls][j];
+					name = ra_name(ctx, id->cls, id->defn);
+					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 
 					ra_set_node_reg(ctx->g, name, reg);
-					j += sz;
+					j += id->sz;
 				}
 			}
 		}

From dc7e6463d3ec6980f1517ff10048e0dbf5bb38ad Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 27 Jun 2015 10:07:18 -0400
Subject: [PATCH 0106/1208] nir: cleanup open-coded instruction casts

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/glsl/nir/nir_lower_alu_to_scalar.c | 2 +-
 src/glsl/nir/nir_lower_vec_to_movs.c   | 2 +-
 src/glsl/nir/nir_search.c              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 25bba4ef0b6..5d15fb2bc32 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -164,7 +164,7 @@ lower_alu_to_scalar_block(nir_block *block, void *data)
 {
    nir_foreach_instr_safe(block, instr) {
       if (instr->type == nir_instr_type_alu)
-         lower_alu_instr_scalar((nir_alu_instr *)instr, data);
+         lower_alu_instr_scalar(nir_instr_as_alu(instr), data);
    }
 
    return true;
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 602853ea665..e6d522f88ce 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -90,7 +90,7 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
       if (instr->type != nir_instr_type_alu)
          continue;
 
-      nir_alu_instr *vec = (nir_alu_instr *)instr;
+      nir_alu_instr *vec = nir_instr_as_alu(instr);
 
       switch (vec->op) {
       case nir_op_vec2:
diff --git a/src/glsl/nir/nir_search.c b/src/glsl/nir/nir_search.c
index 0c4e48ce965..c33d6c3eb84 100644
--- a/src/glsl/nir/nir_search.c
+++ b/src/glsl/nir/nir_search.c
@@ -48,7 +48,7 @@ src_is_bool(nir_src src)
       return false;
    if (src.ssa->parent_instr->type != nir_instr_type_alu)
       return false;
-   return alu_instr_is_bool((nir_alu_instr *)src.ssa->parent_instr);
+   return alu_instr_is_bool(nir_instr_as_alu(src.ssa->parent_instr));
 }
 
 static bool

From 6082515de7c7b4885bd685d88aee32fc9e5103a1 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 27 Jun 2015 09:58:28 -0400
Subject: [PATCH 0107/1208] gallium/ttn: partial fix for output arrays

It isn't quite yet practical to enable TGSI_ANY_INOUT_DECL_RANGE shader
cap yet, at least not in drivers that need lower_to_scalar pass (which
right now is all of the ttn users), since the register arrays do not get
converted to SSA, which angers nir_lower_alu_to_scalar.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 065bbf050c2..c5b65eeae0c 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1716,9 +1716,11 @@ ttn_add_output_stores(struct ttn_compile *c)
       for (i = 0; i < array_len; i++) {
          nir_intrinsic_instr *store =
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+         unsigned loc = var->data.driver_location + i;
          store->num_components = 4;
-         store->const_index[0] = var->data.driver_location + i;
-         store->src[0].reg.reg = c->output_regs[var->data.driver_location].reg;
+         store->const_index[0] = loc;
+         store->src[0].reg.reg = c->output_regs[loc].reg;
+         store->src[0].reg.base_offset = c->output_regs[loc].offset;
          nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
       }
    }

From d1f0e019797863b23388bfef53a77f659f749d3c Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 13:48:29 -0400
Subject: [PATCH 0108/1208] gallium/ttn: add TXB2

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index c5b65eeae0c..bd4cdcbbfcf 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1068,6 +1068,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       op = nir_texop_txb;
       num_srcs = 2;
       break;
+   case TGSI_OPCODE_TXB2:
+      op = nir_texop_txb;
+      num_srcs = 2;
+      samp = 2;
+      break;
    case TGSI_OPCODE_TXL:
       op = nir_texop_txl;
       num_srcs = 2;
@@ -1169,6 +1174,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       src_number++;
    }
 
+   if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXB2) {
+      instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X));
+      instr->src[src_number].src_type = nir_tex_src_bias;
+      src_number++;
+   }
+
    if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXL) {
       instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
       instr->src[src_number].src_type = nir_tex_src_lod;

From 879dcf07f6a3ab56f23d540b0df94c57e0706094 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 14:24:08 -0400
Subject: [PATCH 0109/1208] gallium/ttn: don't upset nir_validate w/ BRK's

Previously we were unconditionally doing ttn_get_src() even for
instructions with no src's.  Which created a lot of unnecessary
load_const instructions.  These were mostly harmless since NIR opt
passes would strip them back out.  But for an ENDIF following a
BRK, it would result in load_const instructions created after the
NIR break instruction.  Which nir_validate dislikes.

But we can actually just dtrt by using NumSrcRegs instead.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index bd4cdcbbfcf..e1647d53a40 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1491,7 +1491,7 @@ ttn_emit_instruction(struct ttn_compile *c)
       return;
 
    nir_ssa_def *src[TGSI_FULL_MAX_SRC_REGISTERS];
-   for (i = 0; i < TGSI_FULL_MAX_SRC_REGISTERS; i++) {
+   for (i = 0; i < tgsi_inst->Instruction.NumSrcRegs; i++) {
       src[i] = ttn_get_src(c, &tgsi_inst->Src[i]);
    }
    nir_alu_dest dest = ttn_get_dest(c, tgsi_dst);

From 6098ef824467f685fb34914eb3fef73b3ba18c6f Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 18 Jun 2015 20:16:46 +0100
Subject: [PATCH 0110/1208] egl/drm:  plug memory leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Free the memory for dri2_surf in the unlikely case that one provides
NULL for native_window. Also set the relevant EGL_ERROR to provide
feedback to the user.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/platform_drm.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index a62da4121fe..0d1f4c6e0a7 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -115,8 +115,11 @@ dri2_drm_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
 
    switch (type) {
    case EGL_WINDOW_BIT:
-      if (!window)
-         return NULL;
+      if (!window) {
+         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         goto cleanup_surf;
+      }
+
       surf = gbm_dri_surface(window);
       dri2_surf->gbm_surf = surf;
       dri2_surf->base.Width =  surf->base.width;

From 0afa6335079093627b47ff08da38bed00972c217 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 18 Jun 2015 20:19:32 +0100
Subject: [PATCH 0111/1208] egl/wayland: handle NULL native_window in
 create_surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Raise EGL_BAD_NATIVE_WINDOW instead of crashing.

v2: s/Rise/Raise/ (spotted by Michel)

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/platform_wayland.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 160fa8ce8d7..6f42d90ed96 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -149,6 +149,11 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    else
       dri2_surf->format = WL_DRM_FORMAT_ARGB8888;
 
+   if (!window) {
+      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+      goto cleanup_surf;
+   }
+
    dri2_surf->wl_win = window;
 
    dri2_surf->wl_win->private = dri2_surf;

From 4ea5223a95436b76a3f808732c565e9833f84551 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 18 Jun 2015 20:22:54 +0100
Subject: [PATCH 0112/1208] egl/wayland: cleanup dri2_wl_create_surface error
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/platform_wayland.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 6f42d90ed96..1e127607cdd 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -168,13 +168,11 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
 					    dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
-      goto cleanup_dri_drawable;
+      goto cleanup_surf;
    }
 
    return &dri2_surf->base;
 
- cleanup_dri_drawable:
-   dri2_dpy->core->destroyDrawable(dri2_surf->dri_drawable);
  cleanup_surf:
    free(dri2_surf);
 

From af2aea40d29dffd5e584432e0652db114113469b Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 18 Jun 2015 20:39:28 +0100
Subject: [PATCH 0113/1208] egl/x11: handle when invalid drawable is passed in
 create_surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

0 is not used as a valid drawable id, as such there is no point in
attempting to query its geometry. Just bail out early and provide the
more meaningful EGL_BAD_NATIVE_WINDOW to the user.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/platform_x11.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 0fbf4e40f2f..ad40bd57aa6 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -235,6 +235,10 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
                        dri2_surf->drawable, screen->root,
 			dri2_surf->base.Width, dri2_surf->base.Height);
    } else {
+      if (!drawable) {
+         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         goto cleanup_surf;
+      }
       dri2_surf->drawable = drawable;
    }
 

From 2b1a1d8b1294f91b7ac563da1f395deba4384765 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Wed, 24 Jun 2015 05:28:34 -0700
Subject: [PATCH 0114/1208] nir/from_ssa: add a flag to not convert everything
 from SSA

We already don't convert constants out of SSA, and in our backend we'd
like to have only one way of saying something is still in SSA.

The one tricky part about this is that we may now leave some undef
instructions around if they aren't part of a phi-web, so we have to be
more careful about deleting them.

v2: rename and flip meaning of flag (Jason)

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/gallium/drivers/vc4/vc4_program.c |  2 +-
 src/glsl/nir/nir.h                    |  7 ++++++-
 src/glsl/nir/nir_from_ssa.c           | 25 ++++++++++++++++++-------
 src/mesa/drivers/dri/i965/brw_nir.c   |  2 +-
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 2061631dc9e..7b39a03f01a 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2102,7 +2102,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
 
         nir_remove_dead_variables(c->s);
 
-        nir_convert_from_ssa(c->s);
+        nir_convert_from_ssa(c->s, false);
 
         if (vc4_debug & VC4_DEBUG_SHADERDB) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 697d37e95ac..8c99845f04d 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1676,7 +1676,12 @@ bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
 void nir_convert_to_ssa(nir_shader *shader);
-void nir_convert_from_ssa(nir_shader *shader);
+
+/* If convert_everything is true, convert all values (even those not involved
+ * in a phi node) to registers. If false, only convert SSA values involved in
+ * phi nodes to registers.
+ */
+void nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only);
 
 bool nir_opt_algebraic(nir_shader *shader);
 bool nir_opt_algebraic_late(nir_shader *shader);
diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c
index 67733e6da4f..e4a153e9584 100644
--- a/src/glsl/nir/nir_from_ssa.c
+++ b/src/glsl/nir/nir_from_ssa.c
@@ -37,6 +37,7 @@
 struct from_ssa_state {
    void *mem_ctx;
    void *dead_ctx;
+   bool phi_webs_only;
    struct hash_table *merge_node_table;
    nir_instr *instr;
    nir_function_impl *impl;
@@ -482,6 +483,9 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
 
       reg = node->set->reg;
    } else {
+      if (state->phi_webs_only)
+         return true;
+
       /* We leave load_const SSA values alone.  They act as immediates to
        * the backend.  If it got coalesced into a phi, that's ok.
        */
@@ -505,8 +509,15 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
    nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg), state->mem_ctx);
    assert(list_empty(&def->uses) && list_empty(&def->if_uses));
 
-   if (def->parent_instr->type == nir_instr_type_ssa_undef)
+   if (def->parent_instr->type == nir_instr_type_ssa_undef) {
+      /* If it's an ssa_undef instruction, remove it since we know we just got
+       * rid of all its uses.
+       */
+      nir_instr *parent_instr = def->parent_instr;
+      nir_instr_remove(parent_instr);
+      ralloc_steal(state->dead_ctx, parent_instr);
       return true;
+   }
 
    assert(def->parent_instr->type != nir_instr_type_load_const);
 
@@ -523,7 +534,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
 }
 
 /* Resolves ssa definitions to registers.  While we're at it, we also
- * remove phi nodes and ssa_undef instructions
+ * remove phi nodes.
  */
 static bool
 resolve_registers_block(nir_block *block, void *void_state)
@@ -534,8 +545,7 @@ resolve_registers_block(nir_block *block, void *void_state)
       state->instr = instr;
       nir_foreach_ssa_def(instr, rewrite_ssa_def, state);
 
-      if (instr->type == nir_instr_type_ssa_undef ||
-          instr->type == nir_instr_type_phi) {
+      if (instr->type == nir_instr_type_phi) {
          nir_instr_remove(instr);
          ralloc_steal(state->dead_ctx, instr);
       }
@@ -765,13 +775,14 @@ resolve_parallel_copies_block(nir_block *block, void *void_state)
 }
 
 static void
-nir_convert_from_ssa_impl(nir_function_impl *impl)
+nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
 {
    struct from_ssa_state state;
 
    state.mem_ctx = ralloc_parent(impl);
    state.dead_ctx = ralloc_context(NULL);
    state.impl = impl;
+   state.phi_webs_only = phi_webs_only;
    state.merge_node_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                     _mesa_key_pointer_equal);
 
@@ -801,10 +812,10 @@ nir_convert_from_ssa_impl(nir_function_impl *impl)
 }
 
 void
-nir_convert_from_ssa(nir_shader *shader)
+nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only)
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         nir_convert_from_ssa_impl(overload->impl);
+         nir_convert_from_ssa_impl(overload->impl, phi_webs_only);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index dffb8ab1ca7..d87e78312fd 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -156,7 +156,7 @@ brw_create_nir(struct brw_context *brw,
       nir_print_shader(nir, stderr);
    }
 
-   nir_convert_from_ssa(nir);
+   nir_convert_from_ssa(nir, false);
    nir_validate_shader(nir);
 
    /* This is the last pass we run before we start emitting stuff.  It

From 864907e2f14523c130e6ff24c081789bb079bae1 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Wed, 24 Jun 2015 12:28:47 -0700
Subject: [PATCH 0115/1208] i965/fs: use SSA values directly

Before, we would use registers, but set a magical "parent_instr" field
to indicate that it was actually purely an SSA value (i.e., it wasn't
involved in any phi nodes). Instead, just use SSA values directly, which
lets us get rid of the hack and reduces memory usage since we're not
allocating a nir_register for every value. It also makes our handling of
load_const more consistent compared to the other instructions.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h            |  3 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp      | 64 ++++++++++++-------
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp  |  1 +
 src/mesa/drivers/dri/i965/brw_nir.c           |  2 +-
 .../i965/brw_nir_analyze_boolean_resolves.c   | 12 ++--
 5 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index d08d438a40e..8170f2aa109 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -249,6 +249,8 @@ public:
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
    void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_undef(const brw::fs_builder &bld,
+                       nir_ssa_undef_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_texture(const brw::fs_builder &bld,
@@ -345,6 +347,7 @@ public:
    unsigned max_grf;
 
    fs_reg *nir_locals;
+   fs_reg *nir_ssa_values;
    fs_reg *nir_globals;
    fs_reg nir_inputs;
    fs_reg nir_outputs;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 59081eab877..166586ff52f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -366,6 +366,9 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
       nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
+   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
+                             impl->ssa_alloc);
+
    nir_emit_cf_list(&impl->body);
 }
 
@@ -464,6 +467,10 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
        */
       break;
 
+   case nir_instr_type_ssa_undef:
+      nir_emit_undef(abld, nir_instr_as_ssa_undef(instr));
+      break;
+
    case nir_instr_type_jump:
       nir_emit_jump(abld, nir_instr_as_jump(instr));
       break;
@@ -495,17 +502,12 @@ bool
 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                          const fs_reg &result)
 {
-   if (instr->src[0].src.is_ssa ||
-       !instr->src[0].src.reg.reg ||
-       !instr->src[0].src.reg.reg->parent_instr)
-      return false;
-
-   if (instr->src[0].src.reg.reg->parent_instr->type !=
-       nir_instr_type_intrinsic)
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
       return false;
 
    nir_intrinsic_instr *src0 =
-      nir_instr_as_intrinsic(instr->src[0].src.reg.reg->parent_instr);
+      nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 
    if (src0->intrinsic != nir_intrinsic_load_front_face)
       return false;
@@ -1146,6 +1148,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    }
 }
 
+void
+fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
+{
+   nir_ssa_values[instr->def.index] = bld.vgrf(BRW_REGISTER_TYPE_D,
+                                               instr->def.num_components);
+}
+
 static fs_reg
 fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
                    unsigned base_offset, nir_src *indirect)
@@ -1171,30 +1180,39 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
 fs_reg
 fs_visitor::get_nir_src(nir_src src)
 {
+   fs_reg reg;
    if (src.is_ssa) {
-      assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
-      nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
-      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
+      if (src.ssa->parent_instr->type == nir_instr_type_load_const) {
+         nir_load_const_instr *load =
+            nir_instr_as_load_const(src.ssa->parent_instr);
+         reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
 
-      for (unsigned i = 0; i < src.ssa->num_components; ++i)
-         bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
-
-      return reg;
+         for (unsigned i = 0; i < src.ssa->num_components; ++i)
+            bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
+      } else {
+         reg = nir_ssa_values[src.ssa->index];
+      }
    } else {
-      fs_reg reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
-                                      src.reg.indirect);
-
-      /* to avoid floating-point denorm flushing problems, set the type by
-       * default to D - instructions that need floating point semantics will set
-       * this to F if they need to
-       */
-      return retype(reg, BRW_REGISTER_TYPE_D);
+      reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                               src.reg.indirect);
    }
+
+   /* to avoid floating-point denorm flushing problems, set the type by
+    * default to D - instructions that need floating point semantics will set
+    * this to F if they need to
+    */
+   return retype(reg, BRW_REGISTER_TYPE_D);
 }
 
 fs_reg
 fs_visitor::get_nir_dest(nir_dest dest)
 {
+   if (dest.is_ssa) {
+      nir_ssa_values[dest.ssa.index] = bld.vgrf(BRW_REGISTER_TYPE_F,
+                                                dest.ssa.num_components);
+      return nir_ssa_values[dest.ssa.index];
+   }
+
    return fs_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
                              dest.reg.indirect);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 34bf32d7ab3..395af73bc2e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -2026,6 +2026,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
    this->no16_msg = NULL;
 
    this->nir_locals = NULL;
+   this->nir_ssa_values = NULL;
    this->nir_globals = NULL;
 
    memset(&this->payload, 0, sizeof(this->payload));
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index d87e78312fd..3e154c10526 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -156,7 +156,7 @@ brw_create_nir(struct brw_context *brw,
       nir_print_shader(nir, stderr);
    }
 
-   nir_convert_from_ssa(nir, false);
+   nir_convert_from_ssa(nir, true);
    nir_validate_shader(nir);
 
    /* This is the last pass we run before we start emitting stuff.  It
diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index f0b018cf84a..9eb0ed9bd79 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -43,8 +43,8 @@
 static uint8_t
 get_resolve_status_for_src(nir_src *src)
 {
-   nir_instr *src_instr = nir_src_get_parent_instr(src);
-   if (src_instr) {
+   if (src->is_ssa) {
+      nir_instr *src_instr = src->ssa->parent_instr;
       uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
 
       /* If the source instruction needs resolve, then from the perspective
@@ -66,8 +66,8 @@ get_resolve_status_for_src(nir_src *src)
 static bool
 src_mark_needs_resolve(nir_src *src, void *void_state)
 {
-   nir_instr *src_instr = nir_src_get_parent_instr(src);
-   if (src_instr) {
+   if (src->is_ssa) {
+      nir_instr *src_instr = src->ssa->parent_instr;
       uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
 
       /* If the source instruction is unresolved, then mark it as needing
@@ -172,11 +172,11 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
             resolve_status = BRW_NIR_NON_BOOLEAN;
          }
 
-         /* If the destination is SSA-like, go ahead allow unresolved booleans.
+         /* If the destination is SSA, go ahead allow unresolved booleans.
           * If the destination register doesn't have a well-defined parent_instr
           * we need to resolve immediately.
           */
-         if (alu->dest.dest.reg.reg->parent_instr == NULL &&
+         if (!alu->dest.dest.is_ssa &&
              resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
             resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
          }

From 0ecdf04060518149e99a098caf4f6025fd6482a4 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Thu, 25 Jun 2015 16:22:26 -0700
Subject: [PATCH 0116/1208] i965/fs: emit constants only once

Before, we would lazily emit a MOV whenever we encountered a use of a
constant. Now that we have a dedicated file for SSA values, we can
instead only emit the MOV's once, which is more consistent and prevents
us from relying on CSE to re-combine the constants when they aren't
absorbed into the instruction.

total instructions in shared programs: 6078991 -> 6073118 (-0.10%)
instructions in affected programs:     402221 -> 396348 (-1.46%)
helped:                                1527
HURT:                                  0
GAINED:                                8
LOST:                                  2

v2: split this out from the previous commit (Jason)

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h       |  2 ++
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 27 ++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8170f2aa109..f20b540020f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -249,6 +249,8 @@ public:
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
    void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_load_const(const brw::fs_builder &bld,
+                            nir_load_const_instr *instr);
    void nir_emit_undef(const brw::fs_builder &bld,
                        nir_ssa_undef_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 166586ff52f..58896d72e14 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -462,9 +462,7 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_load_const:
-      /* We can hit these, but we do nothing now and use them as
-       * immediates later.
-       */
+      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
       break;
 
    case nir_instr_type_ssa_undef:
@@ -1148,6 +1146,18 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    }
 }
 
+void
+fs_visitor::nir_emit_load_const(const fs_builder &bld,
+                                nir_load_const_instr *instr)
+{
+   fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
+
+   for (unsigned i = 0; i < instr->def.num_components; i++)
+      bld.MOV(offset(reg, i), fs_reg(instr->value.i[i]));
+
+   nir_ssa_values[instr->def.index] = reg;
+}
+
 void
 fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
 {
@@ -1182,16 +1192,7 @@ fs_visitor::get_nir_src(nir_src src)
 {
    fs_reg reg;
    if (src.is_ssa) {
-      if (src.ssa->parent_instr->type == nir_instr_type_load_const) {
-         nir_load_const_instr *load =
-            nir_instr_as_load_const(src.ssa->parent_instr);
-         reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
-
-         for (unsigned i = 0; i < src.ssa->num_components; ++i)
-            bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
-      } else {
-         reg = nir_ssa_values[src.ssa->index];
-      }
+      reg = nir_ssa_values[src.ssa->index];
    } else {
       reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
                                src.reg.indirect);

From f49e51ef44ac6400967731b75db871129b6c45f5 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Wed, 24 Jun 2015 12:43:15 -0700
Subject: [PATCH 0117/1208] nir: remove nir_src_get_parent_instr()

It's now unused.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 8c99845f04d..e48db72c53d 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -565,16 +565,6 @@ nir_src_for_reg(nir_register *reg)
    return src;
 }
 
-static inline nir_instr *
-nir_src_get_parent_instr(const nir_src *src)
-{
-   if (src->is_ssa) {
-      return src->ssa->parent_instr;
-   } else {
-      return src->reg.reg->parent_instr;
-   }
-}
-
 static inline nir_dest
 nir_dest_for_reg(nir_register *reg)
 {

From aa7d4cecec1a1236d237b83ebf035285f438ee67 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Wed, 24 Jun 2015 12:55:41 -0700
Subject: [PATCH 0118/1208] nir: remove parent_instr from nir_register

It's no longer used.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir.c          | 1 -
 src/glsl/nir/nir.h          | 8 --------
 src/glsl/nir/nir_from_ssa.c | 8 --------
 3 files changed, 17 deletions(-)

diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index f03e80a4e0e..f661249f9bb 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -57,7 +57,6 @@ reg_create(void *mem_ctx, struct exec_list *list)
 {
    nir_register *reg = ralloc(mem_ctx, nir_register);
 
-   reg->parent_instr = NULL;
    list_inithead(&reg->uses);
    list_inithead(&reg->defs);
    list_inithead(&reg->if_uses);
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index e48db72c53d..4cb7d2f1eac 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -389,14 +389,6 @@ typedef struct {
     */
    bool is_packed;
 
-   /**
-    * If this pointer is non-NULL then this register has exactly one
-    * definition and that definition dominates all of its uses.  This is
-    * set by the out-of-SSA pass so that backends can get SSA-like
-    * information even once they have gone out of SSA.
-    */
-   struct nir_instr *parent_instr;
-
    /** set of nir_instr's where this register is used (read from) */
    struct list_head uses;
 
diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c
index e4a153e9584..1fd8b24d33d 100644
--- a/src/glsl/nir/nir_from_ssa.c
+++ b/src/glsl/nir/nir_from_ssa.c
@@ -496,14 +496,6 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
       reg->name = def->name;
       reg->num_components = def->num_components;
       reg->num_array_elems = 0;
-
-      /* This register comes from an SSA definition that is defined and not
-       * part of a phi-web.  Therefore, we know it has a single unique
-       * definition that dominates all of its uses; we can copy the
-       * parent_instr from the SSA def safely.
-       */
-      if (def->parent_instr->type != nir_instr_type_ssa_undef)
-         reg->parent_instr = def->parent_instr;
    }
 
    nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg), state->mem_ctx);

From 3258e1b80d66ec26f14a24a5eae0629a2d23a444 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 11:53:08 -0700
Subject: [PATCH 0119/1208] i965/fs: Use a switch statement in
 fs_inst::regs_read()

This makes things a little simpler, more efficient, and quite a bit more
readable.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 45 ++++++++++++++--------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8658554e96b..79ca33e42ed 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -701,28 +701,29 @@ fs_inst::is_partial_write() const
 int
 fs_inst::regs_read(int arg) const
 {
-   if (is_tex() && arg == 0 && src[0].file == GRF) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
-      return exec_size / 4;
+   switch (opcode) {
+   case FS_OPCODE_FB_WRITE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      if (arg == 0)
+         return mlen;
+      break;
+
+   case FS_OPCODE_LINTERP:
+      if (arg == 0)
+         return exec_size / 4;
+      break;
+
+   default:
+      if (is_tex() && arg == 0 && src[0].file == GRF)
+         return mlen;
+      break;
    }
 
    switch (src[arg].file) {

From 241317d59ab440bdcda25bacaadacfb3b4c2dd93 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 19 Jun 2015 12:58:37 -0700
Subject: [PATCH 0120/1208] i965/fs: Actually set/use the mlen for gen7 uniform
 pull constant loads

Previously, we were allocating the payload with different sizes per gen and
then figuring out the mlen in the generator based on gen.  This meant,
among other things, that the higher level passes knew nothing about it.

Acked-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp          | 19 ++++++++++++-------
 .../drivers/dri/i965/brw_fs_generator.cpp     |  9 +++------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 79ca33e42ed..94f42949ce2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2909,14 +2909,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
-         fs_reg payload = fs_reg(GRF, alloc.allocate(1));
 
-         /* We have to use a message header on Skylake to get SIMD4x2 mode.
-          * Reserve space for the register.
-          */
+         fs_reg payload, offset;
          if (devinfo->gen >= 9) {
-            payload.reg_offset++;
-            alloc.sizes[payload.reg] = 2;
+            /* We have to use a message header on Skylake to get SIMD4x2
+             * mode.  Reserve space for the register.
+            */
+            offset = payload = fs_reg(GRF, alloc.allocate(2));
+            offset.reg_offset++;
+            inst->mlen = 2;
+         } else {
+            offset = payload = fs_reg(GRF, alloc.allocate(1));
+            inst->mlen = 1;
          }
 
          /* This is actually going to be a MOV, but since only the first dword
@@ -2925,7 +2929,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
           * by live variable analysis, or register allocation will explode.
           */
          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
-                                               8, payload, const_offset_reg);
+                                               8, offset, const_offset_reg);
          setup->force_writemask_all = true;
 
          setup->ir = inst->ir;
@@ -2938,6 +2942,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
           */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
+         inst->base_mrf = -1;
 
          invalidate_live_intervals();
       } else {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 2ed0bac6fd9..8d821abbac2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1054,7 +1054,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                        struct brw_reg index,
                                                        struct brw_reg offset)
 {
-   assert(inst->mlen == 0);
    assert(index.type == BRW_REGISTER_TYPE_UD);
 
    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
@@ -1069,12 +1068,10 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 
    struct brw_reg src = offset;
    bool header_present = false;
-   int mlen = 1;
 
    if (devinfo->gen >= 9) {
       /* Skylake requires a message header in order to use SIMD4x2 mode. */
-      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
-      mlen = 2;
+      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
       header_present = true;
 
       brw_push_insn_state(p);
@@ -1105,7 +1102,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
+                              inst->mlen,
                               header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
@@ -1135,7 +1132,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
+                              inst->mlen,
                               header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);

From aca5228011e7b9e96f3bd3a621c88e63ba47a4f3 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 17 Jun 2015 18:02:11 -0700
Subject: [PATCH 0121/1208] i965/fs: Fix fs_inst::regs_read() for uniform pull
 constant loads

Previously, fs_inst::regs_read() fell back to depending on the register
width for the second source.  This isn't really correct since it isn't a
SIMD8 value at all, but a SIMD4x2 value.  This commit changes it to
explicitly be always one register.

v2: Use mlen for determining the number of registers read

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 94f42949ce2..e83a0923e80 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -715,6 +715,12 @@ fs_inst::regs_read(int arg) const
          return mlen;
       break;
 
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+      /* The payload is actually stored in src1 */
+      if (arg == 1)
+         return mlen;
+      break;
+
    case FS_OPCODE_LINTERP:
       if (arg == 0)
          return exec_size / 4;

From 12bc22ef58377191508af91a918efd18e2da7500 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 17:48:27 -0700
Subject: [PATCH 0122/1208] i965/fs: Report the right value in
 fs_inst::regs_read() for PIXEL_X/Y

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index e83a0923e80..d91ad0a0650 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -701,6 +701,7 @@ fs_inst::is_partial_write() const
 int
 fs_inst::regs_read(int arg) const
 {
+   unsigned components = 1;
    switch (opcode) {
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
@@ -726,6 +727,12 @@ fs_inst::regs_read(int arg) const
          return exec_size / 4;
       break;
 
+   case FS_OPCODE_PIXEL_X:
+   case FS_OPCODE_PIXEL_Y:
+      if (arg == 0)
+         components = 1;
+      break;
+
    default:
       if (is_tex() && arg == 0 && src[0].file == GRF)
          return mlen;
@@ -742,8 +749,8 @@ fs_inst::regs_read(int arg) const
       if (src[arg].stride == 0) {
          return 1;
       } else {
-         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
-         return (size + 31) / 32;
+         int size = components * src[arg].width * type_sz(src[arg].type);
+         return DIV_ROUND_UP(size * src[arg].stride, 32);
       }
    case MRF:
       unreachable("MRF registers are not allowed as sources");

From c5a8da5f24eae4479b4ebe6301d780f781e24ed2 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 30 Jun 2015 15:51:13 -0700
Subject: [PATCH 0123/1208] i965/fs: Properly handle LOAD_PAYLOAD in
 fs_inst::regs_read

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d91ad0a0650..cae4e4263ea 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -733,6 +733,11 @@ fs_inst::regs_read(int arg) const
          components = 1;
       break;
 
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+      if (arg < this->header_size)
+         return 1;
+      break;
+
    default:
       if (is_tex() && arg == 0 && src[0].file == GRF)
          return mlen;

From 438e9c8b88c8faf7cbc2a20b03c077342be214e3 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 17 Jun 2015 17:32:24 -0700
Subject: [PATCH 0124/1208] i965/fs: Explicitly set the exec_size on the
 add(32) in interpolation setup

Soon we will start using the builder to explicitly set all the execution
sizes.  We could make a 32-wide builder, but the builder asserts that we
never grow it which is usually a reasonable assumption.  Since this one
instruction is a bit of an odd-ball, we just set the exec_size explicitly.

v2: Explicitly new the fs_inst instead of using the builder and setting
    exec_size after the fact.

v3: Set force_writemask_all with the builder instead of directly.  The
    builder over-writes it if we set it manually.  Also, if we don't have
    force_writemask_all in the builder it will assert-fail on SIMD32.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 395af73bc2e..89eb71769a3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1358,10 +1358,12 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
-      abld.exec_all()
-          .ADD(int_pixel_xy,
-               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
-               fs_reg(brw_imm_v(0x11001010)));
+      fs_inst *add =
+         new (mem_ctx) fs_inst(BRW_OPCODE_ADD, dispatch_width * 2,
+                               int_pixel_xy,
+                               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
+                               fs_reg(brw_imm_v(0x11001010)));
+      abld.exec_all().emit(add);
 
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);

From 362eff7741f9ca6e49074509120a2e6c03ef7ae6 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 15:58:59 -0700
Subject: [PATCH 0125/1208] i965/fs: Set the builder group for emitting
 FB-write stencil/AA alpha

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 89eb71769a3..69d3cfa8897 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1530,7 +1530,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 
    if (payload.aa_dest_stencil_reg) {
       sources[length] = fs_reg(GRF, alloc.allocate(1));
-      bld.exec_all().annotate("FB write stencil/AA alpha")
+      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
          .MOV(sources[length],
               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
       length++;

From b535ba55ed6023f402374aeff79f9f37dbb21df0 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:00:54 -0700
Subject: [PATCH 0126/1208] i965/blorp: Explicitly set execution sizes for
 new'd instructions

This doesn't affect instructions allocated using the builder.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index 789520c7353..d458ad846bf 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -73,7 +73,7 @@ brw_blorp_eu_emitter::emit_kill_if_outside_rect(const struct brw_reg &x,
    emit_cmp(BRW_CONDITIONAL_L, x, dst_x1)->predicate = BRW_PREDICATE_NORMAL;
    emit_cmp(BRW_CONDITIONAL_L, y, dst_y1)->predicate = BRW_PREDICATE_NORMAL;
 
-   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, g1, f0, g1);
+   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, 16, g1, f0, g1);
    inst->force_writemask_all = true;
    insts.push_tail(inst);
 }
@@ -84,7 +84,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
                                           unsigned base_mrf,
                                           unsigned msg_length)
 {
-   fs_inst *inst = new (mem_ctx) fs_inst(op, dst, brw_message_reg(base_mrf),
+   fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf),
                                          fs_reg(0u));
 
    inst->base_mrf = base_mrf;
@@ -119,7 +119,8 @@ brw_blorp_eu_emitter::emit_combine(enum opcode combine_opcode,
 {
    assert(combine_opcode == BRW_OPCODE_ADD || combine_opcode == BRW_OPCODE_AVG);
 
-   insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, dst, src_1, src_2));
+   insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, 16, dst,
+                                         src_1, src_2));
 }
 
 fs_inst *
@@ -127,7 +128,7 @@ brw_blorp_eu_emitter::emit_cmp(enum brw_conditional_mod op,
                                const struct brw_reg &x,
                                const struct brw_reg &y)
 {
-   fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP,
+   fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP, 16,
                                         vec16(brw_null_reg()), x, y);
    cmp->conditional_mod = op;
    insts.push_tail(cmp);

From 7fcbe141076d18bf0245de1fd589c82f7c543fdf Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 25 Jun 2015 10:55:51 -0700
Subject: [PATCH 0127/1208] i965/fs: Move offset(fs_reg, unsigned) to brw_fs.h

Shortly, offset() will depend on the builder so we need it moved to some
place where it has access to that.

Reviewed-by: Iago Toral Quiroga <itoral@igali.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.h    | 21 +++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_ir_fs.h | 21 ---------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f20b540020f..06f46765dd1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -62,6 +62,27 @@ namespace brw {
    class fs_live_variables;
 }
 
+static inline fs_reg
+offset(fs_reg reg, unsigned delta)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+      break;
+   case GRF:
+   case MRF:
+   case ATTR:
+      return byte_offset(reg,
+                         delta * MAX2(reg.width * reg.stride, 1) *
+                         type_sz(reg.type));
+   case UNIFORM:
+      reg.reg_offset += delta;
+      break;
+   default:
+      assert(delta == 0);
+   }
+   return reg;
+}
+
 /**
  * The fragment shader front-end.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 96dc20da3cf..16b20beb788 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -128,27 +128,6 @@ horiz_offset(fs_reg reg, unsigned delta)
    return reg;
 }
 
-static inline fs_reg
-offset(fs_reg reg, unsigned delta)
-{
-   switch (reg.file) {
-   case BAD_FILE:
-      break;
-   case GRF:
-   case MRF:
-   case ATTR:
-      return byte_offset(reg,
-                         delta * MAX2(reg.width * reg.stride, 1) *
-                         type_sz(reg.type));
-   case UNIFORM:
-      reg.reg_offset += delta;
-      break;
-   default:
-      assert(delta == 0);
-   }
-   return reg;
-}
-
 static inline fs_reg
 component(fs_reg reg, unsigned idx)
 {

From f7dcc1160331462a071c54ca1067f9e2f57b55be Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:07:27 -0700
Subject: [PATCH 0128/1208] i965/fs: Add a builder argument to offset()

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp          |  42 ++---
 src/mesa/drivers/dri/i965/brw_fs.h            |   2 +-
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp      |   2 +-
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp      |  58 +++----
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp  | 143 +++++++++---------
 .../dri/i965/test_fs_cmod_propagation.cpp     |   4 +-
 .../dri/i965/test_fs_saturate_propagation.cpp |   4 +-
 7 files changed, 132 insertions(+), 123 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index cae4e4263ea..ceac20cc97a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -267,7 +267,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
          inst->mlen = 1 + dispatch_width / 8;
    }
 
-   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
+   bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 }
 
 /**
@@ -361,7 +361,12 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
       reg.width = this->src[i].width;
       if (!this->src[i].equals(reg))
          return false;
-      reg = ::offset(reg, 1);
+
+      if (i < this->header_size) {
+         reg.reg_offset += 1;
+      } else {
+         reg.reg_offset += this->exec_size / 8;
+      }
    }
 
    return true;
@@ -926,7 +931,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
    } else {
       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.y */
    if (!flip && pixel_center_integer) {
@@ -942,7 +947,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 
       bld.ADD(wpos, pixel_y, fs_reg(offset));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.z */
    if (devinfo->gen >= 6) {
@@ -952,7 +957,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
    bld.MOV(wpos, this->wpos_w);
@@ -1035,7 +1040,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	    /* If there's no incoming setup data for this slot, don't
 	     * emit interpolation for it.
 	     */
-	    attr = offset(attr, type->vector_elements);
+	    attr = offset(attr, bld, type->vector_elements);
 	    location++;
 	    continue;
 	 }
@@ -1050,7 +1055,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	       interp = suboffset(interp, 3);
                interp.type = attr.type;
                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
-	       attr = offset(attr, 1);
+	       attr = offset(attr, bld, 1);
 	    }
 	 } else {
 	    /* Smooth/noperspective interpolation case. */
@@ -1088,7 +1093,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                   bld.MUL(attr, attr, this->pixel_w);
                }
-	       attr = offset(attr, 1);
+	       attr = offset(attr, bld, 1);
 	    }
 
 	 }
@@ -1196,7 +1201,7 @@ fs_visitor::emit_samplepos_setup()
    }
    /* Compute gl_SamplePosition.x */
    compute_sample_position(pos, int_sample_x);
-   pos = offset(pos, 1);
+   pos = offset(pos, abld, 1);
    if (dispatch_width == 8) {
       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
    } else {
@@ -2986,10 +2991,6 @@ fs_visitor::lower_load_payload()
 
       assert(inst->dst.file == MRF || inst->dst.file == GRF);
       assert(inst->saturate == false);
-
-      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
-                                 .exec_all(inst->force_writemask_all)
-                                 .at(block, inst);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
@@ -2997,17 +2998,23 @@ fs_visitor::lower_load_payload()
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
       dst.width = 8;
+      const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
+
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
             mov_src.width = 8;
-            ibld.exec_all().MOV(mov_dst, mov_src);
+            hbld.MOV(mov_dst, mov_src);
          }
-         dst = offset(dst, 1);
+         dst = offset(dst, hbld, 1);
       }
 
       dst.width = inst->exec_size;
+      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
+                                 .exec_all(inst->force_writemask_all)
+                                 .at(block, inst);
+
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3039,7 +3046,8 @@ fs_visitor::lower_load_payload()
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
                   mov_dst.width = 8;
                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
-                  ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
+                  mov_dst.reg += 4;
+                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
                }
             }
 
@@ -3064,7 +3072,7 @@ fs_visitor::lower_load_payload()
       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
          if (inst->src[i].file != BAD_FILE)
             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
-         dst = offset(dst, 1);
+         dst = offset(dst, ibld, 1);
       }
 
       inst->remove(block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 06f46765dd1..ece7e49ea17 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -63,7 +63,7 @@ namespace brw {
 }
 
 static inline fs_reg
-offset(fs_reg reg, unsigned delta)
+offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
 {
    switch (reg.file) {
    case BAD_FILE:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 70f0217b93d..29d1f2a6a57 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -205,7 +205,7 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
       }
       for (int i = header_size; i < sources; i++) {
          payload[i] = src;
-         src = offset(src, 1);
+         src = offset(src, ubld, 1);
       }
       copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
    } else {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 58896d72e14..f52f344ccff 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -76,7 +76,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
 {
    foreach_list_typed(nir_variable, var, node, &shader->inputs) {
       enum brw_reg_type type = brw_type_for_base_type(var->type);
-      fs_reg input = offset(nir_inputs, var->data.driver_location);
+      fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
 
       fs_reg reg;
       switch (stage) {
@@ -95,8 +95,8 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          unsigned array_length = var->type->is_array() ? var->type->length : 1;
          for (unsigned i = 0; i < array_length; i++) {
             for (unsigned j = 0; j < components; j++) {
-               bld.MOV(retype(offset(input, components * i + j), type),
-                       offset(fs_reg(ATTR, var->data.location + i, type), j));
+               bld.MOV(retype(offset(input, bld, components * i + j), type),
+                       offset(fs_reg(ATTR, var->data.location + i, type), bld, j));
             }
          }
          break;
@@ -127,7 +127,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    foreach_list_typed(nir_variable, var, node, &shader->outputs) {
-      fs_reg reg = offset(nir_outputs, var->data.driver_location);
+      fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 
       int vector_elements =
          var->type->is_array() ? var->type->fields.array->vector_elements
@@ -136,7 +136,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
       if (stage == MESA_SHADER_VERTEX) {
          for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
             int output = var->data.location + i;
-            this->outputs[output] = offset(reg, 4 * i);
+            this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
          }
       } else if (var->data.index > 0) {
@@ -162,7 +162,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
          /* General color output. */
          for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
             int output = var->data.location - FRAG_RESULT_DATA0 + i;
-            this->outputs[output] = offset(reg, vector_elements * i);
+            this->outputs[output] = offset(reg, bld, vector_elements * i);
             this->output_components[output] = vector_elements;
          }
       }
@@ -618,11 +618,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
             continue;
 
          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
-            inst = bld.MOV(offset(temp, i),
-                           offset(op[0], instr->src[0].swizzle[i]));
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[0], bld, instr->src[0].swizzle[i]));
          } else {
-            inst = bld.MOV(offset(temp, i),
-                           offset(op[i], instr->src[i].swizzle[0]));
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[i], bld, instr->src[i].swizzle[0]));
          }
          inst->saturate = instr->dest.saturate;
       }
@@ -636,7 +636,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
             if (!(instr->dest.write_mask & (1 << i)))
                continue;
 
-            bld.MOV(offset(result, i), offset(temp, i));
+            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
          }
       }
       return;
@@ -657,12 +657,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
       channel = ffs(instr->dest.write_mask) - 1;
 
-      result = offset(result, channel);
+      result = offset(result, bld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
    }
 
    switch (instr->op) {
@@ -1153,7 +1153,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
    fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
 
    for (unsigned i = 0; i < instr->def.num_components; i++)
-      bld.MOV(offset(reg, i), fs_reg(instr->value.i[i]));
+      bld.MOV(offset(reg, bld, i), fs_reg(instr->value.i[i]));
 
    nir_ssa_values[instr->def.index] = reg;
 }
@@ -1175,7 +1175,7 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
    else
       reg = v->nir_locals[nir_reg->index];
 
-   reg = offset(reg, base_offset * nir_reg->num_components);
+   reg = offset(reg, v->bld, base_offset * nir_reg->num_components);
    if (indirect) {
       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
 
@@ -1227,10 +1227,10 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
          continue;
 
       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
-      new_inst->dst = offset(new_inst->dst, i);
+      new_inst->dst = offset(new_inst->dst, bld, i);
       for (unsigned j = 0; j < new_inst->sources; j++)
          if (new_inst->src[j].file == GRF)
-            new_inst->src[j] = offset(new_inst->src[j], i);
+            new_inst->src[j] = offset(new_inst->src[j], bld, i);
 
       bld.emit(new_inst);
    }
@@ -1341,7 +1341,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       assert(sample_pos.file != BAD_FILE);
       dest.type = sample_pos.type;
       bld.MOV(dest, sample_pos);
-      bld.MOV(offset(dest, 1), offset(sample_pos, 1));
+      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
       break;
    }
 
@@ -1368,13 +1368,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       }
 
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(uniform_reg, dest.type), index);
+         fs_reg src = offset(retype(uniform_reg, dest.type), bld, index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
          index++;
 
          bld.MOV(dest, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1416,7 +1416,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
          unsigned vec4_offset = instr->const_index[0] / 4;
          for (int i = 0; i < instr->num_components; i++)
-            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, i), surf_index,
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
                                        base_offset, vec4_offset + i);
       } else {
          fs_reg packed_consts = vgrf(glsl_type::float_type);
@@ -1435,7 +1435,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             assert(packed_consts.subreg_offset < 32);
 
             bld.MOV(dest, packed_consts);
-            dest = offset(dest, 1);
+            dest = offset(dest, bld, 1);
          }
       }
       break;
@@ -1447,14 +1447,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_input: {
       unsigned index = 0;
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(nir_inputs, dest.type),
+         fs_reg src = offset(retype(nir_inputs, dest.type), bld,
                              instr->const_index[0] + index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
          index++;
 
          bld.MOV(dest, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1527,7 +1527,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                                        BRW_REGISTER_TYPE_F);
             for (int i = 0; i < 2; i++) {
                fs_reg temp = vgrf(glsl_type::float_type);
-               bld.MUL(temp, offset(offset_src, i), fs_reg(16.0f));
+               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
                fs_reg itemp = vgrf(glsl_type::int_type);
                bld.MOV(itemp, temp);  /* float to int */
 
@@ -1547,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                 */
                set_condmod(BRW_CONDITIONAL_L,
-                           bld.SEL(offset(src, i), itemp, fs_reg(7)));
+                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
             }
 
             mlen = 2;
@@ -1571,7 +1571,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          src.type = dest.type;
 
          bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1583,13 +1583,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg src = get_nir_src(instr->src[0]);
       unsigned index = 0;
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg new_dest = offset(retype(nir_outputs, src.type),
+         fs_reg new_dest = offset(retype(nir_outputs, src.type), bld,
                                   instr->const_index[0] + index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
          index++;
          bld.MOV(new_dest, src);
-         src = offset(src, 1);
+         src = offset(src, bld, 1);
       }
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 69d3cfa8897..9b91f47d264 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -95,7 +95,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
    if (shadow_c.file != BAD_FILE) {
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
       }
 
       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
@@ -124,7 +124,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
    } else if (op == ir_tex) {
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
       }
       /* zero the others. */
       for (int i = coord_components; i<3; i++) {
@@ -137,7 +137,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
       }
       /* the slots for u and v are always present, but r is optional */
       mlen += MAX2(coord_components, 2);
@@ -158,13 +158,13 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        */
       for (int i = 0; i < grad_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
-	 dPdx = offset(dPdx, 1);
+	 dPdx = offset(dPdx, bld, 1);
       }
       mlen += MAX2(grad_components, 2);
 
       for (int i = 0; i < grad_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
-	 dPdy = offset(dPdy, 1);
+	 dPdy = offset(dPdy, bld, 1);
       }
       mlen += MAX2(grad_components, 2);
    } else if (op == ir_txs) {
@@ -182,7 +182,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
                  coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
       }
 
       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
@@ -232,8 +232,8 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
    if (simd16) {
       for (int i = 0; i < 4; i++) {
          bld.MOV(orig_dst, dst);
-	 orig_dst = offset(orig_dst, 1);
-	 dst = offset(dst, 2);
+	 orig_dst = offset(orig_dst, bld, 1);
+	 dst = offset(dst, bld, 2);
       }
    }
 
@@ -257,31 +257,31 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
 
    /* Copy the coordinates. */
    for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(message, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, 1);
+      bld.MOV(retype(offset(message, bld, i), coordinate.type), coordinate);
+      coordinate = offset(coordinate, bld, 1);
    }
 
-   fs_reg msg_end = offset(message, vector_elements);
+   fs_reg msg_end = offset(message, bld, vector_elements);
 
    /* Messages other than sample and ld require all three components */
    if (vector_elements > 0 && (has_lod || shadow_c.file != BAD_FILE)) {
       for (int i = vector_elements; i < 3; i++) {
-         bld.MOV(offset(message, i), fs_reg(0.0f));
+         bld.MOV(offset(message, bld, i), fs_reg(0.0f));
       }
-      msg_end = offset(message, 3);
+      msg_end = offset(message, bld, 3);
    }
 
    if (has_lod) {
       fs_reg msg_lod = retype(msg_end, op == ir_txf ?
                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
       bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
    }
 
    if (shadow_c.file != BAD_FILE) {
-      fs_reg msg_ref = offset(message, 3 + has_lod);
+      fs_reg msg_ref = offset(message, bld, 3 + has_lod);
       bld.MOV(msg_ref, shadow_c);
-      msg_end = offset(msg_ref, 1);
+      msg_end = offset(msg_ref, bld, 1);
    }
 
    enum opcode opcode;
@@ -335,16 +335,16 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    }
 
    for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, 1);
+      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+      coordinate = offset(coordinate, bld, 1);
    }
-   fs_reg msg_end = offset(msg_coords, vector_elements);
-   fs_reg msg_lod = offset(msg_coords, 4);
+   fs_reg msg_end = offset(msg_coords, bld, vector_elements);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
 
    if (shadow_c.file != BAD_FILE) {
       fs_reg msg_shadow = msg_lod;
       bld.MOV(msg_shadow, shadow_c);
-      msg_lod = offset(msg_shadow, 1);
+      msg_lod = offset(msg_shadow, bld, 1);
       msg_end = msg_lod;
    }
 
@@ -355,13 +355,13 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       break;
    case ir_txb:
       bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
 
       opcode = FS_OPCODE_TXB;
       break;
    case ir_txl:
       bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
 
       opcode = SHADER_OPCODE_TXL;
       break;
@@ -378,12 +378,12 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       msg_end = msg_lod;
       for (int i = 0; i < grad_components; i++) {
          bld.MOV(msg_end, lod);
-         lod = offset(lod, 1);
-         msg_end = offset(msg_end, 1);
+         lod = offset(lod, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
 
          bld.MOV(msg_end, lod2);
-         lod2 = offset(lod2, 1);
-         msg_end = offset(msg_end, 1);
+         lod2 = offset(lod2, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
       }
 
       opcode = SHADER_OPCODE_TXD;
@@ -392,31 +392,31 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    case ir_txs:
       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
       bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_query_levels:
       msg_lod = msg_end;
       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_txf:
-      msg_lod = offset(msg_coords, 3);
+      msg_lod = offset(msg_coords, bld, 3);
       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
-      msg_end = offset(msg_lod, 1);
+      msg_end = offset(msg_lod, bld, 1);
 
       opcode = SHADER_OPCODE_TXF;
       break;
    case ir_txf_ms:
-      msg_lod = offset(msg_coords, 3);
+      msg_lod = offset(msg_coords, bld, 3);
       /* lod */
       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
       /* sample index */
-      bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index);
-      msg_end = offset(msg_lod, 2);
+      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
 
       opcode = SHADER_OPCODE_TXF_CMS;
       break;
@@ -526,7 +526,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        */
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(sources[length], coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
 	 length++;
 
          /* For cube map array, the coordinate is (u,v,r,ai) but there are
@@ -534,11 +534,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
           */
          if (i < grad_components) {
             bld.MOV(sources[length], lod);
-            lod = offset(lod, 1);
+            lod = offset(lod, bld, 1);
             length++;
 
             bld.MOV(sources[length], lod2);
-            lod2 = offset(lod2, 1);
+            lod2 = offset(lod2, bld, 1);
             length++;
          }
       }
@@ -560,13 +560,13 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        */
 
       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, 1);
+      coordinate = offset(coordinate, bld, 1);
       length++;
 
       if (devinfo->gen >= 9) {
          if (coord_components >= 2) {
             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-            coordinate = offset(coordinate, 1);
+            coordinate = offset(coordinate, bld, 1);
          }
          length++;
       }
@@ -576,7 +576,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 
       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-	 coordinate = offset(coordinate, 1);
+	 coordinate = offset(coordinate, bld, 1);
 	 length++;
       }
 
@@ -595,7 +595,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        */
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-         coordinate = offset(coordinate, 1);
+         coordinate = offset(coordinate, bld, 1);
          length++;
       }
 
@@ -609,19 +609,19 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
          /* More crazy intermixing */
          for (int i = 0; i < 2; i++) { /* u, v */
             bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, 1);
+            coordinate = offset(coordinate, bld, 1);
             length++;
          }
 
          for (int i = 0; i < 2; i++) { /* offu, offv */
             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
-            offset_value = offset(offset_value, 1);
+            offset_value = offset(offset_value, bld, 1);
             length++;
          }
 
          if (coord_components == 3) { /* r if present */
             bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, 1);
+            coordinate = offset(coordinate, bld, 1);
             length++;
          }
 
@@ -634,7 +634,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    if (!coordinate_done) {
       for (int i = 0; i < coord_components; i++) {
          bld.MOV(sources[length], coordinate);
-         coordinate = offset(coordinate, 1);
+         coordinate = offset(coordinate, bld, 1);
          length++;
       }
    }
@@ -747,8 +747,8 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       coordinate = dst;
 
       bld.MUL(dst, src, scale_x);
-      dst = offset(dst, 1);
-      src = offset(src, 1);
+      dst = offset(dst, bld, 1);
+      src = offset(src, bld, 1);
       bld.MUL(dst, src, scale_y);
    } else if (is_rect) {
       /* On gen6+, the sampler handles the rectangle coordinates
@@ -761,7 +761,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       for (int i = 0; i < 2; i++) {
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
-	    chan = offset(chan, i);
+	    chan = offset(chan, bld, i);
 
             set_condmod(BRW_CONDITIONAL_GE,
                         bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
@@ -786,7 +786,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       for (int i = 0; i < MIN2(coord_components, 3); i++) {
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
-	    chan = offset(chan, i);
+	    chan = offset(chan, bld, i);
             set_saturate(true, bld.MOV(chan, chan));
 	 }
       }
@@ -808,7 +808,7 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
    for (int i = 0; i < components; i++) {
       sources[i] = vgrf(glsl_type::float_type);
       bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, 1);
+      coordinate = offset(coordinate, bld, 1);
    }
 
    bld.LOAD_PAYLOAD(payload, sources, components, 0);
@@ -854,7 +854,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
          for (int i=0; i<4; i++) {
             bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
-            res = offset(res, 1);
+            res = offset(res, bld, 1);
          }
          return;
       }
@@ -908,7 +908,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
    /* fixup #layers for cube map arrays */
    if (op == ir_txs && is_cube_array) {
-      fs_reg depth = offset(dst, 2);
+      fs_reg depth = offset(dst, bld, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
       bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
@@ -918,7 +918,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
          if (i == 2) {
             fixed_payload[i] = fixed_depth;
          } else {
-            fixed_payload[i] = offset(dst, i);
+            fixed_payload[i] = offset(dst, bld, i);
          }
       }
       bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
@@ -953,7 +953,7 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
          bld.ASR(dst, dst, fs_reg(32 - width));
       }
 
-      dst = offset(dst, 1);
+      dst = offset(dst, bld, 1);
    }
 }
 
@@ -990,7 +990,7 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
 {
    if (op == ir_query_levels) {
       /* # levels is in .w */
-      this->result = offset(orig_val, 3);
+      this->result = offset(orig_val, bld, 3);
       return;
    }
 
@@ -1011,15 +1011,15 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
       for (int i = 0; i < 4; i++) {
 	 int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
 	 fs_reg l = swizzled_result;
-	 l = offset(l, i);
+	 l = offset(l, bld, i);
 
 	 if (swiz == SWIZZLE_ZERO) {
             bld.MOV(l, fs_reg(0.0f));
 	 } else if (swiz == SWIZZLE_ONE) {
             bld.MOV(l, fs_reg(1.0f));
 	 } else {
-            bld.MOV(l, offset(orig_val,
-                              GET_SWZ(key_tex->swizzles[sampler], i)));
+            bld.MOV(l, offset(orig_val, bld,
+                                  GET_SWZ(key_tex->swizzles[sampler], i)));
 	 }
       }
       this->result = swizzled_result;
@@ -1316,14 +1316,14 @@ fs_visitor::emit_interpolation_setup_gen4()
 
    if (devinfo->has_pln && dispatch_width == 16) {
       for (unsigned i = 0; i < 2; i++) {
-         abld.half(i).ADD(half(offset(delta_xy, i), 0),
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
                           half(this->pixel_x, i), xstart);
-         abld.half(i).ADD(half(offset(delta_xy, i), 1),
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
                           half(this->pixel_y, i), ystart);
       }
    } else {
-      abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart);
-      abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart);
+      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
+      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
    }
 
    abld = bld.annotate("compute pos.w and 1/pos.w");
@@ -1421,7 +1421,7 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
       fs_reg tmp = vgrf(glsl_type::vec4_type);
       assert(color.type == BRW_REGISTER_TYPE_F);
       for (unsigned i = 0; i < components; i++) {
-         inst = bld.MOV(offset(tmp, i), offset(color, i));
+         inst = bld.MOV(offset(tmp, bld, i), offset(color, bld, i));
          inst->saturate = true;
       }
       color = tmp;
@@ -1430,10 +1430,10 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
    if (exec_size < dispatch_width) {
       unsigned half_idx = use_2nd_half ? 1 : 0;
       for (unsigned i = 0; i < components; i++)
-         dst[i] = half(offset(color, i), half_idx);
+         dst[i] = half(offset(color, bld, i), half_idx);
    } else {
       for (unsigned i = 0; i < components; i++)
-         dst[i] = offset(color, i);
+         dst[i] = offset(color, bld, i);
    }
 }
 
@@ -1481,7 +1481,7 @@ fs_visitor::emit_alpha_test()
                      BRW_CONDITIONAL_NEQ);
    } else {
       /* RT0 alpha */
-      fs_reg color = offset(outputs[0], 3);
+      fs_reg color = offset(outputs[0], bld, 3);
 
       /* f0.1 &= func(color, ref) */
       cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
@@ -1558,7 +1558,8 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
        * alpha-testing, alpha-to-coverage, and so on.
        */
       if (this->outputs[0].file != BAD_FILE)
-         setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
+         setup_color_payload(&sources[length + 3],
+                             offset(this->outputs[0], bld, 3),
                              1, exec_size, false);
       length += 4;
    } else if (color1.file == BAD_FILE) {
@@ -1694,7 +1695,7 @@ fs_visitor::emit_fb_writes()
 
          fs_reg src0_alpha;
          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
-            src0_alpha = offset(outputs[0], 3);
+            src0_alpha = offset(outputs[0], bld, 3);
 
          inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                      src0_alpha,
@@ -1787,7 +1788,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
       abld.MUL(output, outputs[clip_vertex], u);
       for (int j = 1; j < 4; j++) {
          u.reg = userplane[i].reg + j;
-         abld.MAD(output, output, offset(outputs[clip_vertex], j), u);
+         abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u);
       }
    }
 }
@@ -1904,13 +1905,13 @@ fs_visitor::emit_urb_writes()
              */
             for (int i = 0; i < 4; i++) {
                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
-               src = offset(this->outputs[varying], i);
+               src = offset(this->outputs[varying], bld, i);
                set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
             }
          } else {
             for (int i = 0; i < 4; i++)
-               sources[length++] = offset(this->outputs[varying], i);
+               sources[length++] = offset(this->outputs[varying], bld, i);
          }
          break;
       }
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 8010fb4f610..ba67bc59e19 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -283,10 +283,10 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
    fs_reg zero(0.0f);
-   bld.ADD(offset(dest, 2), src0, src1);
+   bld.ADD(offset(dest, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dest, src2)
       ->regs_written = 4;
-   bld.CMP(bld.null_reg_f(), offset(dest, 2), zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index 3ef0cb319eb..1caa0b50ec6 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -367,10 +367,10 @@ TEST_F(saturate_propagation_test, intervening_dest_write)
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
-   bld.ADD(offset(dst0, 2), src0, src1);
+   bld.ADD(offset(dst0, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dst0, src2)
       ->regs_written = 4;
-   set_saturate(true, bld.MOV(dst1, offset(dst0, 2)));
+   set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2)));
 
    /* = Before =
     *

From 67c4c9e1a709508b88d6d31eb1f7cb61d187189e Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:24:27 -0700
Subject: [PATCH 0129/1208] i965/fs: Make better use of the builder in
 shader_time

Previously, we were just depending on register widths to ensure that
various things were exec_size of 1 etc.  Now, we do so explicitly using the
builder.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ceac20cc97a..464c1f673cd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -557,7 +557,7 @@ fs_visitor::get_timestamp(const fs_builder &bld)
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
     */
-   bld.exec_all().MOV(dst, ts);
+   bld.group(4, 0).exec_all().MOV(dst, ts);
 
    /* The caller wants the low 32 bits of the timestamp.  Since it's running
     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
@@ -604,17 +604,19 @@ fs_visitor::emit_shader_time_end()
    start.negate = true;
    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
    diff.set_smear(0);
-   ibld.ADD(diff, start, shader_end_time);
+
+   const fs_builder cbld = ibld.group(1, 0);
+   cbld.group(1, 0).ADD(diff, start, shader_end_time);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   ibld.ADD(diff, diff, fs_reg(-2u));
-   SHADER_TIME_ADD(ibld, 0, diff);
-   SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
+   cbld.ADD(diff, diff, fs_reg(-2u));
+   SHADER_TIME_ADD(cbld, 0, diff);
+   SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
    ibld.emit(BRW_OPCODE_ELSE);
-   SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
+   SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
    ibld.emit(BRW_OPCODE_ENDIF);
 }
 

From 89bc4c78c394e50ddb16cc089bd3ec90681342d7 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:30:43 -0700
Subject: [PATCH 0130/1208] i965/fs: Remove fs_inst constructors that don't
 take an explicit exec_size

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp       | 30 ++--------------------
 src/mesa/drivers/dri/i965/brw_fs_builder.h |  2 +-
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   |  6 +++--
 src/mesa/drivers/dri/i965/brw_ir_fs.h      |  9 +------
 4 files changed, 8 insertions(+), 39 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 464c1f673cd..8d99abc6dbb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -126,9 +126,9 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
    init(opcode, exec_size, reg_undef, NULL, 0);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 {
-   init(opcode, 0, dst, NULL, 0);
+   init(opcode, exec_size, dst, NULL, 0);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
@@ -138,12 +138,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 1);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
-   const fs_reg src[1] = { src0 };
-   init(opcode, 0, dst, src, 1);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1)
 {
@@ -151,13 +145,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 2);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1)
-{
-   const fs_reg src[2] = { src0, src1 };
-   init(opcode, 0, dst, src, 2);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 {
@@ -165,19 +152,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 3);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1, const fs_reg &src2)
-{
-   const fs_reg src[3] = { src0, src1, src2 };
-   init(opcode, 0, dst, src, 3);
-}
-
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
-                 const fs_reg src[], unsigned sources)
-{
-   init(opcode, 0, dst, src, sources);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                  const fs_reg src[], unsigned sources)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 58ac5980da5..c823190efbd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -235,7 +235,7 @@ namespace brw {
       instruction *
       emit(enum opcode opcode, const dst_reg &dst) const
       {
-         return emit(instruction(opcode, dst));
+         return emit(instruction(opcode, dst.width, dst));
       }
 
       /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index f52f344ccff..caf1300d71b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -109,7 +109,8 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          if (var->data.location == VARYING_SLOT_POS) {
             reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                                 var->data.origin_upper_left);
-            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, input, reg), 0xF);
+            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
+                                      input, reg), 0xF);
          } else {
             emit_general_interpolation(input, var->name, var->type,
                                        (glsl_interp_qualifier) var->data.interpolation,
@@ -1762,7 +1763,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
    unsigned num_components = nir_tex_instr_dest_size(instr);
-   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, dest, this->result),
+   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
+                             dest, this->result),
                 (1 << num_components) - 1);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 16b20beb788..d6b617ab2bd 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -189,20 +189,13 @@ public:
 
    fs_inst();
    fs_inst(enum opcode opcode, uint8_t exec_size);
-   fs_inst(enum opcode opcode, const fs_reg &dst);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0, const fs_reg &src1);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-           const fs_reg &src1);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-           const fs_reg &src1, const fs_reg &src2);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg src[],
-           unsigned sources);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg src[], unsigned sources);
    fs_inst(const fs_inst &that);

From 500525e96019aff551afa8fee841d00ca9ec4c4f Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:50:09 -0700
Subject: [PATCH 0131/1208] i965/fs: Use exec_size for determining regs
 read/written and partial writes

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8d99abc6dbb..c11e3f3e17c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -101,7 +101,7 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case MRF:
    case ATTR:
       this->regs_written =
-         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
+         DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -675,7 +675,7 @@ bool
 fs_inst::is_partial_write() const
 {
    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-           (this->dst.width * type_sz(this->dst.type)) < 32 ||
+           (this->exec_size * type_sz(this->dst.type)) < 32 ||
            !this->dst.is_contiguous());
 }
 
@@ -735,7 +735,7 @@ fs_inst::regs_read(int arg) const
       if (src[arg].stride == 0) {
          return 1;
       } else {
-         int size = components * src[arg].width * type_sz(src[arg].type);
+         int size = components * this->exec_size * type_sz(src[arg].type);
          return DIV_ROUND_UP(size * src[arg].stride, 32);
       }
    case MRF:

From b624ccc206cbf19989c6562416d7c21b66270577 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:51:51 -0700
Subject: [PATCH 0132/1208] i965/fs_builder: Use the dispatch width for setting
 exec sizes

Previously we used dst.width but the two *should* be the same.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index c823190efbd..8af16a0fd73 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -235,7 +235,7 @@ namespace brw {
       instruction *
       emit(enum opcode opcode, const dst_reg &dst) const
       {
-         return emit(instruction(opcode, dst.width, dst));
+         return emit(instruction(opcode, dispatch_width(), dst));
       }
 
       /**
@@ -253,11 +253,11 @@ namespace brw {
          case SHADER_OPCODE_SIN:
          case SHADER_OPCODE_COS:
             return fix_math_instruction(
-               emit(instruction(opcode, dst.width, dst,
+               emit(instruction(opcode, dispatch_width(), dst,
                                 fix_math_operand(src0))));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0));
+            return emit(instruction(opcode, dispatch_width(), dst, src0));
          }
       }
 
@@ -273,12 +273,12 @@ namespace brw {
          case SHADER_OPCODE_INT_QUOTIENT:
          case SHADER_OPCODE_INT_REMAINDER:
             return fix_math_instruction(
-               emit(instruction(opcode, dst.width, dst,
+               emit(instruction(opcode, dispatch_width(), dst,
                                 fix_math_operand(src0),
                                 fix_math_operand(src1))));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0, src1));
+            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
 
          }
       }
@@ -295,13 +295,14 @@ namespace brw {
          case BRW_OPCODE_BFI2:
          case BRW_OPCODE_MAD:
          case BRW_OPCODE_LRP:
-            return emit(instruction(opcode, dst.width, dst,
+            return emit(instruction(opcode, dispatch_width(), dst,
                                     fix_3src_operand(src0),
                                     fix_3src_operand(src1),
                                     fix_3src_operand(src2)));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
+            return emit(instruction(opcode, dispatch_width(), dst,
+                                    src0, src1, src2));
          }
       }
 
@@ -517,7 +518,8 @@ namespace brw {
       {
          assert(dst.width % 8 == 0);
          instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
-                                              dst.width, dst, src, sources));
+                                              dispatch_width(), dst,
+                                              src, sources));
          inst->header_size = header_size;
 
          for (unsigned i = 0; i < header_size; i++)
@@ -528,7 +530,7 @@ namespace brw {
          for (unsigned i = header_size; i < sources; ++i)
             assert(src[i].file != GRF ||
                    src[i].width == dst.width);
-         inst->regs_written += (sources - header_size) * (dst.width / 8);
+         inst->regs_written += (sources - header_size) * (dispatch_width() / 8);
 
          return inst;
       }

From c9676329dd6c69b2e0b12405c3b4078f7d216f2f Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:34:52 -0700
Subject: [PATCH 0133/1208] i965/fs: Remove exec_size guessing from
 fs_inst::init()

Now that all of the non-explicit constructors are gone, we don't need to
guess anymore.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c11e3f3e17c..d08af84f157 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -68,28 +68,6 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 
    assert(dst.file != IMM && dst.file != UNIFORM);
 
-   /* If exec_size == 0, try to guess it from the registers.  Since all
-    * manner of things may use hardware registers, we first try to guess
-    * based on GRF registers.  If this fails, we will go ahead and take the
-    * width from the destination register.
-    */
-   if (this->exec_size == 0) {
-      if (dst.file == GRF) {
-         this->exec_size = dst.width;
-      } else {
-         for (unsigned i = 0; i < sources; ++i) {
-            if (src[i].file != GRF && src[i].file != ATTR)
-               continue;
-
-            if (this->exec_size <= 1)
-               this->exec_size = src[i].width;
-            assert(src[i].width == 1 || src[i].width == this->exec_size);
-         }
-      }
-
-      if (this->exec_size == 0 && dst.file != BAD_FILE)
-         this->exec_size = dst.width;
-   }
    assert(this->exec_size != 0);
 
    this->conditional_mod = BRW_CONDITIONAL_NONE;

From 21803b7b3304f053a48e313951ffddf1d2cd0bd9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 13:41:38 -0700
Subject: [PATCH 0134/1208] i965/fs: Use the builder dispatch width instead of
 dst.width for pull constants

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d08af84f157..3589bb92009 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -188,7 +188,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 
    int scale = 1;
-   if (devinfo->gen == 4 && dst.width == 8) {
+   if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
        * u, v, r) as parameters, or we can just use the SIMD16 message
        * consisting of (header, u).  We choose the second, at the cost of a
@@ -204,9 +204,9 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
    assert(dst.width % 8 == 0);
-   int regs_written = 4 * (dst.width / 8) * scale;
+   int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
-                               dst.type, dst.width);
+                               dst.type, bld.dispatch_width());
    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
 
@@ -216,7 +216,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
       if (devinfo->gen == 4)
          inst->mlen = 3;
       else
-         inst->mlen = 1 + dispatch_width / 8;
+         inst->mlen = 1 + bld.dispatch_width() / 8;
    }
 
    bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));

From 9a0c883292cf48910a32634f7cc8b855e08c09d5 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 25 Jun 2015 11:00:01 -0700
Subject: [PATCH 0135/1208] i965/fs: Use the builder dispatch_width for
 computing register offsets

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index ece7e49ea17..88a50ae0913 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -72,7 +72,7 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
    case MRF:
    case ATTR:
       return byte_offset(reg,
-                         delta * MAX2(reg.width * reg.stride, 1) *
+                         delta * MAX2(bld.dispatch_width() * reg.stride, 1) *
                          type_sz(reg.type));
    case UNIFORM:
       reg.reg_offset += delta;

From 83458e7c53cfc1f344280da6eb9a3b4e2dfdbc00 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 13:49:22 -0700
Subject: [PATCH 0136/1208] i965/fs: Use exec_size instead of dst.width for
 computing component size

There are a variety of places where we use dst.width / 8 to compute the
size of a single logical channel.  Instead, we should be using exec_size.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp                    | 6 +++---
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp                | 2 +-
 src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp  | 2 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp            | 2 +-
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 3589bb92009..be772ae547b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2280,12 +2280,12 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == GRF &&
-          alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
+          alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
-            remap[dst] = alloc.allocate(inst->dst.width / 8);
+            remap[dst] = alloc.allocate(inst->exec_size / 8);
             inst->dst.reg = remap[dst];
             progress = true;
          }
@@ -2416,7 +2416,7 @@ fs_visitor::compute_to_mrf()
             /* Things returning more than one register would need us to
              * understand coalescing out more than one MOV at a time.
              */
-            if (scan_inst->regs_written > scan_inst->dst.width / 8)
+            if (scan_inst->regs_written > scan_inst->exec_size / 8)
                break;
 
 	    /* SEND instructions can't have MRF as a destination. */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 29d1f2a6a57..29b46b96b8a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -179,7 +179,7 @@ static void
 create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
-   int dst_width = inst->dst.width / 8;
+   int dst_width = inst->exec_size / 8;
    const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf)
                               .exec_all(inst->force_writemask_all);
    fs_inst *copy;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 2ad7079bdf8..149c0f0e217 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -196,7 +196,7 @@ fs_visitor::register_coalesce()
             continue;
          }
          reg_to_offset[offset] = inst->dst.reg_offset;
-         if (inst->src[0].width == 16)
+         if (inst->exec_size == 16)
             reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
          channels_remaining -= inst->regs_written;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 9b91f47d264..d5ff1be1414 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -913,7 +913,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
-      int components = inst->regs_written / (dst.width / 8);
+      int components = inst->regs_written / (inst->exec_size / 8);
       for (int i = 0; i < components; i++) {
          if (i == 2) {
             fixed_payload[i] = fixed_depth;
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index ee0add5d765..b49961fff68 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1314,8 +1314,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
                 * single-result send is probably actually reducing register
                 * pressure.
                 */
-               if (inst->regs_written <= inst->dst.width / 8 &&
-                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
+               if (inst->regs_written <= inst->exec_size / 8 &&
+                   chosen_inst->regs_written > chosen_inst->exec_size / 8) {
                   chosen = n;
                   continue;
                } else if (inst->regs_written > chosen_inst->regs_written) {

From 7f77abc9edf1348b8c6b82dfff102896cd4a2a58 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 13:57:37 -0700
Subject: [PATCH 0137/1208] i965/fs_generator: Use inst->exec_size for
 determining hardware reg widths

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 8d821abbac2..0a70bdc3c76 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -48,7 +48,7 @@ static uint32_t brw_file_from_reg(fs_reg *reg)
 }
 
 static struct brw_reg
-brw_reg_from_fs_reg(fs_reg *reg)
+brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
 {
    struct brw_reg brw_reg;
 
@@ -57,10 +57,10 @@ brw_reg_from_fs_reg(fs_reg *reg)
    case MRF:
       if (reg->stride == 0) {
          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
-      } else if (reg->width < 8) {
+      } else if (inst->exec_size < 8) {
          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
-         brw_reg = stride(brw_reg, reg->width * reg->stride,
-                          reg->width, reg->stride);
+         brw_reg = stride(brw_reg, inst->exec_size * reg->stride,
+                          inst->exec_size, reg->stride);
       } else {
          /* From the Haswell PRM:
           *
@@ -414,7 +414,7 @@ fs_generator::generate_blorp_fb_write(fs_inst *inst)
    brw_fb_WRITE(p,
                 16 /* dispatch_width */,
                 brw_message_reg(inst->base_mrf),
-                brw_reg_from_fs_reg(&inst->src[0]),
+                brw_reg_from_fs_reg(inst, &inst->src[0]),
                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
                 inst->target,
                 inst->mlen,
@@ -1560,7 +1560,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
 
       for (unsigned int i = 0; i < inst->sources; i++) {
-	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
+	 src[i] = brw_reg_from_fs_reg(inst, &inst->src[i]);
 
 	 /* The accumulator result appears to get used for the
 	  * conditional modifier generation.  When negating a UD
@@ -1572,7 +1572,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
 		!inst->src[i].negate);
       }
-      dst = brw_reg_from_fs_reg(&inst->dst);
+      dst = brw_reg_from_fs_reg(inst, &inst->dst);
 
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);

From 830f67046ace3c0b95a7f093fe373eeb417a1aad Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 18 Jun 2015 12:44:35 -0700
Subject: [PATCH 0138/1208] i965/fs: Remove the width field from fs_reg

As of now, the width field is no longer used for anything.  The width field
"seemed like a good idea at the time" but is actually entirely redundant
with the instruction's execution size.  Initially, it gave us the ability
to easily set the instructions execution size based entirely on register
widths.  With the builder, we can easiliy set the sizes explicitly and the
width field doesn't have as much purpose.  At this point, it's just
redundant information that can get out of sync so it really needs to go.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp          | 62 +++----------------
 src/mesa/drivers/dri/i965/brw_fs_builder.h    | 19 ++----
 .../dri/i965/brw_fs_copy_propagation.cpp      |  4 --
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp      |  6 +-
 .../drivers/dri/i965/brw_fs_reg_allocate.cpp  |  4 +-
 .../dri/i965/brw_fs_register_coalesce.cpp     |  1 -
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp  | 26 ++++----
 src/mesa/drivers/dri/i965/brw_ir_fs.h         | 15 +----
 8 files changed, 30 insertions(+), 107 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index be772ae547b..dd955197bca 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -203,10 +203,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    else
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
-   assert(dst.width % 8 == 0);
    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
-                               dst.type, bld.dispatch_width());
+   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
 
@@ -310,7 +308,6 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 
    for (int i = 0; i < this->sources; i++) {
       reg.type = this->src[i].type;
-      reg.width = this->src[i].width;
       if (!this->src[i].equals(reg))
          return false;
 
@@ -366,7 +363,6 @@ fs_reg::fs_reg(float f)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
    this->fixed_hw_reg.dw1.f = f;
-   this->width = 1;
 }
 
 /** Immediate value constructor. */
@@ -376,7 +372,6 @@ fs_reg::fs_reg(int32_t i)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
    this->fixed_hw_reg.dw1.d = i;
-   this->width = 1;
 }
 
 /** Immediate value constructor. */
@@ -386,7 +381,6 @@ fs_reg::fs_reg(uint32_t u)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
    this->fixed_hw_reg.dw1.ud = u;
-   this->width = 1;
 }
 
 /** Vector float immediate value constructor. */
@@ -417,7 +411,6 @@ fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
    this->file = HW_REG;
    this->fixed_hw_reg = fixed_hw_reg;
    this->type = fixed_hw_reg.type;
-   this->width = 1 << fixed_hw_reg.width;
 }
 
 bool
@@ -432,7 +425,6 @@ fs_reg::equals(const fs_reg &r) const
            abs == r.abs &&
            !reladdr && !r.reladdr &&
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
-           width == r.width &&
            stride == r.stride);
 }
 
@@ -504,7 +496,7 @@ fs_visitor::get_timestamp(const fs_builder &bld)
                                           0),
                              BRW_REGISTER_TYPE_UD));
 
-   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
+   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
@@ -554,7 +546,7 @@ fs_visitor::emit_shader_time_end()
 
    fs_reg start = shader_start_time;
    start.negate = true;
-   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
+   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    diff.set_smear(0);
 
    const fs_builder cbld = ibld.group(1, 0);
@@ -809,7 +801,7 @@ fs_visitor::vgrf(const glsl_type *const type)
 {
    int reg_width = dispatch_width / 8;
    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
-                 brw_type_for_base_type(type), dispatch_width);
+                 brw_type_for_base_type(type));
 }
 
 /** Fixed HW reg constructor. */
@@ -819,14 +811,6 @@ fs_reg::fs_reg(enum register_file file, int reg)
    this->file = file;
    this->reg = reg;
    this->type = BRW_REGISTER_TYPE_F;
-
-   switch (file) {
-   case UNIFORM:
-      this->width = 1;
-      break;
-   default:
-      this->width = 8;
-   }
 }
 
 /** Fixed HW reg constructor. */
@@ -836,25 +820,6 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
    this->file = file;
    this->reg = reg;
    this->type = type;
-
-   switch (file) {
-   case UNIFORM:
-      this->width = 1;
-      break;
-   default:
-      this->width = 8;
-   }
-}
-
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
-               uint8_t width)
-{
-   init();
-   this->file = file;
-   this->reg = reg;
-   this->type = type;
-   this->width = width;
 }
 
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
@@ -1863,7 +1828,6 @@ fs_visitor::demote_pull_constants()
          inst->src[i].file = GRF;
          inst->src[i].reg = dst.reg;
          inst->src[i].reg_offset = 0;
-         inst->src[i].width = dispatch_width;
       }
    }
    invalidate_live_intervals();
@@ -2951,20 +2915,17 @@ fs_visitor::lower_load_payload()
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      dst.width = 8;
       const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
 
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
-            mov_src.width = 8;
             hbld.MOV(mov_dst, mov_src);
          }
          dst = offset(dst, hbld, 1);
       }
 
-      dst.width = inst->exec_size;
       const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
                                  .exec_all(inst->force_writemask_all)
                                  .at(block, inst);
@@ -2998,7 +2959,6 @@ fs_visitor::lower_load_payload()
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
-                  mov_dst.width = 8;
                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
                   mov_dst.reg += 4;
                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
@@ -3070,7 +3030,7 @@ fs_visitor::lower_integer_multiplication()
           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
          if (devinfo->gen < 7) {
             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
-                       inst->dst.type, dispatch_width);
+                       inst->dst.type);
             ibld.MOV(imm, inst->src[1]);
             ibld.MUL(inst->dst, imm, inst->src[0]);
          } else {
@@ -3124,11 +3084,11 @@ fs_visitor::lower_integer_multiplication()
 
          if (inst->conditional_mod && inst->dst.is_null()) {
             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
-                               inst->dst.type, dispatch_width);
+                               inst->dst.type);
          }
          fs_reg low = inst->dst;
          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
-                     inst->dst.type, dispatch_width);
+                     inst->dst.type);
 
          if (devinfo->gen >= 7) {
             fs_reg src1_0_w = inst->src[1];
@@ -3282,9 +3242,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    switch (inst->dst.file) {
    case GRF:
       fprintf(file, "vgrf%d", inst->dst.reg);
-      if (inst->dst.width != dispatch_width)
-         fprintf(file, "@%d", inst->dst.width);
-      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
+      if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
           inst->dst.subreg_offset)
          fprintf(file, "+%d.%d",
                  inst->dst.reg_offset, inst->dst.subreg_offset);
@@ -3342,9 +3300,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       switch (inst->src[i].file) {
       case GRF:
          fprintf(file, "vgrf%d", inst->src[i].reg);
-         if (inst->src[i].width != dispatch_width)
-            fprintf(file, "@%d", inst->src[i].width);
-         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
+         if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
              inst->src[i].subreg_offset)
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 8af16a0fd73..2c36e071f54 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -163,7 +163,7 @@ namespace brw {
          return dst_reg(GRF, shader->alloc.allocate(
                            DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                         REG_SIZE)),
-                        type, dispatch_width());
+                        type);
       }
 
       /**
@@ -516,21 +516,12 @@ namespace brw {
       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                    unsigned sources, unsigned header_size) const
       {
-         assert(dst.width % 8 == 0);
          instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
                                               dispatch_width(), dst,
                                               src, sources));
          inst->header_size = header_size;
-
-         for (unsigned i = 0; i < header_size; i++)
-            assert(src[i].file != GRF ||
-                   src[i].width * type_sz(src[i].type) == 32);
-         inst->regs_written = header_size;
-
-         for (unsigned i = header_size; i < sources; ++i)
-            assert(src[i].file != GRF ||
-                   src[i].width == dst.width);
-         inst->regs_written += (sources - header_size) * (dispatch_width() / 8);
+         inst->regs_written = header_size +
+                              (sources - header_size) * (dispatch_width() / 8);
 
          return inst;
       }
@@ -628,8 +619,8 @@ namespace brw {
                inst->resize_sources(1);
                inst->src[0] = src0;
 
-               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
-                                          dispatch_width()), src1);
+               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
+                                   src1);
             }
          }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index c92aae4b1d6..54e9114fe6b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -388,17 +388,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 
    switch (entry->src.file) {
    case UNIFORM:
-      assert(entry->src.width == 1);
    case BAD_FILE:
    case HW_REG:
-      inst->src[arg].width = entry->src.width;
       inst->src[arg].reg_offset = entry->src.reg_offset;
       inst->src[arg].subreg_offset = entry->src.subreg_offset;
       break;
    case ATTR:
    case GRF:
       {
-         assert(entry->src.width % inst->src[arg].width == 0);
          /* In this case, we'll just leave the width alone.  The source
           * register could have different widths depending on how it is
           * being used.  For instance, if only half of the register was
@@ -715,7 +712,6 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
                entry->dst.reg_offset = offset;
-               entry->dst.width = effective_width;
                entry->src = inst->src[i];
                entry->regs_written = regs_written;
                entry->opcode = inst->opcode;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 29b46b96b8a..291feb3ec0c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -200,7 +200,6 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
       payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
       for (int i = 0; i < header_size; i++) {
          payload[i] = src;
-         payload[i].width = 8;
          src.reg_offset++;
       }
       for (int i = header_size; i < sources; i++) {
@@ -260,11 +259,9 @@ fs_visitor::opt_cse_local(bblock_t *block)
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
                int written = entry->generator->regs_written;
-               assert((written * 8) % entry->generator->dst.width == 0);
 
                entry->tmp = fs_reg(GRF, alloc.allocate(written),
-                                   entry->generator->dst.type,
-                                   entry->generator->dst.width);
+                                   entry->generator->dst.type);
 
                create_copy_instr(bld.at(block, entry->generator->next),
                                  entry->generator, entry->tmp, false);
@@ -275,7 +272,6 @@ fs_visitor::opt_cse_local(bblock_t *block)
             /* dest <- temp */
             if (!inst->dst.is_null()) {
                assert(inst->regs_written == entry->generator->regs_written);
-               assert(inst->dst.width == entry->generator->dst.width);
                assert(inst->dst.type == entry->tmp.type);
 
                create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 364fc4a5ad2..8e5621d3cb5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -706,10 +706,8 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                          uint32_t spill_offset, int count)
 {
    int reg_size = 1;
-   if (dispatch_width == 16 && count % 2 == 0) {
+   if (dispatch_width == 16 && count % 2 == 0)
       reg_size = 2;
-      dst.width = 16;
-   }
 
    const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
                               .group(reg_size * 8, 0)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 149c0f0e217..a253c81c845 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -167,7 +167,6 @@ fs_visitor::register_coalesce()
          src_size = alloc.sizes[inst->src[0].reg];
          assert(src_size <= MAX_VGRF_SIZE);
 
-         assert(inst->src[0].width % 8 == 0);
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d5ff1be1414..79ebb2d3019 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -246,7 +246,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
                                      fs_reg shadow_c, fs_reg lod,
                                      uint32_t sampler)
 {
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
    bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf || op == ir_txs;
 
    if (has_lod && shadow_c.file != BAD_FILE)
@@ -323,7 +323,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    int reg_width = dispatch_width / 8;
    unsigned header_size = 0;
 
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
    fs_reg msg_coords = message;
 
    if (has_offset) {
@@ -646,7 +646,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
       mlen = length * reg_width;
 
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_F, dispatch_width);
+                               BRW_REGISTER_TYPE_F);
    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 
    /* Generate the SEND */
@@ -800,7 +800,7 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
 {
    int reg_width = dispatch_width / 8;
    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
-                           BRW_REGISTER_TYPE_F, dispatch_width);
+                           BRW_REGISTER_TYPE_F);
    fs_reg dest = vgrf(glsl_type::uvec4_type);
    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
 
@@ -1170,7 +1170,7 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
 
    int mlen = 1 + (length - 1) * reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD, dispatch_width);
+                               BRW_REGISTER_TYPE_UD);
    bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
 
    /* Emit the instruction. */
@@ -1218,7 +1218,7 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
 
    int mlen = 1 + reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD, dispatch_width);
+                               BRW_REGISTER_TYPE_UD);
    fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
 
    /* Emit the instruction. */
@@ -1236,8 +1236,8 @@ fs_visitor::emit_dummy_fs()
    /* Everyone's favorite color. */
    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
    for (int i = 0; i < 4; i++) {
-      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
-                     dispatch_width), fs_reg(color[i]));
+      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
+              fs_reg(color[i]));
    }
 
    fs_inst *write;
@@ -1357,7 +1357,7 @@ fs_visitor::emit_interpolation_setup_gen6()
        * compute our pixel centers.
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
-                          BRW_REGISTER_TYPE_UW, dispatch_width * 2);
+                          BRW_REGISTER_TYPE_UW);
       fs_inst *add =
          new (mem_ctx) fs_inst(BRW_OPCODE_ADD, dispatch_width * 2,
                                int_pixel_xy,
@@ -1544,7 +1544,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
        * it's unsinged single words, one vgrf is always 16-wide.
        */
       sources[length] = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UW, 16);
+                               BRW_REGISTER_TYPE_UW);
       bld.exec_all().annotate("FB write oMask")
          .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
       length++;
@@ -1613,7 +1613,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    fs_inst *write;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
-      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
+      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
       load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
       payload.reg = alloc.allocate(load->regs_written);
       load->dst = payload;
@@ -1621,7 +1621,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
       write->base_mrf = -1;
    } else {
       /* Send from the MRF */
-      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
+      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                                sources, length, payload_header_size);
 
       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
@@ -1928,7 +1928,7 @@ fs_visitor::emit_urb_writes()
       if (flush) {
          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
-                                 BRW_REGISTER_TYPE_F, dispatch_width);
+                                 BRW_REGISTER_TYPE_F);
          payload_sources[0] =
             fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index d6b617ab2bd..b48244ae4ca 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -44,7 +44,6 @@ public:
    fs_reg(struct brw_reg fixed_hw_reg);
    fs_reg(enum register_file file, int reg);
    fs_reg(enum register_file file, int reg, enum brw_reg_type type);
-   fs_reg(enum register_file file, int reg, enum brw_reg_type type, uint8_t width);
 
    bool equals(const fs_reg &r) const;
    bool is_contiguous() const;
@@ -60,14 +59,6 @@ public:
 
    fs_reg *reladdr;
 
-   /**
-    * The register width.  This indicates how many hardware values are
-    * represented by each virtual value.  Valid values are 1, 8, or 16.
-    * For immediate values, this is 1.  Most of the rest of the time, it
-    * will be equal to the dispatch width.
-    */
-   uint8_t width;
-
    /** Register region horizontal stride */
    uint8_t stride;
 };
@@ -132,9 +123,7 @@ static inline fs_reg
 component(fs_reg reg, unsigned idx)
 {
    assert(reg.subreg_offset == 0);
-   assert(idx < reg.width);
    reg.subreg_offset = idx * type_sz(reg.type);
-   reg.width = 1;
    reg.stride = 0;
    return reg;
 }
@@ -142,7 +131,7 @@ component(fs_reg reg, unsigned idx)
 static inline bool
 is_uniform(const fs_reg &reg)
 {
-   return (reg.width == 1 || reg.stride == 0 || reg.is_null()) &&
+   return (reg.stride == 0 || reg.is_null()) &&
           (!reg.reladdr || is_uniform(*reg.reladdr));
 }
 
@@ -164,8 +153,6 @@ half(fs_reg reg, unsigned idx)
 
    case GRF:
    case MRF:
-      assert(reg.width == 16);
-      reg.width = 8;
       return horiz_offset(reg, 8 * idx);
 
    case ATTR:

From ebe3043eeacb073c7dbb6162d8f0aee3bc66eeb1 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 30 Jun 2015 17:47:53 -0700
Subject: [PATCH 0139/1208] i965/fs: Fix PIXEL_X/Y in regs_read()

PIXEL_X/Y takes a vec2 in the first argument
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index dd955197bca..a19ea664e3f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -681,7 +681,7 @@ fs_inst::regs_read(int arg) const
    case FS_OPCODE_PIXEL_X:
    case FS_OPCODE_PIXEL_Y:
       if (arg == 0)
-         components = 1;
+         components = 2;
       break;
 
    case SHADER_OPCODE_LOAD_PAYLOAD:

From 5afed936fea56a60300c6ed1228eaccf60c8cbd6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Tue, 23 Jun 2015 07:47:58 +1000
Subject: [PATCH 0140/1208] glsl: use consistent version string format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/glsl_parser_extras.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 9a0c24e6787..02ddbbd14be 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -129,7 +129,7 @@ struct _mesa_glsl_parse_state {
    bool check_explicit_attrib_stream_allowed(YYLTYPE *locp)
    {
       if (!this->has_explicit_attrib_stream()) {
-         const char *const requirement = "GL_ARB_gpu_shader5 extension or GLSL 400";
+         const char *const requirement = "GL_ARB_gpu_shader5 extension or GLSL 4.00";
 
          _mesa_glsl_error(locp, this, "explicit stream requires %s",
                           requirement);
@@ -144,8 +144,8 @@ struct _mesa_glsl_parse_state {
    {
       if (!this->has_explicit_attrib_location()) {
          const char *const requirement = this->es_shader
-            ? "GLSL ES 300"
-            : "GL_ARB_explicit_attrib_location extension or GLSL 330";
+            ? "GLSL ES 3.00"
+            : "GL_ARB_explicit_attrib_location extension or GLSL 3.30";
 
          _mesa_glsl_error(locp, this, "%s explicit location requires %s",
                           mode_string(var), requirement);
@@ -160,8 +160,8 @@ struct _mesa_glsl_parse_state {
    {
       if (!this->has_separate_shader_objects()) {
          const char *const requirement = this->es_shader
-            ? "GL_EXT_separate_shader_objects extension or GLSL ES 310"
-            : "GL_ARB_separate_shader_objects extension or GLSL 420";
+            ? "GL_EXT_separate_shader_objects extension or GLSL ES 3.10"
+            : "GL_ARB_separate_shader_objects extension or GLSL 4.20";
 
          _mesa_glsl_error(locp, this, "%s explicit location requires %s",
                           mode_string(var), requirement);
@@ -177,9 +177,9 @@ struct _mesa_glsl_parse_state {
       if (!this->has_explicit_attrib_location() ||
           !this->has_explicit_uniform_location()) {
          const char *const requirement = this->es_shader
-            ? "GLSL ES 310"
+            ? "GLSL ES 3.10"
             : "GL_ARB_explicit_uniform_location and either "
-              "GL_ARB_explicit_attrib_location or GLSL 330.";
+              "GL_ARB_explicit_attrib_location or GLSL 3.30.";
 
          _mesa_glsl_error(locp, this,
                           "uniform explicit location requires %s",

From 1de93f94991c41081c3d9e01c2097401970f4095 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Tue, 23 Jun 2015 07:53:24 +1000
Subject: [PATCH 0141/1208] freedreno: use consistent version string format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/gallium/drivers/freedreno/freedreno_screen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b3b5462b437..7330cd9c788 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -68,7 +68,7 @@ static const struct debug_named_value debug_options[] = {
 		{"fraghalf",  FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
 		{"nobin",     FD_DBG_NOBIN,  "Disable hw binning"},
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
-		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"},
+		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"},
 		DEBUG_NAMED_VALUE_END
 };
 

From 5ccd61217d873567b8d9a7a0fa8f678522ec78cb Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@gmail.com>
Date: Sat, 27 Jun 2015 14:21:27 +1000
Subject: [PATCH 0142/1208] tgsi: add infer support for double opcodes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_info.c | 37 ++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 929531109e5..4b16ef387f3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -374,7 +374,33 @@ tgsi_opcode_infer_type( uint opcode )
    case TGSI_OPCODE_IMUL_HI:
    case TGSI_OPCODE_IBFE:
    case TGSI_OPCODE_IMSB:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSNE:
       return TGSI_TYPE_SIGNED;
+   case TGSI_OPCODE_DADD:
+   case TGSI_OPCODE_DABS:
+   case TGSI_OPCODE_DNEG:
+   case TGSI_OPCODE_DMUL:
+   case TGSI_OPCODE_DMAX:
+   case TGSI_OPCODE_DMIN:
+   case TGSI_OPCODE_DRCP:
+   case TGSI_OPCODE_DSQRT:
+   case TGSI_OPCODE_DMAD:
+   case TGSI_OPCODE_DLDEXP:
+   case TGSI_OPCODE_DFRACEXP:
+   case TGSI_OPCODE_DFRAC:
+   case TGSI_OPCODE_DRSQ:
+   case TGSI_OPCODE_DTRUNC:
+   case TGSI_OPCODE_DCEIL:
+   case TGSI_OPCODE_DFLR:
+   case TGSI_OPCODE_DROUND:
+   case TGSI_OPCODE_DSSG:
+   case TGSI_OPCODE_F2D:
+   case TGSI_OPCODE_I2D:
+   case TGSI_OPCODE_U2D:
+      return TGSI_TYPE_DOUBLE;
    default:
       return TGSI_TYPE_FLOAT;
    }
@@ -391,6 +417,7 @@ tgsi_opcode_infer_src_type( uint opcode )
    case TGSI_OPCODE_TXF:
    case TGSI_OPCODE_BREAKC:
    case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_U2D:
    case TGSI_OPCODE_UADD:
    case TGSI_OPCODE_SWITCH:
    case TGSI_OPCODE_CASE:
@@ -400,10 +427,12 @@ tgsi_opcode_infer_src_type( uint opcode )
       return TGSI_TYPE_UNSIGNED;
    case TGSI_OPCODE_IMUL_HI:
    case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_I2D:
       return TGSI_TYPE_SIGNED;
    case TGSI_OPCODE_ARL:
    case TGSI_OPCODE_ARR:
    case TGSI_OPCODE_TXQ_LZ:
+   case TGSI_OPCODE_F2D:
    case TGSI_OPCODE_F2I:
    case TGSI_OPCODE_F2U:
    case TGSI_OPCODE_FSEQ:
@@ -412,6 +441,14 @@ tgsi_opcode_infer_src_type( uint opcode )
    case TGSI_OPCODE_FSNE:
    case TGSI_OPCODE_UCMP:
       return TGSI_TYPE_FLOAT;
+   case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_D2U:
+   case TGSI_OPCODE_D2I:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSNE:
+      return TGSI_TYPE_DOUBLE;
    default:
       return tgsi_opcode_infer_type(opcode);
    }

From e35c5717837d9ac6d9722b011852bdf187f29776 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@gmail.com>
Date: Sat, 27 Jun 2015 14:21:54 +1000
Subject: [PATCH 0143/1208] gallivm: add fp64 support. (v2.1)

This adds support for ARB_gpu_shader_fp64 and ARB_vertex_attrib_64bit to
llvmpipe.

Two things that don't mix well are SoA and doubles, see
emit_fetch_double, and emit_store_double_chan in this.

I've also had to split emit_data.chan, to add src_chan,
which can be different for doubles.

It handles indirect double fetches from temps, inputs, constants
and immediates. It doesn't handle double stores to indirects,
however it appears the mesa/st doesn't currently emit these,
it always does UARL/MOV combos, which will work fine.

tested with piglit, no regressions, all the fp64 tests seem to pass.

v2:
switch to using shuffles for fetch/store (Roland)
assert on indirect double stores - mesa/st never emits these (it uses MOV)
fix indirect temp/input/constant/immediates (Roland)
typos/formatting fixes (Roland)

v2.1:
cleanup some long lines, emit_store_double_chan cleanups.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c   |  12 +
 src/gallium/auxiliary/gallivm/lp_bld_limits.h |   1 +
 src/gallium/auxiliary/gallivm/lp_bld_logic.c  |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c   |  47 ++-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h   |   4 +
 .../auxiliary/gallivm/lp_bld_tgsi_action.c    | 246 ++++++++++++++++
 .../auxiliary/gallivm/lp_bld_tgsi_action.h    |   5 +
 .../auxiliary/gallivm/lp_bld_tgsi_soa.c       | 267 ++++++++++++++++--
 8 files changed, 553 insertions(+), 31 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9daa93eec3e..8fba43f85d7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1997,6 +1997,12 @@ lp_build_floor(struct lp_build_context *bld,
       LLVMTypeRef int_vec_type = bld->int_vec_type;
       LLVMTypeRef vec_type = bld->vec_type;
 
+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
       assert(type.width == 32); /* might want to handle doubles at some point */
 
       inttype = type;
@@ -2066,6 +2072,12 @@ lp_build_ceil(struct lp_build_context *bld,
       LLVMTypeRef int_vec_type = bld->int_vec_type;
       LLVMTypeRef vec_type = bld->vec_type;
 
+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
       assert(type.width == 32); /* might want to handle doubles at some point */
 
       inttype = type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 2851fd10b04..3db7261e255 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -132,6 +132,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 1;
    case PIPE_SHADER_CAP_DOUBLES:
+      return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 80b53e5c3f8..f724cfa46ea 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
                      boolean ordered)
 {
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, lp_type_int_vec(32, 32 * type.length));
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
    LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
    LLVMValueRef cond;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index e391d8a4301..1887956d1c4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -175,13 +175,52 @@ void lp_build_fetch_args(
    unsigned src;
    for (src = 0; src < emit_data->info->num_src; src++) {
       emit_data->args[src] = lp_build_emit_fetch(bld_base, emit_data->inst, src,
-                                               emit_data->chan);
+                                                 emit_data->src_chan);
    }
    emit_data->arg_count = emit_data->info->num_src;
    lp_build_action_set_dst_type(emit_data, bld_base,
 		emit_data->inst->Instruction.Opcode);
 }
 
+/**
+ * with doubles src and dst channels aren't 1:1.
+ * check the src/dst types for the opcode,
+ * 1. if neither is double then src == dst;
+ * 2. if dest is double
+ *     - don't store to y or w
+ *     - if src is double then src == dst.
+ *     - else for f2d, d.xy = s.x
+ *     - else for f2d, d.zw = s.y
+ * 3. if dst is single, src is double
+ *    - map dst x,z to src xy;
+ *    - map dst y,w to src zw;
+ */
+static int get_src_chan_idx(unsigned opcode,
+                            int dst_chan_index)
+{
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(opcode);
+   enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(opcode);
+
+   if (dtype != TGSI_TYPE_DOUBLE && stype != TGSI_TYPE_DOUBLE)
+      return dst_chan_index;
+   if (dtype == TGSI_TYPE_DOUBLE) {
+      if (dst_chan_index == 1 || dst_chan_index == 3)
+         return -1;
+      if (stype == TGSI_TYPE_DOUBLE)
+         return dst_chan_index;
+      if (dst_chan_index == 0)
+         return 0;
+      if (dst_chan_index == 2)
+         return 1;
+   } else {
+      if (dst_chan_index == 0 || dst_chan_index == 2)
+         return 0;
+      if (dst_chan_index == 1 || dst_chan_index == 3)
+         return 2;
+   }
+   return -1;
+}
+
 /* XXX: COMMENT
  * It should be assumed that this function ignores writemasks
  */
@@ -197,7 +236,6 @@ lp_build_tgsi_inst_llvm(
    struct lp_build_emit_data emit_data;
    unsigned chan_index;
    LLVMValueRef val;
-
    bld_base->pc++;
 
    if (bld_base->emit_debug) {
@@ -240,7 +278,12 @@ lp_build_tgsi_inst_llvm(
    /* Emit the instructions */
    if (info->output_mode == TGSI_OUTPUT_COMPONENTWISE && bld_base->soa) {
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
+         int src_index = get_src_chan_idx(inst->Instruction.Opcode, chan_index);
+         /* ignore channels 1/3 in double dst */
+         if (src_index == -1)
+            continue;
          emit_data.chan = chan_index;
+         emit_data.src_chan = src_index;
          if (!action->fetch_args) {
             lp_build_fetch_args(bld_base, &emit_data);
          } else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 967373ccdae..5809c5a2535 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -338,6 +338,7 @@ struct lp_build_tgsi_context
    struct lp_build_context uint_bld;
    struct lp_build_context int_bld;
 
+   struct lp_build_context dbl_bld;
    /** This array stores functions that are used to transform TGSI opcodes to
      * LLVM instructions.
      */
@@ -349,6 +350,9 @@ struct lp_build_tgsi_context
 
    struct lp_build_tgsi_action sqrt_action;
 
+   struct lp_build_tgsi_action drsq_action;
+
+   struct lp_build_tgsi_action dsqrt_action;
    const struct tgsi_shader_info *info;
 
    lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT];
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 9cb42b237b7..1f2af85c92b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -894,6 +894,125 @@ const struct lp_build_tgsi_action xpd_action = {
    xpd_emit	 /* emit */
 };
 
+/* TGSI_OPCODE_D2F */
+static void
+d2f_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPTrunc(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                       bld_base->base.vec_type, "");
+}
+
+/* TGSI_OPCODE_D2I */
+static void
+d2i_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToSI(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                      bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_D2U */
+static void
+d2u_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToUI(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                      bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_F2D */
+static void
+f2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPExt(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                      bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_U2D */
+static void
+u2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildUIToFP(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                      bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_I2D */
+static void
+i2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      LLVMBuildSIToFP(bld_base->base.gallivm->builder,
+                      emit_data->args[0],
+                      bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_DMAD */
+static void
+dmad_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef tmp;
+   tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DMUL,
+                                   emit_data->args[0],
+                                   emit_data->args[1]);
+   emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
+                                       TGSI_OPCODE_DADD, tmp, emit_data->args[2]);
+}
+
+/*.TGSI_OPCODE_DRCP.*/
+static void drcp_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef one;
+   one = lp_build_const_vec(bld_base->dbl_bld.gallivm, bld_base->dbl_bld.type, 1.0f);
+   emit_data->output[emit_data->chan] = LLVMBuildFDiv(
+      bld_base->base.gallivm->builder,
+      one, emit_data->args[0], "");
+}
+
+/* TGSI_OPCODE_DFRAC */
+static void dfrac_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef tmp;
+   tmp = lp_build_floor(&bld_base->dbl_bld,
+			emit_data->args[0]);
+   emit_data->output[emit_data->chan] =  LLVMBuildFSub(bld_base->base.gallivm->builder,
+                                                       emit_data->args[0], tmp, "");
+}
+
 void
 lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 {
@@ -948,6 +1067,25 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 
    bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
    bld_base->op_actions[TGSI_OPCODE_MIN].emit = fmin_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DADD].emit = add_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2U].emit = d2u_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_F2D].emit = f2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_I2D].emit = i2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_U2D].emit = u2d_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DMAD].emit = dmad_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
+   bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = dfrac_emit;
+
 }
 
 /* CPU Only default actions */
@@ -1792,6 +1930,104 @@ xor_emit_cpu(
                                                      emit_data->args[1]);
 }
 
+/* TGSI_OPCODE_DABS (CPU Only) */
+static void
+dabs_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->dbl_bld,
+                                                       emit_data->args[0]);
+}
+
+/* TGSI_OPCODE_DNEG (CPU Only) */
+static void
+dneg_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->dbl_bld,
+                                                     bld_base->dbl_bld.zero,
+                                                     emit_data->args[0]);
+}
+
+/* TGSI_OPCODE_DSET Helper (CPU Only) */
+static void
+dset_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data,
+   unsigned pipe_func)
+{
+   LLVMValueRef cond = lp_build_cmp(&bld_base->dbl_bld, pipe_func,
+                                    emit_data->args[0], emit_data->args[1]);
+   emit_data->output[emit_data->chan] = cond;
+}
+
+/* TGSI_OPCODE_DSEQ (CPU Only) */
+static void
+dseq_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_EQUAL);
+}
+
+/* TGSI_OPCODE_DSGE (CPU Only) */
+static void
+dsge_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_GEQUAL);
+}
+
+/* TGSI_OPCODE_DSLT (CPU Only) */
+static void
+dslt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_LESS);
+}
+
+/* TGSI_OPCODE_DSNE (CPU Only) */
+static void
+dsne_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_NOTEQUAL);
+}
+
+/* Double Reciprocal squareroot (CPU Only) */
+static void
+drecip_sqrt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_rsqrt(&bld_base->dbl_bld,
+                                                         emit_data->args[0]);
+}
+
+/* Double Squareroot (CPU Only) */
+static void
+dsqrt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] = lp_build_sqrt(&bld_base->dbl_bld,
+                                                      emit_data->args[0]);
+}
+
 void
 lp_set_default_actions_cpu(
    struct lp_build_tgsi_context * bld_base)
@@ -1864,4 +2100,14 @@ lp_set_default_actions_cpu(
 
    bld_base->op_actions[TGSI_OPCODE_XOR].emit = xor_emit_cpu;
 
+   bld_base->op_actions[TGSI_OPCODE_DABS].emit = dabs_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DNEG].emit = dneg_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = dseq_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSGE].emit = dsge_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;
+
+   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;
+
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
index fc7fdbdd231..463d44eb450 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
@@ -71,6 +71,11 @@ struct lp_build_emit_data {
     */
    unsigned chan;
 
+   /**
+    * This is used to specify the src channel to read from for doubles.
+    */
+   unsigned src_chan;
+
    /** The lp_build_tgsi_action::emit 'executes' the opcode and writes the
     * results to this array.
     */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 268379e7d13..faa546df11b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -947,15 +947,20 @@ static LLVMValueRef
 build_gather(struct lp_build_tgsi_context *bld_base,
              LLVMValueRef base_ptr,
              LLVMValueRef indexes,
-             LLVMValueRef overflow_mask)
+             LLVMValueRef overflow_mask,
+             LLVMValueRef indexes2)
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    struct lp_build_context *bld = &bld_base->base;
-   LLVMValueRef res = bld->undef;
+   LLVMValueRef res;
    unsigned i;
 
+   if (indexes2)
+      res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+   else
+      res = bld->undef;
    /*
     * overflow_mask is a vector telling us which channels
     * in the vector overflowed. We use the overflow behavior for
@@ -976,26 +981,47 @@ build_gather(struct lp_build_tgsi_context *bld_base,
        * control flow.
        */
       indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes);
+      if (indexes2)
+         indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2);
    }
 
    /*
     * Loop over elements of index_vec, load scalar value, insert it into 'res'.
     */
-   for (i = 0; i < bld->type.length; i++) {
-      LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
-      LLVMValueRef index = LLVMBuildExtractElement(builder,
-                                                   indexes, ii, "");
+   for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) {
+      LLVMValueRef si, di;
+      LLVMValueRef index;
       LLVMValueRef scalar_ptr, scalar;
 
+      di = lp_build_const_int32(bld->gallivm, i);
+      if (indexes2)
+         si = lp_build_const_int32(bld->gallivm, i >> 1);
+      else
+         si = di;
+
+      if (indexes2 && (i & 1)) {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes2, si, "");
+      } else {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes, si, "");
+      }
       scalar_ptr = LLVMBuildGEP(builder, base_ptr,
                                 &index, 1, "gather_ptr");
       scalar = LLVMBuildLoad(builder, scalar_ptr, "");
 
-      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+      res = LLVMBuildInsertElement(builder, res, scalar, di, "");
    }
 
    if (overflow_mask) {
-      res = lp_build_select(bld, overflow_mask, bld->zero, res);
+      if (indexes2) {
+         res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+         overflow_mask = LLVMBuildSExt(builder, overflow_mask,
+                                       bld_base->dbl_bld.int_vec_type, "");
+         res = lp_build_select(&bld_base->dbl_bld, overflow_mask,
+                               bld_base->dbl_bld.zero, res);
+      } else
+         res = lp_build_select(bld, overflow_mask, bld->zero, res);
    }
 
    return res;
@@ -1139,8 +1165,10 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
    case TGSI_TYPE_SIGNED:
       bld_fetch = &bld_base->int_bld;
       break;
-   case TGSI_TYPE_VOID:
    case TGSI_TYPE_DOUBLE:
+      bld_fetch = &bld_base->dbl_bld;
+      break;
+   case TGSI_TYPE_VOID:
    default:
       assert(0);
       bld_fetch = NULL;
@@ -1216,6 +1244,7 @@ emit_fetch_constant(
          lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
       LLVMValueRef index_vec;  /* index into the const buffer */
       LLVMValueRef overflow_mask;
+      LLVMValueRef index_vec2 = NULL;
 
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
@@ -1235,22 +1264,33 @@ emit_fetch_constant(
       index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
       index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
 
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef swizzle_vec2;
+         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
+         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
+      }
       /* Gather values from the constant buffer */
-      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
+      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, index_vec2);
    }
    else {
       LLVMValueRef index;  /* index into the const buffer */
       LLVMValueRef scalar, scalar_ptr;
-
+      struct lp_build_context *bld_broad = &bld_base->base;
       index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
 
       scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
                                 &index, 1, "");
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
+         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
+         bld_broad = &bld_base->dbl_bld;
+      }
       scalar = LLVMBuildLoad(builder, scalar_ptr, "");
-      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
+      res = lp_build_broadcast_scalar(bld_broad, scalar);
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
@@ -1258,6 +1298,39 @@ emit_fetch_constant(
    return res;
 }
 
+/**
+ * Fetch double values from two separate channels.
+ * Doubles are stored split across two channels, like xy and zw.
+ * This function creates a set of 16 floats,
+ * extracts the values from the two channels,
+ * puts them in the correct place, then casts to 8 doubles.
+ */
+static LLVMValueRef
+emit_fetch_double(
+   struct lp_build_tgsi_context * bld_base,
+   enum tgsi_opcode_type stype,
+   LLVMValueRef input,
+   LLVMValueRef input2)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef res;
+   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+   int i;
+   LLVMValueRef shuffles[16];
+   int len = bld_base->base.type.length * 2;
+   assert(len <= 16);
+
+   for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
+      shuffles[i] = lp_build_const_int32(gallivm, i / 2);
+      shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
+   }
+   res = LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
+
+   return LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
+}
+
 static LLVMValueRef
 emit_fetch_immediate(
    struct lp_build_tgsi_context * bld_base,
@@ -1281,7 +1354,7 @@ emit_fetch_immediate(
       if (reg->Register.Indirect) {
          LLVMValueRef indirect_index;
          LLVMValueRef index_vec;  /* index into the immediate register array */
-
+         LLVMValueRef index_vec2 = NULL;
          indirect_index = get_indirect_index(bld,
                                              reg->Register.File,
                                              reg->Register.Index,
@@ -1296,25 +1369,46 @@ emit_fetch_immediate(
                                            indirect_index,
                                            swizzle,
                                            FALSE);
-
+         if (stype == TGSI_TYPE_DOUBLE)
+            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                              indirect_index,
+                                              swizzle + 1,
+                                              FALSE);
          /* Gather values from the immediate register array */
-         res = build_gather(bld_base, imms_array, index_vec, NULL);
+         res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
       } else {
          LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                         reg->Register.Index * 4 + swizzle);
          LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
                                                 bld->imms_array, &lindex, 1, "");
          res = LLVMBuildLoad(builder, imms_ptr, "");
+
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef imms_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            imms_ptr2 = LLVMBuildGEP(builder,
+                                      bld->imms_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
       }
    }
    else {
       res = bld->immediates[reg->Register.Index][swizzle];
+      if (stype == TGSI_TYPE_DOUBLE)
+         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
    }
 
    if (stype == TGSI_TYPE_UNSIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
    return res;
 }
@@ -1334,6 +1428,7 @@ emit_fetch_input(
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
       LLVMValueRef index_vec;  /* index into the input reg array */
+      LLVMValueRef index_vec2 = NULL;
       LLVMValueRef inputs_array;
       LLVMTypeRef fptr_type;
 
@@ -1346,23 +1441,43 @@ emit_fetch_input(
                                         indirect_index,
                                         swizzle,
                                         TRUE);
-
+      if (stype == TGSI_TYPE_DOUBLE) {
+         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle + 1,
+                                           TRUE);
+      }
       /* cast inputs_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
 
       /* Gather values from the input register array */
-      res = build_gather(bld_base, inputs_array, index_vec, NULL);
+      res = build_gather(bld_base, inputs_array, index_vec, NULL, index_vec2);
    } else {
       if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
          LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                         reg->Register.Index * 4 + swizzle);
-         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
-                                                bld->inputs_array, &lindex, 1, "");
+         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
+                                               bld->inputs_array, &lindex, 1, "");
+
          res = LLVMBuildLoad(builder, input_ptr, "");
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef input_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            input_ptr2 = LLVMBuildGEP(builder,
+                                      bld->inputs_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, input_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
       }
       else {
          res = bld->inputs[reg->Register.Index][swizzle];
+         if (stype == TGSI_TYPE_DOUBLE)
+            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
       }
    }
 
@@ -1372,6 +1487,8 @@ emit_fetch_input(
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
 
    return res;
@@ -1413,7 +1530,7 @@ emit_fetch_gs_input(
    } else {
       attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
    }
-   
+
    if (reg->Dimension.Indirect) {
       vertex_index = get_indirect_index(bld,
                                         reg->Register.File,
@@ -1436,6 +1553,8 @@ emit_fetch_gs_input(
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
 
    return res;
@@ -1455,7 +1574,7 @@ emit_fetch_temporary(
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
-      LLVMValueRef index_vec;  /* index into the temp reg array */
+      LLVMValueRef index_vec, index_vec2 = NULL;  /* index into the temp reg array */
       LLVMValueRef temps_array;
       LLVMTypeRef fptr_type;
 
@@ -1468,21 +1587,35 @@ emit_fetch_temporary(
                                         indirect_index,
                                         swizzle,
                                         TRUE);
+      if (stype == TGSI_TYPE_DOUBLE) {
+               index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                                  indirect_index,
+                                                  swizzle + 1,
+                                                  TRUE);
+      }
 
       /* cast temps_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
 
       /* Gather values from the temporary register array */
-      res = build_gather(bld_base, temps_array, index_vec, NULL);
+      res = build_gather(bld_base, temps_array, index_vec, NULL, index_vec2);
    }
    else {
       LLVMValueRef temp_ptr;
       temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
       res = LLVMBuildLoad(builder, temp_ptr, "");
+
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef temp_ptr2, res2;
+
+         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
+         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
+         res = emit_fetch_double(bld_base, stype, res, res2);
+      }
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
@@ -1648,6 +1781,50 @@ emit_fetch_predicate(
    }
 }
 
+/**
+ * store an array of 8 doubles into two arrays of 8 floats
+ * i.e.
+ * value is d0, d1, d2, d3 etc.
+ * each double has high and low pieces x, y
+ * so gets stored into the separate channels as:
+ * chan_ptr = d0.x, d1.x, d2.x, d3.x
+ * chan_ptr2 = d0.y, d1.y, d2.y, d3.y
+ */
+static void
+emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
+                       int dtype,
+                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
+                       LLVMValueRef pred,
+                       LLVMValueRef value)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *float_bld = &bld_base->base;
+   int i;
+   LLVMValueRef temp, temp2;
+   LLVMValueRef shuffles[8];
+   LLVMValueRef shuffles2[8];
+
+   for (i = 0; i < bld_base->base.type.length; i++) {
+      shuffles[i] = lp_build_const_int32(gallivm, i * 2);
+      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
+   }
+
+   temp = LLVMBuildShuffleVector(builder, value,
+                                 LLVMGetUndef(LLVMTypeOf(value)),
+                                 LLVMConstVector(shuffles,
+                                                 bld_base->base.type.length),
+                                 "");
+   temp2 = LLVMBuildShuffleVector(builder, value,
+                                  LLVMGetUndef(LLVMTypeOf(value)),
+                                  LLVMConstVector(shuffles2,
+                                                  bld_base->base.type.length),
+                                  "");
+
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+}
 
 /**
  * Register store.
@@ -1683,6 +1860,11 @@ emit_store_chan(
    }
 
    if (reg->Register.Indirect) {
+      /*
+       * Currently the mesa/st doesn't generate indirect stores
+       * to doubles, it normally uses MOV to do indirect stores.
+       */
+      assert(dtype != TGSI_TYPE_DOUBLE);
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
@@ -1721,13 +1903,23 @@ emit_store_chan(
       else {
          LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                   chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
+                                                      chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2,
+                                   pred, value);
+         } else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
       }
       break;
 
    case TGSI_FILE_TEMPORARY:
       /* Temporaries are always stored as floats */
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      if (dtype != TGSI_TYPE_DOUBLE)
+         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      else
+         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
 
       if (reg->Register.Indirect) {
          LLVMValueRef index_vec;  /* indexes into the temp registers */
@@ -1749,7 +1941,16 @@ emit_store_chan(
       else {
          LLVMValueRef temp_ptr;
          temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
+                                                         reg->Register.Index,
+                                                         chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2,
+                                   pred, value);
+         }
+         else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
       }
       break;
 
@@ -1818,13 +2019,16 @@ emit_store(
 {
    unsigned chan_index;
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
    if(info->num_dst) {
       LLVMValueRef pred[TGSI_NUM_CHANNELS];
 
       emit_fetch_predicate( bld, inst, pred );
 
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+
+         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+             continue;
          emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
       }
    }
@@ -2823,6 +3027,7 @@ void lp_emit_immediate_soa(
                lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
 
       break;
+   case TGSI_IMM_FLOAT64:
    case TGSI_IMM_UINT32:
       for( i = 0; i < size; ++i ) {
          LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
@@ -3674,6 +3879,12 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
    lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
    lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
+   {
+      struct lp_type dbl_type;
+      dbl_type = type;
+      dbl_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
+   }
    bld.mask = mask;
    bld.inputs = inputs;
    bld.outputs = outputs;

From 4caaa2681e727fa6405ff6de6d1f6c6a356ede34 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 29 Jun 2015 17:11:59 +1000
Subject: [PATCH 0144/1208] docs: update for llvmpipe fp64 support

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt              | 4 ++--
 docs/relnotes/10.7.0.html | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 81014a5f9b5..94bbcd12dfc 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -109,7 +109,7 @@ GL 4.0, GLSL 4.00:
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
   - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (nvc0, softpipe)
+  GL_ARB_gpu_shader_fp64                               DONE (nvc0, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_shader_subroutine                             started (Dave)
   GL_ARB_tessellation_shader                           started (Chris, Ilia)
@@ -127,7 +127,7 @@ GL 4.1, GLSL 4.10:
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_shader_precision                              started (Micah)
-  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, softpipe)
+  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, llvmpipe, softpipe)
   GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
 
 
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index fcc50811b69..2484243f7eb 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -49,6 +49,8 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
+<li>GL_ARB_gpu_shader_fp64 on llvmpipe</li>
+<li>GL_ARB_vertex_attrib_64bit on llvmpipe</li>
 </ul>
 
 <h2>Bug fixes</h2>

From 5dcb28c3d26828ed1b0e2bd5a0589c5baab04b85 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 1 Jul 2015 02:11:39 -0400
Subject: [PATCH 0145/1208] nv50/ir: copy joinAt when splitting both before and
 after

The current implementation only moves the joinAt when splitting after
the given instruction, not before it. So if you have a BB with

  foo
  instr
  bar
  joinat

and thus with joinAt set, we end up first splitting before instr, at
which point the instr's bb is updated to the new bb. Since that bb
doesn't have a joinAt set (despite containing one), when splitting after
the instr, there is nothing to copy over. Since the joinat will be in
the "split" bb irrespective of whether we're splitting before or after
the instruction, move it over in either case.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91124
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp            | 3 +++
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp     | 1 +
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp | 1 +
 3 files changed, 5 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
index 51b9225156b..fa8ee072a92 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -332,6 +332,9 @@ BasicBlock::splitBefore(Instruction *insn, bool attach)
    BasicBlock *bb = new BasicBlock(func);
    assert(!insn || insn->op != OP_PHI);
 
+   bb->joinAt = joinAt;
+   joinAt = NULL;
+
    splitCommon(insn, bb, attach);
    return bb;
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index ecd115f9807..9839a0e7bce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1687,6 +1687,7 @@ Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork)
    join->fixed = 1;
    conv->insertHead(join);
 
+   assert(!fork->joinAt);
    fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv);
    fork->insertBefore(fork->getExit(), fork->joinAt);
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 2c7f7e326b2..bea293bac99 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -871,6 +871,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i)
    BasicBlock *joinBB = i->bb->splitAfter(i);
 
    bld.setPosition(currBB, true);
+   assert(!currBB->joinAt);
    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 
    for (int l = 0; l <= 3; ++l) {

From 9e13f5c85f23ff67e685b41a4d439fc443de2dd0 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:02:52 +0800
Subject: [PATCH 0146/1208] ilo: add image_get_gen6_layout()

It replaces only img_init_walk() right now.  It will replace all img_init_*().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 189 +++++++++++++----------
 1 file changed, 107 insertions(+), 82 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 3209674154b..d4adffc181b 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -49,6 +49,102 @@ struct ilo_image_params {
    unsigned max_x, max_y;
 };
 
+struct ilo_image_layout {
+   enum ilo_image_walk_type walk;
+   bool interleaved_samples;
+};
+
+static enum ilo_image_walk_type
+image_get_gen6_walk(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* TODO we want LODs to be page-aligned */
+   if (info->type == GEN6_SURFTYPE_3D)
+      return ILO_IMAGE_WALK_3D;
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 115:
+    *
+    *     "The separate stencil buffer does not support mip mapping, thus the
+    *      storage for LODs other than LOD 0 is not needed. The following
+    *      QPitch equation applies only to the separate stencil buffer:
+    *
+    *        QPitch = h_0"
+    *
+    * Use ILO_IMAGE_WALK_LOD and manually offset to the (page-aligned) levels
+    * when bound.
+    */
+   if (info->bind_zs && info->format == GEN6_FORMAT_R8_UINT)
+      return ILO_IMAGE_WALK_LOD;
+
+   /* compact spacing is not supported otherwise */
+   return ILO_IMAGE_WALK_LAYER;
+}
+
+static enum ilo_image_walk_type
+image_get_gen7_walk(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (info->type == GEN6_SURFTYPE_3D)
+      return ILO_IMAGE_WALK_3D;
+
+   /*
+    * From the Ivy Bridge PRM, volume 1 part 1, page 111:
+    *
+    *     "note that the depth buffer and stencil buffer have an implied value
+    *      of ARYSPC_FULL"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 66:
+    *
+    *     "If Multisampled Surface Storage Format is MSFMT_MSS and Number of
+    *      Multisamples is not MULTISAMPLECOUNT_1, this field (Surface Array
+    *      Spacing) must be set to ARYSPC_LOD0."
+    */
+   if (info->sample_count > 1)
+      assert(info->level_count == 1);
+   return (info->bind_zs || info->level_count > 1) ?
+      ILO_IMAGE_WALK_LAYER : ILO_IMAGE_WALK_LOD;
+}
+
+static bool
+image_get_gen6_interleaved_samples(const struct ilo_dev *dev,
+                                   const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Gen6 supports only interleaved samples.  It is not explicitly stated,
+    * but on Gen7+, render targets are expected to be UMS/CMS (samples
+    * non-interleaved) and depth/stencil buffers are expected to be IMS
+    * (samples interleaved).
+    *
+    * See "Multisampled Surface Storage Format" field of SURFACE_STATE.
+    */
+   return (ilo_dev_gen(dev) == ILO_GEN(6) || info->bind_zs);
+}
+
+static bool
+image_get_gen6_layout(const struct ilo_dev *dev,
+                      const struct ilo_image_info *info,
+                      struct ilo_image_layout *layout)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      layout->walk = image_get_gen7_walk(dev, info);
+   else
+      layout->walk = image_get_gen6_walk(dev, info);
+
+   layout->interleaved_samples =
+      image_get_gen6_interleaved_samples(dev, info);
+
+   return true;
+}
+
 static void
 img_get_slice_size(const struct ilo_image *img,
                    const struct ilo_image_params *params,
@@ -492,87 +588,6 @@ img_init_tiling(struct ilo_image *img,
       img->tiling = GEN6_TILING_NONE;
 }
 
-static void
-img_init_walk_gen7(struct ilo_image *img,
-                   const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-
-   /*
-    * It is not explicitly states, but render targets are expected to be
-    * UMS/CMS (samples non-interleaved) and depth/stencil buffers are expected
-    * to be IMS (samples interleaved).
-    *
-    * See "Multisampled Surface Storage Format" field of SURFACE_STATE.
-    */
-   if (info->bind_zs) {
-      /*
-       * From the Ivy Bridge PRM, volume 1 part 1, page 111:
-       *
-       *     "note that the depth buffer and stencil buffer have an implied
-       *      value of ARYSPC_FULL"
-       */
-      img->walk = (info->type == GEN6_SURFTYPE_3D) ?
-         ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER;
-
-      img->interleaved_samples = true;
-   } else {
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 66:
-       *
-       *     "If Multisampled Surface Storage Format is MSFMT_MSS and Number
-       *      of Multisamples is not MULTISAMPLECOUNT_1, this field (Surface
-       *      Array Spacing) must be set to ARYSPC_LOD0."
-       *
-       * As multisampled resources are not mipmapped, we never use
-       * ARYSPC_FULL for them.
-       */
-      if (info->sample_count > 1)
-         assert(info->level_count == 1);
-
-      img->walk =
-         (info->type == GEN6_SURFTYPE_3D) ? ILO_IMAGE_WALK_3D :
-         (info->level_count > 1) ? ILO_IMAGE_WALK_LAYER :
-         ILO_IMAGE_WALK_LOD;
-
-      img->interleaved_samples = false;
-   }
-}
-
-static void
-img_init_walk_gen6(struct ilo_image *img,
-                   const struct ilo_image_params *params)
-{
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 115:
-    *
-    *     "The separate stencil buffer does not support mip mapping, thus the
-    *      storage for LODs other than LOD 0 is not needed. The following
-    *      QPitch equation applies only to the separate stencil buffer:
-    *
-    *        QPitch = h_0"
-    *
-    * GEN6 does not support compact spacing otherwise.
-    */
-   img->walk =
-      (params->info->type == GEN6_SURFTYPE_3D) ? ILO_IMAGE_WALK_3D :
-      (img->format == GEN6_FORMAT_R8_UINT) ? ILO_IMAGE_WALK_LOD :
-      ILO_IMAGE_WALK_LAYER;
-
-   /* GEN6 supports only interleaved samples */
-   img->interleaved_samples = true;
-}
-
-static void
-img_init_walk(struct ilo_image *img,
-              const struct ilo_image_params *params)
-{
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-      img_init_walk_gen7(img, params);
-   else
-      img_init_walk_gen6(img, params);
-}
-
 static unsigned
 img_get_valid_tilings(const struct ilo_image *img,
                       const struct ilo_image_params *params)
@@ -1265,11 +1280,21 @@ static bool
 img_init(struct ilo_image *img,
          struct ilo_image_params *params)
 {
+   struct ilo_image_layout layout;
+
+   memset(&layout, 0, sizeof(layout));
+
+   if (!image_get_gen6_layout(params->dev, params->info, &layout))
+      return false;
+
    /* there are hard dependencies between every function here */
 
    img_init_size_and_format(img, params);
    img_init_aux(img, params);
-   img_init_walk(img, params);
+
+   img->walk = layout.walk;
+   img->interleaved_samples = layout.interleaved_samples;
+
    img_init_tiling(img, params);
    img_init_alignments(img, params);
    img_init_lods(img, params);

From c3b205dbeba9534e0cf707ddd9c075170ccad1bf Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:11:09 +0800
Subject: [PATCH 0147/1208] ilo: add image_get_gen6_tiling()

It replaces img_init_tiling().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 309 +++++++++++++----------
 1 file changed, 177 insertions(+), 132 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index d4adffc181b..84001d862fd 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -52,6 +52,9 @@ struct ilo_image_params {
 struct ilo_image_layout {
    enum ilo_image_walk_type walk;
    bool interleaved_samples;
+
+   uint8_t valid_tilings;
+   enum gen_surface_tiling tiling;
 };
 
 static enum ilo_image_walk_type
@@ -127,6 +130,171 @@ image_get_gen6_interleaved_samples(const struct ilo_dev *dev,
    return (ilo_dev_gen(dev) == ILO_GEN(6) || info->bind_zs);
 }
 
+static uint8_t
+image_get_gen6_valid_tilings(const struct ilo_dev *dev,
+                             const struct ilo_image_info *info)
+{
+   uint8_t valid_tilings = IMAGE_TILING_ALL;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->valid_tilings)
+      valid_tilings &= info->valid_tilings;
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 2, page 32:
+    *
+    *     "Display/Overlay   Y-Major not supported.
+    *                        X-Major required for Async Flips"
+    */
+   if (unlikely(info->bind_scanout))
+      valid_tilings &= IMAGE_TILING_X;
+
+   /*
+    * From the Sandy Bridge PRM, volume 3 part 2, page 158:
+    *
+    *     "The cursor surface address must be 4K byte aligned. The cursor must
+    *      be in linear memory, it cannot be tiled."
+    */
+   if (unlikely(info->bind_cursor))
+      valid_tilings &= IMAGE_TILING_NONE;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 318:
+    *
+    *     "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear
+    *      Depth Buffer is not supported."
+    *
+    *     "The Depth Buffer, if tiled, must use Y-Major tiling."
+    *
+    * From the Sandy Bridge PRM, volume 1 part 2, page 22:
+    *
+    *     "W-Major Tile Format is used for separate stencil."
+    */
+   if (info->bind_zs) {
+      if (info->format == GEN6_FORMAT_R8_UINT)
+         valid_tilings &= IMAGE_TILING_W;
+      else
+         valid_tilings &= IMAGE_TILING_Y;
+   }
+
+   if (info->bind_surface_sampler ||
+       info->bind_surface_dp_render ||
+       info->bind_surface_dp_typed) {
+      /*
+       * From the Haswell PRM, volume 2d, page 233:
+       *
+       *     "If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
+       *      (Tiled Surface) must be TRUE."
+       */
+      if (info->sample_count > 1)
+         valid_tilings &= ~IMAGE_TILING_NONE;
+
+      if (ilo_dev_gen(dev) < ILO_GEN(8))
+         valid_tilings &= ~IMAGE_TILING_W;
+   }
+
+   if (info->bind_surface_dp_render) {
+      /*
+       * From the Sandy Bridge PRM, volume 1 part 2, page 32:
+       *
+       *     "NOTE: 128BPE Format Color buffer ( render target ) MUST be
+       *      either TileX or Linear."
+       *
+       * From the Haswell PRM, volume 5, page 32:
+       *
+       *     "NOTE: 128 BPP format color buffer (render target) supports
+       *      Linear, TiledX and TiledY."
+       */
+      if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->block_size == 16)
+         valid_tilings &= ~IMAGE_TILING_Y;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+       *
+       *     "This field (Surface Vertical Aligment) must be set to VALIGN_4
+       *      for all tiled Y Render Target surfaces."
+       *
+       *     "VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
+       *
+       * R32G32B32_FLOAT is not renderable and we only need an assert() here.
+       */
+      if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
+         assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT);
+   }
+
+   return valid_tilings;
+}
+
+static uint64_t
+image_get_gen6_estimated_size(const struct ilo_dev *dev,
+                              const struct ilo_image_info *info)
+{
+   /* padding not considered */
+   const uint64_t slice_size = info->width * info->height *
+      info->block_size / (info->block_width * info->block_height);
+   const uint64_t slice_count =
+      info->depth * info->array_size * info->sample_count;
+   const uint64_t estimated_size = slice_size * slice_count;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->level_count == 1)
+      return estimated_size;
+   else
+      return estimated_size * 4 / 3;
+}
+
+static enum gen_surface_tiling
+image_get_gen6_tiling(const struct ilo_dev *dev,
+                      const struct ilo_image_info *info,
+                      uint8_t valid_tilings)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (valid_tilings) {
+   case IMAGE_TILING_NONE:
+      return GEN6_TILING_NONE;
+   case IMAGE_TILING_X:
+      return GEN6_TILING_X;
+   case IMAGE_TILING_Y:
+      return GEN6_TILING_Y;
+   case IMAGE_TILING_W:
+      return GEN8_TILING_W;
+   default:
+      break;
+   }
+
+   /*
+    * X-tiling has the property that vertically adjacent pixels are usually in
+    * the same page.  When the image size is less than a page, the image
+    * height is 1, or when the image is not accessed in blocks, there is no
+    * reason to tile.
+    *
+    * Y-tiling is similar, where vertically adjacent pixels are usually in the
+    * same cacheline.
+    */
+   if (valid_tilings & IMAGE_TILING_NONE) {
+      const uint64_t estimated_size =
+         image_get_gen6_estimated_size(dev, info);
+
+      if (info->height == 1 || !(info->bind_surface_sampler ||
+                                 info->bind_surface_dp_render ||
+                                 info->bind_surface_dp_typed))
+         return GEN6_TILING_NONE;
+
+      if (estimated_size <= 64)
+         return GEN6_TILING_NONE;
+
+      if (estimated_size <= 2048)
+         valid_tilings &= ~IMAGE_TILING_X;
+   }
+
+   return (valid_tilings & IMAGE_TILING_Y) ? GEN6_TILING_Y :
+          (valid_tilings & IMAGE_TILING_X) ? GEN6_TILING_X :
+          GEN6_TILING_NONE;
+}
+
 static bool
 image_get_gen6_layout(const struct ilo_dev *dev,
                       const struct ilo_image_info *info,
@@ -142,6 +310,12 @@ image_get_gen6_layout(const struct ilo_dev *dev,
    layout->interleaved_samples =
       image_get_gen6_interleaved_samples(dev, info);
 
+   layout->valid_tilings = image_get_gen6_valid_tilings(dev, info);
+   if (!layout->valid_tilings)
+      return false;
+
+   layout->tiling = image_get_gen6_tiling(dev, info, layout->valid_tilings);
+
    return true;
 }
 
@@ -549,135 +723,6 @@ img_init_alignments(struct ilo_image *img,
           util_is_power_of_two(img->block_height));
 }
 
-static void
-img_init_tiling(struct ilo_image *img,
-                const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-   unsigned preferred_tilings = params->valid_tilings;
-
-   /* no fencing nor BLT support */
-   if (preferred_tilings & ~IMAGE_TILING_W)
-      preferred_tilings &= ~IMAGE_TILING_W;
-
-   if (info->bind_surface_dp_render || info->bind_surface_sampler) {
-      /*
-       * heuristically set a minimum width/height for enabling tiling
-       */
-      if (img->width0 < 64 && (preferred_tilings & ~IMAGE_TILING_X))
-         preferred_tilings &= ~IMAGE_TILING_X;
-
-      if ((img->width0 < 32 || img->height0 < 16) &&
-          (img->width0 < 16 || img->height0 < 32) &&
-          (preferred_tilings & ~IMAGE_TILING_Y))
-         preferred_tilings &= ~IMAGE_TILING_Y;
-   } else {
-      /* force linear if we are not sure where the texture is bound to */
-      if (preferred_tilings & IMAGE_TILING_NONE)
-         preferred_tilings &= IMAGE_TILING_NONE;
-   }
-
-   /* prefer tiled over linear */
-   if (preferred_tilings & IMAGE_TILING_Y)
-      img->tiling = GEN6_TILING_Y;
-   else if (preferred_tilings & IMAGE_TILING_X)
-      img->tiling = GEN6_TILING_X;
-   else if (preferred_tilings & IMAGE_TILING_W)
-      img->tiling = GEN8_TILING_W;
-   else
-      img->tiling = GEN6_TILING_NONE;
-}
-
-static unsigned
-img_get_valid_tilings(const struct ilo_image *img,
-                      const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-   unsigned valid_tilings = params->valid_tilings;
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 2, page 32:
-    *
-    *     "Display/Overlay   Y-Major not supported.
-    *                        X-Major required for Async Flips"
-    */
-   if (unlikely(info->bind_scanout))
-      valid_tilings &= IMAGE_TILING_X;
-
-   /*
-    * From the Sandy Bridge PRM, volume 3 part 2, page 158:
-    *
-    *     "The cursor surface address must be 4K byte aligned. The cursor must
-    *      be in linear memory, it cannot be tiled."
-    */
-   if (unlikely(info->bind_cursor))
-      valid_tilings &= IMAGE_TILING_NONE;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 318:
-    *
-    *     "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear
-    *      Depth Buffer is not supported."
-    *
-    *     "The Depth Buffer, if tiled, must use Y-Major tiling."
-    *
-    * From the Sandy Bridge PRM, volume 1 part 2, page 22:
-    *
-    *     "W-Major Tile Format is used for separate stencil."
-    */
-   if (info->bind_zs) {
-      switch (info->format) {
-      case GEN6_FORMAT_R8_UINT:
-         valid_tilings &= IMAGE_TILING_W;
-         break;
-      default:
-         valid_tilings &= IMAGE_TILING_Y;
-         break;
-      }
-   }
-
-   if (info->bind_surface_dp_render) {
-      /*
-       * From the Sandy Bridge PRM, volume 1 part 2, page 32:
-       *
-       *     "NOTE: 128BPE Format Color buffer ( render target ) MUST be
-       *      either TileX or Linear."
-       *
-       * From the Haswell PRM, volume 5, page 32:
-       *
-       *     "NOTE: 128 BPP format color buffer (render target) supports
-       *      Linear, TiledX and TiledY."
-       */
-      if (ilo_dev_gen(params->dev) < ILO_GEN(7.5) && img->block_size == 16)
-         valid_tilings &= ~IMAGE_TILING_Y;
-
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 63:
-       *
-       *     "This field (Surface Vertical Aligment) must be set to VALIGN_4
-       *      for all tiled Y Render Target surfaces."
-       *
-       *     "VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
-       */
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          ilo_dev_gen(params->dev) <= ILO_GEN(7.5) &&
-          img->format == GEN6_FORMAT_R32G32B32_FLOAT)
-         valid_tilings &= ~IMAGE_TILING_Y;
-
-      valid_tilings &= ~IMAGE_TILING_W;
-   }
-
-   if (info->bind_surface_sampler) {
-      if (ilo_dev_gen(params->dev) < ILO_GEN(8))
-         valid_tilings &= ~IMAGE_TILING_W;
-   }
-
-   /* no conflicting binding flags */
-   assert(valid_tilings);
-
-   return valid_tilings;
-}
-
 static void
 img_init_size_and_format(struct ilo_image *img,
                          struct ilo_image_params *params)
@@ -708,8 +753,6 @@ img_init_size_and_format(struct ilo_image *img,
    img->array_size = info->array_size;
    img->level_count = info->level_count;
    img->sample_count = info->sample_count;
-
-   params->valid_tilings = img_get_valid_tilings(img, params);
 }
 
 static bool
@@ -1295,7 +1338,9 @@ img_init(struct ilo_image *img,
    img->walk = layout.walk;
    img->interleaved_samples = layout.interleaved_samples;
 
-   img_init_tiling(img, params);
+   params->valid_tilings = layout.valid_tilings;
+   img->tiling = layout.tiling;
+
    img_init_alignments(img, params);
    img_init_lods(img, params);
    img_init_layer_height(img, params);

From c88e6cdfbfd7a7727dbae6b47a803b18aca5d9f4 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:14:36 +0800
Subject: [PATCH 0148/1208] ilo: add image_get_gen6_{hiz,mcs}_enable()

They replace img_init_aux().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 198 +++++++++++------------
 1 file changed, 97 insertions(+), 101 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 84001d862fd..c22c11974ea 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -55,6 +55,8 @@ struct ilo_image_layout {
 
    uint8_t valid_tilings;
    enum gen_surface_tiling tiling;
+
+   enum ilo_image_aux_type aux;
 };
 
 static enum ilo_image_walk_type
@@ -295,6 +297,91 @@ image_get_gen6_tiling(const struct ilo_dev *dev,
           GEN6_TILING_NONE;
 }
 
+static bool
+image_get_gen6_hiz_enable(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* depth buffer? */
+   if (!info->bind_zs ||
+       info->format == GEN6_FORMAT_R8_UINT ||
+       info->interleaved_stencil)
+      return false;
+
+   /* we want to be able to force 8x4 alignments */
+   if (info->type == GEN6_SURFTYPE_1D)
+      return false;
+
+   if (info->aux_disable)
+      return false;
+
+   if (ilo_debug & ILO_DEBUG_NOHIZ)
+      return false;
+
+   return true;
+}
+
+static bool
+image_get_gen7_mcs_enable(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          enum gen_surface_tiling tiling)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!info->bind_surface_sampler && !info->bind_surface_dp_render)
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 77:
+    *
+    *     "For Render Target and Sampling Engine Surfaces:If the surface is
+    *      multisampled (Number of Multisamples any value other than
+    *      MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled."
+    *
+    *     "This field must be set to 0 for all SINT MSRTs when all RT channels
+    *      are not written"
+    */
+   if (info->sample_count > 1) {
+      if (ilo_dev_gen(dev) < ILO_GEN(8))
+         assert(!info->is_integer);
+      return true;
+   }
+
+   if (info->aux_disable)
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 326:
+    *
+    *     "When MCS is buffer is used for color clear of non-multisampler
+    *      render target, the following restrictions apply.
+    *      - Support is limited to tiled render targets.
+    *      - Support is for non-mip-mapped and non-array surface types only.
+    *      - Clear is supported only on the full RT; i.e., no partial clear or
+    *        overlapping clears.
+    *      - MCS buffer for non-MSRT is supported only for RT formats 32bpp,
+    *        64bpp and 128bpp.
+    *      ..."
+    *
+    * How about SURFTYPE_3D?
+    */
+   if (!info->bind_surface_dp_render ||
+       tiling == GEN6_TILING_NONE ||
+       info->level_count > 1 ||
+       info->array_size > 1)
+      return false;
+
+   switch (info->block_size) {
+   case 4:
+   case 8:
+   case 16:
+      return true;
+   default:
+      return false;
+   }
+}
+
 static bool
 image_get_gen6_layout(const struct ilo_dev *dev,
                       const struct ilo_image_info *info,
@@ -316,6 +403,14 @@ image_get_gen6_layout(const struct ilo_dev *dev,
 
    layout->tiling = image_get_gen6_tiling(dev, info, layout->valid_tilings);
 
+   if (image_get_gen6_hiz_enable(dev, info))
+      layout->aux = ILO_IMAGE_AUX_HIZ;
+   else if (ilo_dev_gen(dev) >= ILO_GEN(7) &&
+            image_get_gen7_mcs_enable(dev, info, layout->tiling))
+      layout->aux = ILO_IMAGE_AUX_MCS;
+   else
+      layout->aux = ILO_IMAGE_AUX_NONE;
+
    return true;
 }
 
@@ -755,106 +850,6 @@ img_init_size_and_format(struct ilo_image *img,
    img->sample_count = info->sample_count;
 }
 
-static bool
-img_want_mcs(const struct ilo_image *img,
-             const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-   bool want_mcs = false;
-
-   /* MCS is for RT on GEN7+ */
-   if (ilo_dev_gen(params->dev) < ILO_GEN(7))
-      return false;
-
-   if (info->type != GEN6_SURFTYPE_2D || !info->bind_surface_dp_render)
-      return false;
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 77:
-    *
-    *     "For Render Target and Sampling Engine Surfaces:If the surface is
-    *      multisampled (Number of Multisamples any value other than
-    *      MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled."
-    *
-    *     "This field must be set to 0 for all SINT MSRTs when all RT channels
-    *      are not written"
-    */
-   if (info->sample_count > 1 && !info->is_integer) {
-      want_mcs = true;
-   } else if (info->sample_count == 1 && !info->aux_disable) {
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 326:
-       *
-       *     "When MCS is buffer is used for color clear of non-multisampler
-       *      render target, the following restrictions apply.
-       *      - Support is limited to tiled render targets.
-       *      - Support is for non-mip-mapped and non-array surface types
-       *        only.
-       *      - Clear is supported only on the full RT; i.e., no partial clear
-       *        or overlapping clears.
-       *      - MCS buffer for non-MSRT is supported only for RT formats
-       *        32bpp, 64bpp and 128bpp.
-       *      ..."
-       */
-      if (img->tiling != GEN6_TILING_NONE &&
-          info->level_count == 1 && info->array_size == 1) {
-         switch (img->block_size) {
-         case 4:
-         case 8:
-         case 16:
-            want_mcs = true;
-            break;
-         default:
-            break;
-         }
-      }
-   }
-
-   return want_mcs;
-}
-
-static bool
-img_want_hiz(const struct ilo_image *img,
-             const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-
-   if (ilo_debug & ILO_DEBUG_NOHIZ)
-      return false;
-
-   if (info->aux_disable)
-      return false;
-
-   /* we want 8x4 aligned levels */
-   if (info->type == GEN6_SURFTYPE_1D)
-      return false;
-
-   if (!info->bind_zs)
-      return false;
-
-   if (info->interleaved_stencil)
-      return false;
-
-   switch (info->format) {
-   case GEN6_FORMAT_R32_FLOAT:
-   case GEN6_FORMAT_R24_UNORM_X8_TYPELESS:
-   case GEN6_FORMAT_R16_UNORM:
-      return true;
-   default:
-      return false;
-   }
-}
-
-static void
-img_init_aux(struct ilo_image *img,
-             const struct ilo_image_params *params)
-{
-   if (img_want_hiz(img, params))
-      img->aux.type = ILO_IMAGE_AUX_HIZ;
-   else if (img_want_mcs(img, params))
-      img->aux.type = ILO_IMAGE_AUX_MCS;
-}
-
 static void
 img_align(struct ilo_image *img, struct ilo_image_params *params)
 {
@@ -1333,7 +1328,6 @@ img_init(struct ilo_image *img,
    /* there are hard dependencies between every function here */
 
    img_init_size_and_format(img, params);
-   img_init_aux(img, params);
 
    img->walk = layout.walk;
    img->interleaved_samples = layout.interleaved_samples;
@@ -1341,6 +1335,8 @@ img_init(struct ilo_image *img,
    params->valid_tilings = layout.valid_tilings;
    img->tiling = layout.tiling;
 
+   img->aux.type = layout.aux;
+
    img_init_alignments(img, params);
    img_init_lods(img, params);
    img_init_layer_height(img, params);

From f1946546c7d4ac22799a8b4944d6c36b77e22626 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:16:11 +0800
Subject: [PATCH 0149/1208] ilo: add image_get_gen{6,7}_alignment()

They replace img_init_alignments().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 336 ++++++++++++-----------
 1 file changed, 177 insertions(+), 159 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index c22c11974ea..d28cb075fb5 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -57,6 +57,9 @@ struct ilo_image_layout {
    enum gen_surface_tiling tiling;
 
    enum ilo_image_aux_type aux;
+
+   int align_i;
+   int align_j;
 };
 
 static enum ilo_image_walk_type
@@ -382,6 +385,154 @@ image_get_gen7_mcs_enable(const struct ilo_dev *dev,
    }
 }
 
+static void
+image_get_gen6_alignments(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          int *align_i, int *align_j)
+{
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 113:
+    *
+    *     "surface format           align_i     align_j
+    *      YUV 4:2:2 formats        4           *see below
+    *      BC1-5                    4           4
+    *      FXT1                     8           4
+    *      all other formats        4           *see below"
+    *
+    *     "- align_j = 4 for any depth buffer
+    *      - align_j = 2 for separate stencil buffer
+    *      - align_j = 4 for any render target surface is multisampled (4x)
+    *      - align_j = 4 for any render target surface with Surface Vertical
+    *        Alignment = VALIGN_4
+    *      - align_j = 2 for any render target surface with Surface Vertical
+    *        Alignment = VALIGN_2
+    *      - align_j = 2 for all other render target surface
+    *      - align_j = 2 for any sampling engine surface with Surface Vertical
+    *        Alignment = VALIGN_2
+    *      - align_j = 4 for any sampling engine surface with Surface Vertical
+    *        Alignment = VALIGN_4"
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 86:
+    *
+    *     "This field (Surface Vertical Alignment) must be set to VALIGN_2 if
+    *      the Surface Format is 96 bits per element (BPE)."
+    *
+    * They can be rephrased as
+    *
+    *                                  align_i        align_j
+    *   compressed formats             block width    block height
+    *   GEN6_FORMAT_R8_UINT            4              2
+    *   other depth/stencil formats    4              4
+    *   4x multisampled                4              4
+    *   bpp 96                         4              2
+    *   others                         4              2 or 4
+    */
+
+   *align_i = (info->compressed) ? info->block_width : 4;
+   if (info->compressed) {
+      *align_j = info->block_height;
+   } else if (info->bind_zs) {
+      *align_j = (info->format == GEN6_FORMAT_R8_UINT) ? 2 : 4;
+   } else {
+      *align_j = (info->sample_count > 1 || info->block_size != 12) ? 4 : 2;
+   }
+}
+
+static void
+image_get_gen7_alignments(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          enum gen_surface_tiling tiling,
+                          int *align_i, int *align_j)
+{
+   int i, j;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 1 part 1, page 110:
+    *
+    *     "surface defined by      surface format     align_i     align_j
+    *      3DSTATE_DEPTH_BUFFER    D16_UNORM          8           4
+    *                              not D16_UNORM      4           4
+    *      3DSTATE_STENCIL_BUFFER  N/A                8           8
+    *      SURFACE_STATE           BC*, ETC*, EAC*    4           4
+    *                              FXT1               8           4
+    *                              all others         (set by SURFACE_STATE)"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+    *
+    *     "- This field (Surface Vertical Aligment) is intended to be set to
+    *        VALIGN_4 if the surface was rendered as a depth buffer, for a
+    *        multisampled (4x) render target, or for a multisampled (8x)
+    *        render target, since these surfaces support only alignment of 4.
+    *      - Use of VALIGN_4 for other surfaces is supported, but uses more
+    *        memory.
+    *      - This field must be set to VALIGN_4 for all tiled Y Render Target
+    *        surfaces.
+    *      - Value of 1 is not supported for format YCRCB_NORMAL (0x182),
+    *        YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY (0x190)
+    *      - If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
+    *        must be set to VALIGN_4."
+    *      - VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
+    *
+    *     "- This field (Surface Horizontal Aligment) is intended to be set to
+    *        HALIGN_8 only if the surface was rendered as a depth buffer with
+    *        Z16 format or a stencil buffer, since these surfaces support only
+    *        alignment of 8.
+    *      - Use of HALIGN_8 for other surfaces is supported, but uses more
+    *        memory.
+    *      - This field must be set to HALIGN_4 if the Surface Format is BC*.
+    *      - This field must be set to HALIGN_8 if the Surface Format is
+    *        FXT1."
+    *
+    * They can be rephrased as
+    *
+    *                                  align_i        align_j
+    *  compressed formats              block width    block height
+    *  GEN6_FORMAT_R16_UNORM           8              4
+    *  GEN6_FORMAT_R8_UINT             8              8
+    *  other depth/stencil formats     4              4
+    *  2x or 4x multisampled           4 or 8         4
+    *  tiled Y                         4 or 8         4 (if rt)
+    *  GEN6_FORMAT_R32G32B32_FLOAT     4 or 8         2
+    *  others                          4 or 8         2 or 4
+    */
+   if (info->compressed) {
+      i = info->block_width;
+      j = info->block_height;
+   } else if (info->bind_zs) {
+      switch (info->format) {
+      case GEN6_FORMAT_R16_UNORM:
+         i = 8;
+         j = 4;
+         break;
+      case GEN6_FORMAT_R8_UINT:
+         i = 8;
+         j = 8;
+         break;
+      default:
+         i = 4;
+         j = 4;
+         break;
+      }
+   } else {
+      const bool valign_4 =
+         (info->sample_count > 1 || ilo_dev_gen(dev) >= ILO_GEN(8) ||
+          (tiling == GEN6_TILING_Y && info->bind_surface_dp_render));
+
+      if (ilo_dev_gen(dev) < ILO_GEN(8) && valign_4)
+         assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT);
+
+      i = 4;
+      j = (valign_4) ? 4 : 2;
+   }
+
+   *align_i = i;
+   *align_j = j;
+}
+
 static bool
 image_get_gen6_layout(const struct ilo_dev *dev,
                       const struct ilo_image_info *info,
@@ -411,6 +562,29 @@ image_get_gen6_layout(const struct ilo_dev *dev,
    else
       layout->aux = ILO_IMAGE_AUX_NONE;
 
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      image_get_gen7_alignments(dev, info, layout->tiling,
+            &layout->align_i, &layout->align_j);
+   } else {
+      image_get_gen6_alignments(dev, info,
+            &layout->align_i, &layout->align_j);
+   }
+
+   /*
+    * the fact that align i and j are multiples of block width and height
+    * respectively is what makes the size of the bo a multiple of the block
+    * size, slices start at block boundaries, and many of the computations
+    * work.
+    */
+   assert(layout->align_i % info->block_width == 0);
+   assert(layout->align_j % info->block_height == 0);
+
+   /* make sure align() works */
+   assert(util_is_power_of_two(layout->align_i) &&
+          util_is_power_of_two(layout->align_j));
+   assert(util_is_power_of_two(info->block_width) &&
+          util_is_power_of_two(info->block_height));
+
    return true;
 }
 
@@ -660,164 +834,6 @@ img_init_lods(struct ilo_image *img,
    }
 }
 
-static void
-img_init_alignments(struct ilo_image *img,
-                    const struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 113:
-    *
-    *     "surface format           align_i     align_j
-    *      YUV 4:2:2 formats        4           *see below
-    *      BC1-5                    4           4
-    *      FXT1                     8           4
-    *      all other formats        4           *see below"
-    *
-    *     "- align_j = 4 for any depth buffer
-    *      - align_j = 2 for separate stencil buffer
-    *      - align_j = 4 for any render target surface is multisampled (4x)
-    *      - align_j = 4 for any render target surface with Surface Vertical
-    *        Alignment = VALIGN_4
-    *      - align_j = 2 for any render target surface with Surface Vertical
-    *        Alignment = VALIGN_2
-    *      - align_j = 2 for all other render target surface
-    *      - align_j = 2 for any sampling engine surface with Surface Vertical
-    *        Alignment = VALIGN_2
-    *      - align_j = 4 for any sampling engine surface with Surface Vertical
-    *        Alignment = VALIGN_4"
-    *
-    * From the Sandy Bridge PRM, volume 4 part 1, page 86:
-    *
-    *     "This field (Surface Vertical Alignment) must be set to VALIGN_2 if
-    *      the Surface Format is 96 bits per element (BPE)."
-    *
-    * They can be rephrased as
-    *
-    *                                  align_i        align_j
-    *   compressed formats             block width    block height
-    *   GEN6_FORMAT_R8_UINT            4              2
-    *   other depth/stencil formats    4              4
-    *   4x multisampled                4              4
-    *   bpp 96                         4              2
-    *   others                         4              2 or 4
-    */
-
-   /*
-    * From the Ivy Bridge PRM, volume 1 part 1, page 110:
-    *
-    *     "surface defined by      surface format     align_i     align_j
-    *      3DSTATE_DEPTH_BUFFER    D16_UNORM          8           4
-    *                              not D16_UNORM      4           4
-    *      3DSTATE_STENCIL_BUFFER  N/A                8           8
-    *      SURFACE_STATE           BC*, ETC*, EAC*    4           4
-    *                              FXT1               8           4
-    *                              all others         (set by SURFACE_STATE)"
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
-    *
-    *     "- This field (Surface Vertical Aligment) is intended to be set to
-    *        VALIGN_4 if the surface was rendered as a depth buffer, for a
-    *        multisampled (4x) render target, or for a multisampled (8x)
-    *        render target, since these surfaces support only alignment of 4.
-    *      - Use of VALIGN_4 for other surfaces is supported, but uses more
-    *        memory.
-    *      - This field must be set to VALIGN_4 for all tiled Y Render Target
-    *        surfaces.
-    *      - Value of 1 is not supported for format YCRCB_NORMAL (0x182),
-    *        YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY (0x190)
-    *      - If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
-    *        must be set to VALIGN_4."
-    *      - VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
-    *
-    *     "- This field (Surface Horizontal Aligment) is intended to be set to
-    *        HALIGN_8 only if the surface was rendered as a depth buffer with
-    *        Z16 format or a stencil buffer, since these surfaces support only
-    *        alignment of 8.
-    *      - Use of HALIGN_8 for other surfaces is supported, but uses more
-    *        memory.
-    *      - This field must be set to HALIGN_4 if the Surface Format is BC*.
-    *      - This field must be set to HALIGN_8 if the Surface Format is
-    *        FXT1."
-    *
-    * They can be rephrased as
-    *
-    *                                  align_i        align_j
-    *  compressed formats              block width    block height
-    *  GEN6_FORMAT_R16_UNORM           8              4
-    *  GEN6_FORMAT_R8_UINT             8              8
-    *  other depth/stencil formats     4              4
-    *  2x or 4x multisampled           4 or 8         4
-    *  tiled Y                         4 or 8         4 (if rt)
-    *  GEN6_FORMAT_R32G32B32_FLOAT     4 or 8         2
-    *  others                          4 or 8         2 or 4
-    */
-
-   if (info->compressed) {
-      /* this happens to be the case */
-      img->align_i = img->block_width;
-      img->align_j = img->block_height;
-   } else if (info->bind_zs) {
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) {
-         switch (img->format) {
-         case GEN6_FORMAT_R16_UNORM:
-            img->align_i = 8;
-            img->align_j = 4;
-            break;
-         case GEN6_FORMAT_R8_UINT:
-            img->align_i = 8;
-            img->align_j = 8;
-            break;
-         default:
-            img->align_i = 4;
-            img->align_j = 4;
-            break;
-         }
-      } else {
-         switch (img->format) {
-         case GEN6_FORMAT_R8_UINT:
-            img->align_i = 4;
-            img->align_j = 2;
-            break;
-         default:
-            img->align_i = 4;
-            img->align_j = 4;
-            break;
-         }
-      }
-   } else {
-      const bool valign_4 =
-         (info->sample_count > 1) ||
-         (ilo_dev_gen(params->dev) >= ILO_GEN(8)) ||
-         (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          img->tiling == GEN6_TILING_Y &&
-          info->bind_surface_dp_render);
-
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4)
-         assert(img->format != GEN6_FORMAT_R32G32B32_FLOAT);
-
-      img->align_i = 4;
-      img->align_j = (valign_4) ? 4 : 2;
-   }
-
-   /*
-    * the fact that align i and j are multiples of block width and height
-    * respectively is what makes the size of the bo a multiple of the block
-    * size, slices start at block boundaries, and many of the computations
-    * work.
-    */
-   assert(img->align_i % img->block_width == 0);
-   assert(img->align_j % img->block_height == 0);
-
-   /* make sure align() works */
-   assert(util_is_power_of_two(img->align_i) &&
-          util_is_power_of_two(img->align_j));
-   assert(util_is_power_of_two(img->block_width) &&
-          util_is_power_of_two(img->block_height));
-}
-
 static void
 img_init_size_and_format(struct ilo_image *img,
                          struct ilo_image_params *params)
@@ -1337,7 +1353,9 @@ img_init(struct ilo_image *img,
 
    img->aux.type = layout.aux;
 
-   img_init_alignments(img, params);
+   img->align_i = layout.align_i;
+   img->align_j = layout.align_j;
+
    img_init_lods(img, params);
    img_init_layer_height(img, params);
 

From 0faeb21dc0c029b345eaf5545b17b97d5fb8d8da Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:25:32 +0800
Subject: [PATCH 0150/1208] ilo: add image_get_gen6_lods()

It replaces img_init_lods() and img_init_layer_height().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 236 ++++++++++++++---------
 1 file changed, 148 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index d28cb075fb5..c2a84876312 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -60,6 +60,13 @@ struct ilo_image_layout {
 
    int align_i;
    int align_j;
+
+   struct ilo_image_lod *lods;
+   int walk_layer_h0;
+   int walk_layer_h1;
+   int walk_layer_height;
+   int monolithic_width;
+   int monolithic_height;
 };
 
 static enum ilo_image_walk_type
@@ -534,9 +541,9 @@ image_get_gen7_alignments(const struct ilo_dev *dev,
 }
 
 static bool
-image_get_gen6_layout(const struct ilo_dev *dev,
-                      const struct ilo_image_info *info,
-                      struct ilo_image_layout *layout)
+image_init_gen6_hardware_layout(const struct ilo_dev *dev,
+                                const struct ilo_image_info *info,
+                                struct ilo_image_layout *layout)
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
@@ -570,34 +577,22 @@ image_get_gen6_layout(const struct ilo_dev *dev,
             &layout->align_i, &layout->align_j);
    }
 
-   /*
-    * the fact that align i and j are multiples of block width and height
-    * respectively is what makes the size of the bo a multiple of the block
-    * size, slices start at block boundaries, and many of the computations
-    * work.
-    */
-   assert(layout->align_i % info->block_width == 0);
-   assert(layout->align_j % info->block_height == 0);
-
-   /* make sure align() works */
-   assert(util_is_power_of_two(layout->align_i) &&
-          util_is_power_of_two(layout->align_j));
-   assert(util_is_power_of_two(info->block_width) &&
-          util_is_power_of_two(info->block_height));
-
    return true;
 }
 
 static void
-img_get_slice_size(const struct ilo_image *img,
-                   const struct ilo_image_params *params,
-                   unsigned level, unsigned *width, unsigned *height)
+image_get_gen6_slice_size(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          const struct ilo_image_layout *layout,
+                          uint8_t level,
+                          int *width, int *height)
 {
-   const struct ilo_image_info *info = params->info;
-   unsigned w, h;
+   int w, h;
 
-   w = u_minify(img->width0, level);
-   h = u_minify(img->height0, level);
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   w = u_minify(info->width, level);
+   h = u_minify(info->height, level);
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 114:
@@ -606,8 +601,8 @@ img_get_slice_size(const struct ilo_image *img,
     *      sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then,
     *      if necessary, they are padded out to compression block boundaries."
     */
-   w = align(w, img->block_width);
-   h = align(h, img->block_height);
+   w = align(w, info->block_width);
+   h = align(h, info->block_height);
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 111:
@@ -648,7 +643,7 @@ img_get_slice_size(const struct ilo_image *img,
     *   w = align(w, 2) * 2;
     *   y = align(y, 2) * 2;
     */
-   if (img->interleaved_samples) {
+   if (layout->interleaved_samples) {
       switch (info->sample_count) {
       case 1:
          break;
@@ -682,40 +677,50 @@ img_get_slice_size(const struct ilo_image *img,
     * To make things easier (for transfer), we will just double the stencil
     * stride in 3DSTATE_STENCIL_BUFFER.
     */
-   w = align(w, img->align_i);
-   h = align(h, img->align_j);
+   w = align(w, layout->align_i);
+   h = align(h, layout->align_j);
 
    *width = w;
    *height = h;
 }
 
-static unsigned
-img_get_num_layers(const struct ilo_image *img,
-                   const struct ilo_image_params *params)
+static int
+image_get_gen6_layer_count(const struct ilo_dev *dev,
+                           const struct ilo_image_info *info,
+                           const struct ilo_image_layout *layout)
 {
-   const struct ilo_image_info *info = params->info;
-   unsigned num_layers = info->array_size;
+   int count = info->array_size;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
 
    /* samples of the same index are stored in a layer */
-   if (info->sample_count > 1 && !img->interleaved_samples)
-      num_layers *= info->sample_count;
+   if (!layout->interleaved_samples)
+      count *= info->sample_count;
 
-   return num_layers;
+   return count;
 }
 
 static void
-img_init_layer_height(struct ilo_image *img,
-                      struct ilo_image_params *params)
+image_get_gen6_walk_layer_heights(const struct ilo_dev *dev,
+                                  const struct ilo_image_info *info,
+                                  struct ilo_image_layout *layout)
 {
-   const struct ilo_image_info *info = params->info;
-   unsigned num_layers;
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (img->walk != ILO_IMAGE_WALK_LAYER)
-      return;
+   layout->walk_layer_h0 = layout->lods[0].slice_height;
 
-   num_layers = img_get_num_layers(img, params);
-   if (num_layers <= 1)
+   if (info->level_count > 1) {
+      layout->walk_layer_h1 = layout->lods[1].slice_height;
+   } else {
+      int dummy;
+      image_get_gen6_slice_size(dev, info, layout, 1,
+            &dummy, &layout->walk_layer_h1);
+   }
+
+   if (image_get_gen6_layer_count(dev, info, layout) == 1) {
+      layout->walk_layer_height = 0;
       return;
+   }
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 115:
@@ -751,38 +756,44 @@ img_init_layer_height(struct ilo_image *img,
     * slices.  Since we use texel rows everywhere, we do not need to divide
     * QPitch by 4.
     */
-   img->walk_layer_height = params->h0 + params->h1 +
-      ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * img->align_j;
+   layout->walk_layer_height = layout->walk_layer_h0 + layout->walk_layer_h1 +
+      ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * layout->align_j;
 
-   if (ilo_dev_gen(params->dev) == ILO_GEN(6) && info->sample_count > 1 &&
-       img->height0 % 4 == 1)
-      img->walk_layer_height += 4;
-
-   params->max_y += img->walk_layer_height * (num_layers - 1);
+   if (ilo_dev_gen(dev) == ILO_GEN(6) && info->sample_count > 1 &&
+       info->height % 4 == 1)
+      layout->walk_layer_height += 4;
 }
 
 static void
-img_init_lods(struct ilo_image *img,
-              struct ilo_image_params *params)
+image_get_gen6_lods(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info,
+                    struct ilo_image_layout *layout)
 {
-   const struct ilo_image_info *info = params->info;
-   unsigned cur_x, cur_y;
-   unsigned lv;
+   const int layer_count = image_get_gen6_layer_count(dev, info, layout);
+   int cur_x, cur_y, max_x, max_y;
+   uint8_t lv;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
 
    cur_x = 0;
    cur_y = 0;
+   max_x = 0;
+   max_y = 0;
    for (lv = 0; lv < info->level_count; lv++) {
-      unsigned lod_w, lod_h;
+      int slice_w, slice_h, lod_w, lod_h;
 
-      img_get_slice_size(img, params, lv, &lod_w, &lod_h);
+      image_get_gen6_slice_size(dev, info, layout, lv, &slice_w, &slice_h);
 
-      img->lods[lv].x = cur_x;
-      img->lods[lv].y = cur_y;
-      img->lods[lv].slice_width = lod_w;
-      img->lods[lv].slice_height = lod_h;
+      layout->lods[lv].x = cur_x;
+      layout->lods[lv].y = cur_y;
+      layout->lods[lv].slice_width = slice_w;
+      layout->lods[lv].slice_height = slice_h;
 
-      switch (img->walk) {
+      switch (layout->walk) {
       case ILO_IMAGE_WALK_LAYER:
+         lod_w = slice_w;
+         lod_h = slice_h;
+
          /* MIPLAYOUT_BELOW */
          if (lv == 1)
             cur_x += lod_w;
@@ -790,7 +801,9 @@ img_init_lods(struct ilo_image *img,
             cur_y += lod_h;
          break;
       case ILO_IMAGE_WALK_LOD:
-         lod_h *= img_get_num_layers(img, params);
+         lod_w = slice_w;
+         lod_h = slice_h * layer_count;
+
          if (lv == 1)
             cur_x += lod_w;
          else
@@ -798,40 +811,83 @@ img_init_lods(struct ilo_image *img,
 
          /* every LOD begins at tile boundaries */
          if (info->level_count > 1) {
-            assert(img->format == GEN6_FORMAT_R8_UINT);
+            assert(info->format == GEN6_FORMAT_R8_UINT);
             cur_x = align(cur_x, 64);
             cur_y = align(cur_y, 64);
          }
          break;
       case ILO_IMAGE_WALK_3D:
          {
-            const unsigned num_slices = u_minify(info->depth, lv);
-            const unsigned num_slices_per_row = 1 << lv;
-            const unsigned num_rows =
-               (num_slices + num_slices_per_row - 1) / num_slices_per_row;
+            const int slice_count = u_minify(info->depth, lv);
+            const int slice_count_per_row = 1 << lv;
+            const int row_count =
+               (slice_count + slice_count_per_row - 1) / slice_count_per_row;
 
-            lod_w *= num_slices_per_row;
-            lod_h *= num_rows;
-
-            cur_y += lod_h;
+            lod_w = slice_w * slice_count_per_row;
+            lod_h = slice_h * row_count;
          }
+
+         cur_y += lod_h;
+         break;
+      default:
+         assert(!"unknown walk type");
+         lod_w = 0;
+         lod_h = 0;
          break;
       }
 
-      if (params->max_x < img->lods[lv].x + lod_w)
-         params->max_x = img->lods[lv].x + lod_w;
-      if (params->max_y < img->lods[lv].y + lod_h)
-         params->max_y = img->lods[lv].y + lod_h;
+      if (max_x < layout->lods[lv].x + lod_w)
+         max_x = layout->lods[lv].x + lod_w;
+      if (max_y < layout->lods[lv].y + lod_h)
+         max_y = layout->lods[lv].y + lod_h;
    }
 
-   if (img->walk == ILO_IMAGE_WALK_LAYER) {
-      params->h0 = img->lods[0].slice_height;
-
-      if (info->level_count > 1)
-         params->h1 = img->lods[1].slice_height;
-      else
-         img_get_slice_size(img, params, 1, &cur_x, &params->h1);
+   if (layout->walk == ILO_IMAGE_WALK_LAYER) {
+      image_get_gen6_walk_layer_heights(dev, info, layout);
+      if (layer_count > 1)
+         max_y += layout->walk_layer_height * (layer_count - 1);
+   } else {
+      layout->walk_layer_h0 = 0;
+      layout->walk_layer_h1 = 0;
+      layout->walk_layer_height = 0;
    }
+
+   layout->monolithic_width = max_x;
+   layout->monolithic_height = max_y;
+}
+
+static bool
+image_get_gen6_layout(const struct ilo_dev *dev,
+                      const struct ilo_image_info *info,
+                      struct ilo_image_layout *layout)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!image_init_gen6_hardware_layout(dev, info, layout))
+      return false;
+
+   /*
+    * the fact that align i and j are multiples of block width and height
+    * respectively is what makes the size of the bo a multiple of the block
+    * size, slices start at block boundaries, and many of the computations
+    * work.
+    */
+   assert(layout->align_i % info->block_width == 0);
+   assert(layout->align_j % info->block_height == 0);
+
+   /* make sure align() works */
+   assert(util_is_power_of_two(layout->align_i) &&
+          util_is_power_of_two(layout->align_j));
+   assert(util_is_power_of_two(info->block_width) &&
+          util_is_power_of_two(info->block_height));
+
+   image_get_gen6_lods(dev, info, layout);
+
+   assert(layout->walk_layer_height % info->block_height == 0);
+   assert(layout->monolithic_width % info->block_width == 0);
+   assert(layout->monolithic_height % info->block_height == 0);
+
+   return true;
 }
 
 static void
@@ -1337,6 +1393,7 @@ img_init(struct ilo_image *img,
    struct ilo_image_layout layout;
 
    memset(&layout, 0, sizeof(layout));
+   layout.lods = img->lods;
 
    if (!image_get_gen6_layout(params->dev, params->info, &layout))
       return false;
@@ -1356,8 +1413,11 @@ img_init(struct ilo_image *img,
    img->align_i = layout.align_i;
    img->align_j = layout.align_j;
 
-   img_init_lods(img, params);
-   img_init_layer_height(img, params);
+   params->max_x = layout.monolithic_width;
+   params->max_y = layout.monolithic_height;
+   params->h0 = layout.walk_layer_h0;
+   params->h1 = layout.walk_layer_h1;
+   img->walk_layer_height = layout.walk_layer_height;
 
    img_align(img, params);
    img_calculate_bo_size(img, params);

From 0da3b732ad156f63e32e7520bc1af97e1b733be7 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:38:49 +0800
Subject: [PATCH 0151/1208] ilo: add image_get_gen6_monolithic_size()

It replaces img_align().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 134 +++++++++++------------
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index c2a84876312..535407fefff 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -764,6 +764,72 @@ image_get_gen6_walk_layer_heights(const struct ilo_dev *dev,
       layout->walk_layer_height += 4;
 }
 
+static void
+image_get_gen6_monolithic_size(const struct ilo_dev *dev,
+                               const struct ilo_image_info *info,
+                               struct ilo_image_layout *layout,
+                               int max_x, int max_y)
+{
+   int align_w = 1, align_h = 1, pad_h = 0;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
+    *
+    *     "To determine the necessary padding on the bottom and right side of
+    *      the surface, refer to the table in Section 7.18.3.4 for the i and j
+    *      parameters for the surface format in use. The surface must then be
+    *      extended to the next multiple of the alignment unit size in each
+    *      dimension, and all texels contained in this extended surface must
+    *      have valid GTT entries."
+    *
+    *     "For cube surfaces, an additional two rows of padding are required
+    *      at the bottom of the surface. This must be ensured regardless of
+    *      whether the surface is stored tiled or linear.  This is due to the
+    *      potential rotation of cache line orientation from memory to cache."
+    *
+    *     "For compressed textures (BC* and FXT1 surface formats), padding at
+    *      the bottom of the surface is to an even compressed row, which is
+    *      equal to a multiple of 8 uncompressed texel rows. Thus, for padding
+    *      purposes, these surfaces behave as if j = 8 only for surface
+    *      padding purposes. The value of 4 for j still applies for mip level
+    *      alignment and QPitch calculation."
+    */
+   if (info->bind_surface_sampler) {
+      align_w = MAX2(align_w, layout->align_i);
+      align_h = MAX2(align_h, layout->align_j);
+
+      if (info->type == GEN6_SURFTYPE_CUBE)
+         pad_h += 2;
+
+      if (info->compressed)
+         align_h = MAX2(align_h, layout->align_j * 2);
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
+    *
+    *     "If the surface contains an odd number of rows of data, a final row
+    *      below the surface must be allocated."
+    */
+   if (info->bind_surface_dp_render)
+      align_h = MAX2(align_h, 2);
+
+   /*
+    * Depth Buffer Clear/Resolve works in 8x4 sample blocks.  Pad to allow HiZ
+    * for unaligned non-mipmapped and non-array images.
+    */
+   if (layout->aux == ILO_IMAGE_AUX_HIZ &&
+       info->level_count == 1 && info->array_size == 1 && info->depth == 1) {
+      align_w = MAX2(align_w, 8);
+      align_h = MAX2(align_h, 4);
+   }
+
+   layout->monolithic_width = align(max_x, align_w);
+   layout->monolithic_height = align(max_y + pad_h, align_h);
+}
+
 static void
 image_get_gen6_lods(const struct ilo_dev *dev,
                     const struct ilo_image_info *info,
@@ -852,8 +918,7 @@ image_get_gen6_lods(const struct ilo_dev *dev,
       layout->walk_layer_height = 0;
    }
 
-   layout->monolithic_width = max_x;
-   layout->monolithic_height = max_y;
+   image_get_gen6_monolithic_size(dev, info, layout, max_x, max_y);
 }
 
 static bool
@@ -922,70 +987,6 @@ img_init_size_and_format(struct ilo_image *img,
    img->sample_count = info->sample_count;
 }
 
-static void
-img_align(struct ilo_image *img, struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-   int align_w = 1, align_h = 1, pad_h = 0;
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
-    *
-    *     "To determine the necessary padding on the bottom and right side of
-    *      the surface, refer to the table in Section 7.18.3.4 for the i and j
-    *      parameters for the surface format in use. The surface must then be
-    *      extended to the next multiple of the alignment unit size in each
-    *      dimension, and all texels contained in this extended surface must
-    *      have valid GTT entries."
-    *
-    *     "For cube surfaces, an additional two rows of padding are required
-    *      at the bottom of the surface. This must be ensured regardless of
-    *      whether the surface is stored tiled or linear.  This is due to the
-    *      potential rotation of cache line orientation from memory to cache."
-    *
-    *     "For compressed textures (BC* and FXT1 surface formats), padding at
-    *      the bottom of the surface is to an even compressed row, which is
-    *      equal to a multiple of 8 uncompressed texel rows. Thus, for padding
-    *      purposes, these surfaces behave as if j = 8 only for surface
-    *      padding purposes. The value of 4 for j still applies for mip level
-    *      alignment and QPitch calculation."
-    */
-   if (info->bind_surface_sampler) {
-      align_w = MAX2(align_w, img->align_i);
-      align_h = MAX2(align_h, img->align_j);
-
-      if (info->type == GEN6_SURFTYPE_CUBE)
-         pad_h += 2;
-
-      if (info->compressed)
-         align_h = MAX2(align_h, img->align_j * 2);
-   }
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
-    *
-    *     "If the surface contains an odd number of rows of data, a final row
-    *      below the surface must be allocated."
-    */
-   if (info->bind_surface_dp_render)
-      align_h = MAX2(align_h, 2);
-
-   /*
-    * Depth Buffer Clear/Resolve works in 8x4 sample blocks.  Pad to allow HiZ
-    * for unaligned non-mipmapped and non-array images.
-    */
-   if (img->aux.type == ILO_IMAGE_AUX_HIZ &&
-       info->level_count == 1 &&
-       info->array_size == 1 &&
-       info->depth == 1) {
-      align_w = MAX2(align_w, 8);
-      align_h = MAX2(align_h, 4);
-   }
-
-   params->max_x = align(params->max_x, align_w);
-   params->max_y = align(params->max_y + pad_h, align_h);
-}
-
 /* note that this may force the texture to be linear */
 static void
 img_calculate_bo_size(struct ilo_image *img,
@@ -1419,7 +1420,6 @@ img_init(struct ilo_image *img,
    params->h1 = layout.walk_layer_h1;
    img->walk_layer_height = layout.walk_layer_height;
 
-   img_align(img, params);
    img_calculate_bo_size(img, params);
 
    img->scanout = params->info->bind_scanout;

From 0896d629fded96178daa79c393ba4dae0d56f2ff Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:42:04 +0800
Subject: [PATCH 0152/1208] ilo: add image_set_gen6_{hiz,mcs}

They replace img_calculate_{hiz,mcs}_size().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 110 +++++++++++++----------
 1 file changed, 61 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 535407fefff..4b5323357de 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -1097,20 +1097,24 @@ img_calculate_bo_size(struct ilo_image *img,
    }
 }
 
-static void
-img_calculate_hiz_size(struct ilo_image *img,
-                       const struct ilo_image_params *params)
+static bool
+image_set_gen6_hiz(struct ilo_image *img,
+                   const struct ilo_dev *dev,
+                   const struct ilo_image_info *info,
+                   const struct ilo_image_layout *layout)
 {
-   const struct ilo_image_info *info = params->info;
-   const unsigned hz_align_j = 8;
+   const int hz_align_j = 8;
    enum ilo_image_walk_type hz_walk;
-   unsigned hz_width, hz_height, lv;
-   unsigned hz_clear_w, hz_clear_h;
+   int hz_width, hz_height;
+   int hz_clear_w, hz_clear_h;
+   uint8_t lv;
 
-   assert(img->aux.type == ILO_IMAGE_AUX_HIZ);
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   assert(img->walk == ILO_IMAGE_WALK_LAYER ||
-          img->walk == ILO_IMAGE_WALK_3D);
+   assert(layout->aux == ILO_IMAGE_AUX_HIZ);
+
+   assert(layout->walk == ILO_IMAGE_WALK_LAYER ||
+          layout->walk == ILO_IMAGE_WALK_3D);
 
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 312:
@@ -1123,8 +1127,8 @@ img_calculate_hiz_size(struct ilo_image *img,
     *
     * We will put all LODs in a single bo with ILO_IMAGE_WALK_LOD.
     */
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-      hz_walk = img->walk;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      hz_walk = layout->walk;
    else
       hz_walk = ILO_IMAGE_WALK_LOD;
 
@@ -1138,16 +1142,16 @@ img_calculate_hiz_size(struct ilo_image *img,
    switch (hz_walk) {
    case ILO_IMAGE_WALK_LAYER:
       {
-         const unsigned h0 = align(params->h0, hz_align_j);
-         const unsigned h1 = align(params->h1, hz_align_j);
-         const unsigned htail =
-            ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j;
-         const unsigned hz_qpitch = h0 + h1 + htail;
+         const int h0 = align(layout->walk_layer_h0, hz_align_j);
+         const int h1 = align(layout->walk_layer_h1, hz_align_j);
+         const int htail =
+            ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j;
+         const int hz_qpitch = h0 + h1 + htail;
 
-         hz_width = align(img->lods[0].slice_width, 16);
+         hz_width = align(layout->lods[0].slice_width, 16);
 
          hz_height = hz_qpitch * info->array_size / 2;
-         if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
+         if (ilo_dev_gen(dev) >= ILO_GEN(7))
             hz_height = align(hz_height, 8);
 
          img->aux.walk_layer_height = hz_qpitch;
@@ -1155,9 +1159,9 @@ img_calculate_hiz_size(struct ilo_image *img,
       break;
    case ILO_IMAGE_WALK_LOD:
       {
-         unsigned lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT];
-         unsigned lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT];
-         unsigned cur_tx, cur_ty;
+         int lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT];
+         int lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT];
+         int cur_tx, cur_ty;
 
          /* figure out the tile offsets of LODs */
          hz_width = 0;
@@ -1165,17 +1169,17 @@ img_calculate_hiz_size(struct ilo_image *img,
          cur_tx = 0;
          cur_ty = 0;
          for (lv = 0; lv < info->level_count; lv++) {
-            unsigned tw, th;
+            int tw, th;
 
             lod_tx[lv] = cur_tx;
             lod_ty[lv] = cur_ty;
 
-            tw = align(img->lods[lv].slice_width, 16);
-            th = align(img->lods[lv].slice_height, hz_align_j) *
+            tw = align(layout->lods[lv].slice_width, 16);
+            th = align(layout->lods[lv].slice_height, hz_align_j) *
                info->array_size / 2;
             /* convert to Y-tiles */
-            tw = align(tw, 128) / 128;
-            th = align(th, 32) / 32;
+            tw = (tw + 127) / 128;
+            th = (th + 31) / 32;
 
             if (hz_width < cur_tx + tw)
                hz_width = cur_tx + tw;
@@ -1193,16 +1197,17 @@ img_calculate_hiz_size(struct ilo_image *img,
             img->aux.walk_lod_offsets[lv] =
                (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096;
          }
+
          hz_width *= 128;
          hz_height *= 32;
       }
       break;
    case ILO_IMAGE_WALK_3D:
-      hz_width = align(img->lods[0].slice_width, 16);
+      hz_width = align(layout->lods[0].slice_width, 16);
 
       hz_height = 0;
       for (lv = 0; lv < info->level_count; lv++) {
-         const unsigned h = align(img->lods[lv].slice_height, hz_align_j);
+         const int h = align(layout->lods[lv].slice_height, hz_align_j);
          /* according to the formula, slices are packed together vertically */
          hz_height += h * u_minify(info->depth, lv);
       }
@@ -1245,30 +1250,35 @@ img_calculate_hiz_size(struct ilo_image *img,
    }
 
    for (lv = 0; lv < info->level_count; lv++) {
-      if (u_minify(img->width0, lv) % hz_clear_w ||
-          u_minify(img->height0, lv) % hz_clear_h)
+      if (u_minify(info->width, lv) % hz_clear_w ||
+          u_minify(info->height, lv) % hz_clear_h)
          break;
       img->aux.enables |= 1 << lv;
    }
 
-   /* we padded to allow this in img_align() */
+   /* we padded to allow this in image_get_gen6_monolithic_size() */
    if (info->level_count == 1 && info->array_size == 1 && info->depth == 1)
       img->aux.enables |= 0x1;
 
    /* align to Y-tile */
    img->aux.bo_stride = align(hz_width, 128);
    img->aux.bo_height = align(hz_height, 32);
+
+   return true;
 }
 
-static void
-img_calculate_mcs_size(struct ilo_image *img,
-                       const struct ilo_image_params *params)
+static bool
+image_set_gen7_mcs(struct ilo_image *img,
+                   const struct ilo_dev *dev,
+                   const struct ilo_image_info *info,
+                   const struct ilo_image_layout *layout)
 {
-   const struct ilo_image_info *info = params->info;
    int mcs_width, mcs_height, mcs_cpp;
    int downscale_x, downscale_y;
 
-   assert(img->aux.type == ILO_IMAGE_AUX_MCS);
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   assert(layout->aux == ILO_IMAGE_AUX_MCS);
 
    if (info->sample_count > 1) {
       /*
@@ -1303,7 +1313,7 @@ img_calculate_mcs_size(struct ilo_image *img,
          break;
       default:
          assert(!"unsupported sample count");
-         return;
+         return false;
          break;
       }
 
@@ -1312,8 +1322,8 @@ img_calculate_mcs_size(struct ilo_image *img,
        * clear rectangle cannot be masked.  The scale-down clear rectangle
        * thus must be aligned to 2x2, and we need to pad.
        */
-      mcs_width = align(img->width0, downscale_x * 2);
-      mcs_height = align(img->height0, downscale_y * 2);
+      mcs_width = align(info->width, downscale_x * 2);
+      mcs_height = align(info->height, downscale_y * 2);
    } else {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 327:
@@ -1348,18 +1358,18 @@ img_calculate_mcs_size(struct ilo_image *img,
        * anything except for the size of the allocated MCS.  Let's see if we
        * hit out-of-bound access.
        */
-      switch (img->tiling) {
+      switch (layout->tiling) {
       case GEN6_TILING_X:
-         downscale_x = 64 / img->block_size;
+         downscale_x = 64 / info->block_size;
          downscale_y = 2;
          break;
       case GEN6_TILING_Y:
-         downscale_x = 32 / img->block_size;
+         downscale_x = 32 / info->block_size;
          downscale_y = 4;
          break;
       default:
          assert(!"unsupported tiling mode");
-         return;
+         return false;
          break;
       }
 
@@ -1376,8 +1386,8 @@ img_calculate_mcs_size(struct ilo_image *img,
        * The scaled-down clear rectangle must be aligned to 4x4 instead of
        * 2x2, and we need to pad.
        */
-      mcs_width = align(img->width0, downscale_x * 4) / downscale_x;
-      mcs_height = align(img->height0, downscale_y * 4) / downscale_y;
+      mcs_width = align(info->width, downscale_x * 4) / downscale_x;
+      mcs_height = align(info->height, downscale_y * 4) / downscale_y;
       mcs_cpp = 16; /* an OWord */
    }
 
@@ -1385,6 +1395,8 @@ img_calculate_mcs_size(struct ilo_image *img,
    /* align to Y-tile */
    img->aux.bo_stride = align(mcs_width * mcs_cpp, 128);
    img->aux.bo_height = align(mcs_height, 32);
+
+   return true;
 }
 
 static bool
@@ -1424,12 +1436,12 @@ img_init(struct ilo_image *img,
 
    img->scanout = params->info->bind_scanout;
 
-   switch (img->aux.type) {
+   switch (layout.aux) {
    case ILO_IMAGE_AUX_HIZ:
-      img_calculate_hiz_size(img, params);
+      image_set_gen6_hiz(img, params->dev, params->info, &layout);
       break;
    case ILO_IMAGE_AUX_MCS:
-      img_calculate_mcs_size(img, params);
+      image_set_gen7_mcs(img, params->dev, params->info, &layout);
       break;
    default:
       break;

From 3c6af396f9526bdc8351ff61bcc6c42a3892e6b8 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:46:34 +0800
Subject: [PATCH 0153/1208] ilo: add image_set_gen6_bo_size()

It replaces img_calculate_bo_size().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 195 +++++++++--------------
 src/gallium/drivers/ilo/core/ilo_image.h |   8 +-
 src/gallium/drivers/ilo/ilo_resource.c   |   8 +
 3 files changed, 91 insertions(+), 120 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 4b5323357de..ec5ae04d26f 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -43,7 +43,6 @@ enum {
 struct ilo_image_params {
    const struct ilo_dev *dev;
    const struct ilo_image_info *info;
-   unsigned valid_tilings;
 
    unsigned h0, h1;
    unsigned max_x, max_y;
@@ -295,7 +294,8 @@ image_get_gen6_tiling(const struct ilo_dev *dev,
                                  info->bind_surface_dp_typed))
          return GEN6_TILING_NONE;
 
-      if (estimated_size <= 64)
+      if (estimated_size <= 64 ||
+          estimated_size > info->prefer_linear_threshold)
          return GEN6_TILING_NONE;
 
       if (estimated_size <= 2048)
@@ -987,114 +987,87 @@ img_init_size_and_format(struct ilo_image *img,
    img->sample_count = info->sample_count;
 }
 
-/* note that this may force the texture to be linear */
-static void
-img_calculate_bo_size(struct ilo_image *img,
-                      const struct ilo_image_params *params)
+static bool
+image_set_gen6_bo_size(struct ilo_image *img,
+                       const struct ilo_dev *dev,
+                       const struct ilo_image_info *info,
+                       const struct ilo_image_layout *layout)
 {
-   assert(params->max_x % img->block_width == 0);
-   assert(params->max_y % img->block_height == 0);
-   assert(img->walk_layer_height % img->block_height == 0);
+   int stride, height;
+   int align_w, align_h;
 
-   img->bo_stride =
-      (params->max_x / img->block_width) * img->block_size;
-   img->bo_height = params->max_y / img->block_height;
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   while (true) {
-      unsigned w = img->bo_stride, h = img->bo_height;
-      unsigned align_w, align_h;
+   stride = (layout->monolithic_width / info->block_width) * info->block_size;
+   height = layout->monolithic_height / info->block_height;
 
+   /*
+    * From the Haswell PRM, volume 5, page 163:
+    *
+    *     "For linear surfaces, additional padding of 64 bytes is required
+    *      at the bottom of the surface. This is in addition to the padding
+    *      required above."
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && info->bind_surface_sampler &&
+       layout->tiling == GEN6_TILING_NONE)
+      height += (64 + stride - 1) / stride;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
+    *
+    *     "- For linear render target surfaces, the pitch must be a multiple
+    *        of the element size for non-YUV surface formats.  Pitch must be a
+    *        multiple of 2 * element size for YUV surface formats.
+    *
+    *      - For other linear surfaces, the pitch can be any multiple of
+    *        bytes.
+    *      - For tiled surfaces, the pitch must be a multiple of the tile
+    *        width."
+    *
+    * Different requirements may exist when the image is used in different
+    * places, but our alignments here should be good enough that we do not
+    * need to check info->bind_x.
+    */
+   switch (layout->tiling) {
+   case GEN6_TILING_X:
+      align_w = 512;
+      align_h = 8;
+      break;
+   case GEN6_TILING_Y:
+      align_w = 128;
+      align_h = 32;
+      break;
+   case GEN8_TILING_W:
       /*
-       * From the Haswell PRM, volume 5, page 163:
+       * From the Sandy Bridge PRM, volume 1 part 2, page 22:
        *
-       *     "For linear surfaces, additional padding of 64 bytes is required
-       *      at the bottom of the surface. This is in addition to the padding
-       *      required above."
+       *     "A 4KB tile is subdivided into 8-high by 8-wide array of
+       *      Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8
+       *      bytes."
        */
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7.5) &&
-          params->info->bind_surface_sampler &&
-          img->tiling == GEN6_TILING_NONE)
-         h += (64 + img->bo_stride - 1) / img->bo_stride;
-
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 81:
-       *
-       *     "- For linear render target surfaces, the pitch must be a
-       *        multiple of the element size for non-YUV surface formats.
-       *        Pitch must be a multiple of 2 * element size for YUV surface
-       *        formats.
-       *      - For other linear surfaces, the pitch can be any multiple of
-       *        bytes.
-       *      - For tiled surfaces, the pitch must be a multiple of the tile
-       *        width."
-       *
-       * Different requirements may exist when the bo is used in different
-       * places, but our alignments here should be good enough that we do not
-       * need to check params->info->bind_x.
-       */
-      switch (img->tiling) {
-      case GEN6_TILING_X:
-         align_w = 512;
-         align_h = 8;
-         break;
-      case GEN6_TILING_Y:
-         align_w = 128;
-         align_h = 32;
-         break;
-      case GEN8_TILING_W:
-         /*
-          * From the Sandy Bridge PRM, volume 1 part 2, page 22:
-          *
-          *     "A 4KB tile is subdivided into 8-high by 8-wide array of
-          *      Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8
-          *      bytes."
-          */
-         align_w = 64;
-         align_h = 64;
-         break;
-      default:
-         assert(img->tiling == GEN6_TILING_NONE);
-         /* some good enough values */
-         align_w = 64;
-         align_h = 2;
-         break;
-      }
-
-      w = align(w, align_w);
-      h = align(h, align_h);
-
-      /* make sure the bo is mappable */
-      if (img->tiling != GEN6_TILING_NONE) {
-         /*
-          * Usually only the first 256MB of the GTT is mappable.
-          *
-          * See also how intel_context::max_gtt_map_object_size is calculated.
-          */
-         const size_t mappable_gtt_size = 256 * 1024 * 1024;
-
-         /*
-          * Be conservative.  We may be able to switch from VALIGN_4 to
-          * VALIGN_2 if the image was Y-tiled, but let's keep it simple.
-          */
-         if (mappable_gtt_size / w / 4 < h) {
-            if (params->valid_tilings & IMAGE_TILING_NONE) {
-               img->tiling = GEN6_TILING_NONE;
-               /* MCS support for non-MSRTs is limited to tiled RTs */
-               if (img->aux.type == ILO_IMAGE_AUX_MCS &&
-                   params->info->sample_count == 1)
-                  img->aux.type = ILO_IMAGE_AUX_NONE;
-
-               continue;
-            } else {
-               ilo_warn("cannot force texture to be linear\n");
-            }
-         }
-      }
-
-      img->bo_stride = w;
-      img->bo_height = h;
+      align_w = 64;
+      align_h = 64;
+      break;
+   default:
+      assert(layout->tiling == GEN6_TILING_NONE);
+      /* some good enough values */
+      align_w = 64;
+      align_h = 2;
       break;
    }
+
+   if (info->force_bo_stride) {
+      if (info->force_bo_stride % align_w || info->force_bo_stride < stride)
+         return false;
+
+      img->bo_stride = info->force_bo_stride;
+   } else {
+      img->bo_stride = align(stride, align_w);
+   }
+
+   img->bo_height = align(height, align_h);
+
+   return true;
 }
 
 static bool
@@ -1411,14 +1384,11 @@ img_init(struct ilo_image *img,
    if (!image_get_gen6_layout(params->dev, params->info, &layout))
       return false;
 
-   /* there are hard dependencies between every function here */
-
    img_init_size_and_format(img, params);
 
    img->walk = layout.walk;
    img->interleaved_samples = layout.interleaved_samples;
 
-   params->valid_tilings = layout.valid_tilings;
    img->tiling = layout.tiling;
 
    img->aux.type = layout.aux;
@@ -1432,7 +1402,8 @@ img_init(struct ilo_image *img,
    params->h1 = layout.walk_layer_h1;
    img->walk_layer_height = layout.walk_layer_height;
 
-   img_calculate_bo_size(img, params);
+   if (!image_set_gen6_bo_size(img, params->dev, params->info, &layout))
+      return false;
 
    img->scanout = params->info->bind_scanout;
 
@@ -1538,23 +1509,9 @@ ilo_image_init(struct ilo_image *img,
    memset(&params, 0, sizeof(params));
    params.dev = dev;
    params.info = info;
-   params.valid_tilings = (info->valid_tilings) ?
-      info->valid_tilings : IMAGE_TILING_ALL;
 
    if (!img_init(img, &params))
       return false;
 
-   if (info->force_bo_stride) {
-      if ((img->tiling == GEN6_TILING_X && info->force_bo_stride % 512) ||
-          (img->tiling == GEN6_TILING_Y && info->force_bo_stride % 128) ||
-          (img->tiling == GEN8_TILING_W && info->force_bo_stride % 64))
-         return false;
-
-      if (img->bo_stride > info->force_bo_stride)
-         return false;
-
-      img->bo_stride = info->force_bo_stride;
-   }
-
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index cfe18b9e8d1..646ed6f5727 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -100,7 +100,13 @@ struct ilo_image_info {
    /* tilings to consider, if any bit is set */
    uint8_t valid_tilings;
 
-   /* force a stride */
+   /*
+    * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the
+    * threshold
+    */
+   uint32_t prefer_linear_threshold;
+
+   /* force a stride when non-zero */
    uint32_t force_bo_stride;
 
    bool bind_surface_sampler;
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 962c710a57a..3797c8ec24f 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -199,6 +199,14 @@ resource_get_image_info(const struct pipe_resource *templ,
    if (templ->bind & PIPE_BIND_LINEAR)
       info->valid_tilings = 1 << GEN6_TILING_NONE;
 
+   /*
+    * Tiled images must be mapped via GTT to get a linear view.  Prefer linear
+    * images when the image size is greater than one-fourth of the mappable
+    * aperture.
+    */
+   if (templ->bind & (PIPE_BIND_TRANSFER_WRITE | PIPE_BIND_TRANSFER_READ))
+      info->prefer_linear_threshold = dev->aperture_mappable / 4;
+
    info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW);
    info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET);
    info->bind_surface_dp_typed = (templ->bind &

From b4c66e4d3eadc04bdffbf4821636299bc49c89a4 Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:51:46 +0800
Subject: [PATCH 0154/1208] ilo: add image_init_gen6_transfer_layout()

It replaces img_init_for_transfer().
---
 src/gallium/drivers/ilo/core/ilo_image.c | 112 ++++++++---------------
 1 file changed, 37 insertions(+), 75 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index ec5ae04d26f..c1edffbeb5b 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -580,6 +580,25 @@ image_init_gen6_hardware_layout(const struct ilo_dev *dev,
    return true;
 }
 
+static bool
+image_init_gen6_transfer_layout(const struct ilo_dev *dev,
+                                const struct ilo_image_info *info,
+                                struct ilo_image_layout *layout)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* we can define our own layout to save space */
+   layout->walk = ILO_IMAGE_WALK_LOD;
+   layout->interleaved_samples = false;
+   layout->valid_tilings = IMAGE_TILING_NONE;
+   layout->tiling = GEN6_TILING_NONE;
+   layout->aux = ILO_IMAGE_AUX_NONE;
+   layout->align_i = info->block_width;
+   layout->align_j = info->block_height;
+
+   return true;
+}
+
 static void
 image_get_gen6_slice_size(const struct ilo_dev *dev,
                           const struct ilo_image_info *info,
@@ -921,6 +940,17 @@ image_get_gen6_lods(const struct ilo_dev *dev,
    image_get_gen6_monolithic_size(dev, info, layout, max_x, max_y);
 }
 
+static bool
+image_bind_gpu(const struct ilo_image_info *info)
+{
+   return (info->bind_surface_sampler ||
+           info->bind_surface_dp_render ||
+           info->bind_surface_dp_typed ||
+           info->bind_zs ||
+           info->bind_scanout ||
+           info->bind_cursor);
+}
+
 static bool
 image_get_gen6_layout(const struct ilo_dev *dev,
                       const struct ilo_image_info *info,
@@ -928,8 +958,13 @@ image_get_gen6_layout(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!image_init_gen6_hardware_layout(dev, info, layout))
-      return false;
+   if (image_bind_gpu(info) || info->level_count > 1) {
+      if (!image_init_gen6_hardware_layout(dev, info, layout))
+         return false;
+   } else {
+      if (!image_init_gen6_transfer_layout(dev, info, layout))
+         return false;
+   }
 
    /*
     * the fact that align i and j are multiples of block width and height
@@ -1421,71 +1456,6 @@ img_init(struct ilo_image *img,
    return true;
 }
 
-/**
- * The texutre is for transfer only.  We can define our own layout to save
- * space.
- */
-static void
-img_init_for_transfer(struct ilo_image *img,
-                      const struct ilo_dev *dev,
-                      const struct ilo_image_info *info)
-{
-   const unsigned num_layers = (info->type == GEN6_SURFTYPE_3D) ?
-      info->depth : info->array_size;
-   unsigned layer_width, layer_height;
-
-   assert(info->level_count == 1);
-   assert(info->sample_count == 1);
-
-   img->aux.type = ILO_IMAGE_AUX_NONE;
-
-   img->type = info->type;
-   img->width0 = info->width;
-   img->height0 = info->height;
-   img->depth0 = info->depth;
-   img->array_size = info->array_size;
-   img->level_count = 1;
-   img->sample_count = 1;
-
-   img->format = info->format;
-   img->block_width = info->block_width;
-   img->block_height = info->block_height;
-   img->block_size = info->block_size;
-
-   img->walk = ILO_IMAGE_WALK_LOD;
-
-   img->tiling = GEN6_TILING_NONE;
-
-   img->align_i = img->block_width;
-   img->align_j = img->block_height;
-
-   assert(util_is_power_of_two(img->block_width) &&
-          util_is_power_of_two(img->block_height));
-
-   /* use packed layout */
-   layer_width = align(info->width, img->align_i);
-   layer_height = align(info->height, img->align_j);
-
-   img->lods[0].slice_width = layer_width;
-   img->lods[0].slice_height = layer_height;
-
-   img->bo_stride = (layer_width / img->block_width) * img->block_size;
-   img->bo_stride = align(img->bo_stride, 64);
-
-   img->bo_height = (layer_height / img->block_height) * num_layers;
-}
-
-static bool
-img_is_bind_gpu(const struct ilo_image_info *info)
-{
-   return (info->bind_surface_sampler ||
-           info->bind_surface_dp_render ||
-           info->bind_surface_dp_typed ||
-           info->bind_zs ||
-           info->bind_scanout ||
-           info->bind_cursor);
-}
-
 /**
  * Initialize the image.  Callers should zero-initialize \p img first.
  */
@@ -1498,14 +1468,6 @@ ilo_image_init(struct ilo_image *img,
 
    assert(ilo_is_zeroed(img, sizeof(*img)));
 
-   /* use transfer layout when the texture is never bound to GPU */
-   if (!img_is_bind_gpu(info) &&
-       info->level_count == 1 &&
-       info->sample_count == 1) {
-      img_init_for_transfer(img, dev, info);
-      return true;
-   }
-
    memset(&params, 0, sizeof(params));
    params.dev = dev;
    params.info = info;

From 19ea623586aacc995b3f4a1a3ea321ead12dc43c Mon Sep 17 00:00:00 2001
From: Chia-I Wu <olvaffe@gmail.com>
Date: Mon, 29 Jun 2015 16:58:17 +0800
Subject: [PATCH 0155/1208] ilo: remove ilo_image_params

It suffices to use ilo_image_layout directly.
---
 src/gallium/drivers/ilo/core/ilo_image.c | 122 +++++++++--------------
 1 file changed, 47 insertions(+), 75 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index c1edffbeb5b..fa547ac5c36 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -40,14 +40,6 @@ enum {
                         IMAGE_TILING_W)
 };
 
-struct ilo_image_params {
-   const struct ilo_dev *dev;
-   const struct ilo_image_info *info;
-
-   unsigned h0, h1;
-   unsigned max_x, max_y;
-};
-
 struct ilo_image_layout {
    enum ilo_image_walk_type walk;
    bool interleaved_samples;
@@ -951,6 +943,26 @@ image_bind_gpu(const struct ilo_image_info *info)
            info->bind_cursor);
 }
 
+static bool
+image_validate_gen6(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 314:
+    *
+    *     "The separate stencil buffer is always enabled, thus the field in
+    *      3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil
+    *      buffer has been removed Surface formats with interleaved depth and
+    *      stencil are no longer supported"
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->bind_zs)
+      assert(!info->interleaved_stencil);
+
+   return true;
+}
+
 static bool
 image_get_gen6_layout(const struct ilo_dev *dev,
                       const struct ilo_image_info *info,
@@ -958,6 +970,9 @@ image_get_gen6_layout(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
+   if (!image_validate_gen6(dev, info))
+      return false;
+
    if (image_bind_gpu(info) || info->level_count > 1) {
       if (!image_init_gen6_hardware_layout(dev, info, layout))
          return false;
@@ -990,38 +1005,6 @@ image_get_gen6_layout(const struct ilo_dev *dev,
    return true;
 }
 
-static void
-img_init_size_and_format(struct ilo_image *img,
-                         struct ilo_image_params *params)
-{
-   const struct ilo_image_info *info = params->info;
-
-   img->type = info->type;
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 314:
-    *
-    *     "The separate stencil buffer is always enabled, thus the field in
-    *      3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil
-    *      buffer has been removed Surface formats with interleaved depth and
-    *      stencil are no longer supported"
-    */
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && info->bind_zs)
-      assert(!info->interleaved_stencil);
-
-   img->format = info->format;
-   img->block_width = info->block_width;
-   img->block_height = info->block_height;
-   img->block_size = info->block_size;
-
-   img->width0 = info->width;
-   img->height0 = info->height;
-   img->depth0 = info->depth;
-   img->array_size = info->array_size;
-   img->level_count = info->level_count;
-   img->sample_count = info->sample_count;
-}
-
 static bool
 image_set_gen6_bo_size(struct ilo_image *img,
                        const struct ilo_dev *dev,
@@ -1407,19 +1390,34 @@ image_set_gen7_mcs(struct ilo_image *img,
    return true;
 }
 
-static bool
-img_init(struct ilo_image *img,
-         struct ilo_image_params *params)
+bool
+ilo_image_init(struct ilo_image *img,
+               const struct ilo_dev *dev,
+               const struct ilo_image_info *info)
 {
    struct ilo_image_layout layout;
 
+   assert(ilo_is_zeroed(img, sizeof(*img)));
+
    memset(&layout, 0, sizeof(layout));
    layout.lods = img->lods;
 
-   if (!image_get_gen6_layout(params->dev, params->info, &layout))
+   if (!image_get_gen6_layout(dev, info, &layout))
       return false;
 
-   img_init_size_and_format(img, params);
+   img->type = info->type;
+
+   img->format = info->format;
+   img->block_width = info->block_width;
+   img->block_height = info->block_height;
+   img->block_size = info->block_size;
+
+   img->width0 = info->width;
+   img->height0 = info->height;
+   img->depth0 = info->depth;
+   img->array_size = info->array_size;
+   img->level_count = info->level_count;
+   img->sample_count = info->sample_count;
 
    img->walk = layout.walk;
    img->interleaved_samples = layout.interleaved_samples;
@@ -1431,23 +1429,19 @@ img_init(struct ilo_image *img,
    img->align_i = layout.align_i;
    img->align_j = layout.align_j;
 
-   params->max_x = layout.monolithic_width;
-   params->max_y = layout.monolithic_height;
-   params->h0 = layout.walk_layer_h0;
-   params->h1 = layout.walk_layer_h1;
    img->walk_layer_height = layout.walk_layer_height;
 
-   if (!image_set_gen6_bo_size(img, params->dev, params->info, &layout))
+   if (!image_set_gen6_bo_size(img, dev, info, &layout))
       return false;
 
-   img->scanout = params->info->bind_scanout;
+   img->scanout = info->bind_scanout;
 
    switch (layout.aux) {
    case ILO_IMAGE_AUX_HIZ:
-      image_set_gen6_hiz(img, params->dev, params->info, &layout);
+      image_set_gen6_hiz(img, dev, info, &layout);
       break;
    case ILO_IMAGE_AUX_MCS:
-      image_set_gen7_mcs(img, params->dev, params->info, &layout);
+      image_set_gen7_mcs(img, dev, info, &layout);
       break;
    default:
       break;
@@ -1455,25 +1449,3 @@ img_init(struct ilo_image *img,
 
    return true;
 }
-
-/**
- * Initialize the image.  Callers should zero-initialize \p img first.
- */
-bool
-ilo_image_init(struct ilo_image *img,
-               const struct ilo_dev *dev,
-               const struct ilo_image_info *info)
-{
-   struct ilo_image_params params;
-
-   assert(ilo_is_zeroed(img, sizeof(*img)));
-
-   memset(&params, 0, sizeof(params));
-   params.dev = dev;
-   params.info = info;
-
-   if (!img_init(img, &params))
-      return false;
-
-   return true;
-}

From ccaf37f4496eb836866c9daacf21f1f5ac8c6d66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 29 Jun 2015 14:19:00 +0300
Subject: [PATCH 0156/1208] glsl: build stageref mask using IR, not symbol
 table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of using symbol table, build mask by inspecting IR. This
change is required by further patches to move resource list creation
to happen later when symbol table does not exist anymore.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
---
 src/glsl/linker.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 74c2f2d4c92..d9527d4ba2b 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2624,9 +2624,17 @@ build_stageref(struct gl_shader_program *shProg, const char *name)
       struct gl_shader *sh = shProg->_LinkedShaders[i];
       if (!sh)
          continue;
-      ir_variable *var = sh->symbols->get_variable(name);
-      if (var)
-         stages |= (1 << i);
+
+      /* Shader symbol table may contain variables that have
+       * been optimized away. Search IR for the variable instead.
+       */
+      foreach_in_list(ir_instruction, node, sh->ir) {
+         ir_variable *var = node->as_variable();
+         if (var && strcmp(var->name, name) == 0) {
+            stages |= (1 << i);
+            break;
+         }
+      }
    }
    return stages;
 }

From 73afa31f07fe4af605088f6590edc4227652c482 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 29 Jun 2015 14:39:05 +0300
Subject: [PATCH 0157/1208] glsl: expose build_program_resource_list function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is required so that we can move resource list creation
to happen later.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
---
 src/glsl/linker.cpp | 2 +-
 src/glsl/program.h  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index d9527d4ba2b..e0ce00ce5b9 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2689,7 +2689,7 @@ add_interface_variables(struct gl_shader_program *shProg,
  * Builds up a list of program resources that point to existing
  * resource data.
  */
-static void
+void
 build_program_resource_list(struct gl_context *ctx,
                             struct gl_shader_program *shProg)
 {
diff --git a/src/glsl/program.h b/src/glsl/program.h
index f15113a08d2..c06541a6105 100644
--- a/src/glsl/program.h
+++ b/src/glsl/program.h
@@ -39,6 +39,10 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
 extern void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);
 
+extern void
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg);
+
 extern void
 linker_error(struct gl_shader_program *prog, const char *fmt, ...)
    PRINTFLIKE(2, 3);

From f045b8b2ff5ac75da3e092f482fd1717571d8462 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 29 Jun 2015 15:23:45 +0300
Subject: [PATCH 0158/1208] glsl: create program resource list after LinkShader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resource list can be created properly  only after LinkShader hook
has been called to make sure all dead variables have been removed.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90925
---
 src/glsl/linker.cpp             | 4 ----
 src/mesa/program/ir_to_mesa.cpp | 2 ++
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index e0ce00ce5b9..71a45e8db9c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3242,10 +3242,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   build_program_resource_list(ctx, prog);
-   if (!prog->LinkStatus)
-      goto done;
-
    /* FINISHME: Assign fragment shader output locations. */
 
 done:
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 18e3bc5d5cc..0b2eb122364 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2975,6 +2975,8 @@ _mesa_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
    if (prog->LinkStatus) {
       if (!ctx->Driver.LinkShader(ctx, prog)) {
 	 prog->LinkStatus = GL_FALSE;
+      } else {
+         build_program_resource_list(ctx, prog);
       }
    }
 

From 8276ba260e5500664b8d8748f3224f73ef221887 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 1 Jul 2015 03:47:41 -0400
Subject: [PATCH 0159/1208] nouveau: rename var name for nouveau_vieux to avoid
 conflict with nouveau

We want to require different versions for nouveau and nouveau_vieux.
autoconf will only check for NOUVEAU once if both drivers are enabled,
meaning both version checks don't get executed. Rename the nouveau_vieux
one to NVVIEUX to avoid the issue.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Tested-by: Alexandre Courbot <acourbot@nvidia.com>
Tested-by: Martin Peres <martin.peres@free.fr>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac                             | 2 +-
 src/mesa/drivers/dri/nouveau/Makefile.am | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index af61aa2018c..60d180dbd29 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1421,7 +1421,7 @@ if test -n "$with_dri_drivers"; then
             ;;
         xnouveau)
             HAVE_NOUVEAU_DRI=yes;
-            PKG_CHECK_MODULES([NOUVEAU], [libdrm_nouveau >= $LIBDRM_NVVIEUX_REQUIRED])
+            PKG_CHECK_MODULES([NVVIEUX], [libdrm_nouveau >= $LIBDRM_NVVIEUX_REQUIRED])
             ;;
         xradeon)
             HAVE_RADEON_DRI=yes;
diff --git a/src/mesa/drivers/dri/nouveau/Makefile.am b/src/mesa/drivers/dri/nouveau/Makefile.am
index 61af95a7dbc..01e34a8e3c3 100644
--- a/src/mesa/drivers/dri/nouveau/Makefile.am
+++ b/src/mesa/drivers/dri/nouveau/Makefile.am
@@ -38,8 +38,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
-	$(NOUVEAU_CFLAGS)
+	$(NVVIEUX_CFLAGS)
 
 noinst_LTLIBRARIES = libnouveau_dri.la
 libnouveau_dri_la_SOURCES = $(NOUVEAU_C_FILES)
-libnouveau_dri_la_LIBADD = $(NOUVEAU_LIBS)
+libnouveau_dri_la_LIBADD = $(NVVIEUX_LIBS)

From dabec9c293ee29335f5a6d5d1d3c2b7a715605c1 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 30 Jun 2015 15:15:44 +0300
Subject: [PATCH 0160/1208] i965/fs: Relax fs_builder channel group assertion
 when force_writemask_all is on.

This assertion was meant to catch code inadvertently escaping the
control flow jail determined by the group of channel enable signals
selected by some caller, however it seems useful to be able to
increase the default execution size as long as force_writemask_all is
enabled, because force_writemask_all is an explicit indication that
there is no longer a one-to-one correspondence between channels and
SIMD components so the restriction doesn't apply.

In addition reorder the calls to fs_builder::group and ::exec_all in a
couple of places to make sure that we don't temporarily break this
invariant in the future for instructions with exec_size higher than
the dispatch width.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp       | 6 +++---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 4 ++--
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp   | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a19ea664e3f..189da1d553e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2915,7 +2915,7 @@ fs_visitor::lower_load_payload()
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
+      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
 
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
@@ -2926,8 +2926,8 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
-                                 .exec_all(inst->force_writemask_all)
+      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
+                                 .group(inst->exec_size, inst->force_sechalf)
                                  .at(block, inst);
 
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 2c36e071f54..34646d7fd74 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -99,8 +99,8 @@ namespace brw {
       fs_builder
       group(unsigned n, unsigned i) const
       {
-         assert(n <= dispatch_width() &&
-                i < dispatch_width() / n);
+         assert(force_writemask_all ||
+                (n <= dispatch_width() && i < dispatch_width() / n));
          fs_builder bld = *this;
          bld._dispatch_width = n;
          bld._group += i * n;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 291feb3ec0c..e33fe6a9802 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -180,8 +180,8 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
    int dst_width = inst->exec_size / 8;
-   const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf)
-                              .exec_all(inst->force_writemask_all);
+   const fs_builder ubld = bld.exec_all(inst->force_writemask_all)
+                              .group(inst->exec_size, inst->force_sechalf);
    fs_inst *copy;
 
    if (written > dst_width) {

From 80fc9c01dfe4cbbcf1c6b101fcdfdecbda63131e Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 30 Jun 2015 17:04:52 -0700
Subject: [PATCH 0161/1208] i965/fs: Use the builder directly for the gen6
 interpolation add(32)

Now that we can create builders with a bigger width than their parent as
long as it's exec_all, we don't need to create the instruction manually.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 79ebb2d3019..94d6a584424 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1358,12 +1358,11 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
                           BRW_REGISTER_TYPE_UW);
-      fs_inst *add =
-         new (mem_ctx) fs_inst(BRW_OPCODE_ADD, dispatch_width * 2,
-                               int_pixel_xy,
-                               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
-                               fs_reg(brw_imm_v(0x11001010)));
-      abld.exec_all().emit(add);
+
+      const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
+      dbld.ADD(int_pixel_xy,
+               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
+               fs_reg(brw_imm_v(0x11001010)));
 
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);

From 2c8f251369072ce382f651ba73ca280517d26e7f Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 24 Jun 2015 10:59:13 -0700
Subject: [PATCH 0162/1208] i965/gen9: use an unreserved surface alignment
 value

Although the horizontal and vertical alignment fields are ignored here,
0 is a reserved value for them and may cause undefined behavior. Change
the default value to an abitrary valid one.

v2: add comment about chosen value (Topi).

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/gen8_surface_state.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index b2d1a579815..bd3eb00a439 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -88,12 +88,12 @@ vertical_alignment(const struct brw_context *brw,
                    uint32_t surf_type)
 {
    /* On Gen9+ vertical alignment is ignored for 1D surfaces and when
-    * tr_mode is not TRMODE_NONE.
+    * tr_mode is not TRMODE_NONE. Set to an arbitrary non-reserved value.
     */
    if (brw->gen > 8 &&
        (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
         surf_type == BRW_SURFACE_1D))
-      return 0;
+      return GEN8_SURFACE_VALIGN_4;
 
    switch (mt->align_h) {
    case 4:
@@ -113,12 +113,12 @@ horizontal_alignment(const struct brw_context *brw,
                      uint32_t surf_type)
 {
    /* On Gen9+ horizontal alignment is ignored when tr_mode is not
-    * TRMODE_NONE.
+    * TRMODE_NONE. Set to an arbitrary non-reserved value.
     */
    if (brw->gen > 8 &&
        (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
         gen9_use_linear_1d_layout(brw, mt)))
-      return 0;
+      return GEN8_SURFACE_HALIGN_4;
 
    switch (mt->align_w) {
    case 4:

From e212a80db37b0fc9d57beb91dbca1c43ae4476a0 Mon Sep 17 00:00:00 2001
From: Alexandre Courbot <acourbot@nvidia.com>
Date: Tue, 30 Jun 2015 22:37:40 +0900
Subject: [PATCH 0163/1208] nvc0: create screen fence objects with coherent
 attribute

This is required on non-coherent architectures to ensure the value of
the fence is correct at all times. Failure to do this results in the
display freezing for a few seconds every now and then on Tegra.

The NOUVEAU_BO_COHERENT is a no-op for coherent architectures, so behavior
on x86 should not be affected by this patch.

Also bump the required libdrm version to 2.4.62, which introduced this
flag.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
Reviewed-by: Martin Peres <martin.peres@free.fr>
---
 configure.ac                                   | 2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index 60d180dbd29..ea0f069cd31 100644
--- a/configure.ac
+++ b/configure.ac
@@ -70,7 +70,7 @@ LIBDRM_REQUIRED=2.4.38
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_INTEL_REQUIRED=2.4.60
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
-LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
+LIBDRM_NOUVEAU_REQUIRED=2.4.62
 LIBDRM_FREEDRENO_REQUIRED=2.4.57
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 4c53106289c..95e246b9866 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -614,6 +614,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    struct nouveau_pushbuf *push;
    uint64_t value;
    uint32_t obj_class;
+   uint32_t flags;
    int ret;
    unsigned i;
 
@@ -669,8 +670,11 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param;
    screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096, NULL,
-                        &screen->fence.bo);
+   flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP;
+   if (dev->drm_version >= 0x01000202)
+      flags |= NOUVEAU_BO_COHERENT;
+
+   ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo);
    if (ret)
       goto fail;
    nouveau_bo_map(screen->fence.bo, 0, NULL);

From 4f57cdba2767b56eb4752f14ba9853ba6bc06d0e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 1 Jul 2015 15:18:47 -0400
Subject: [PATCH 0164/1208] mesa: reset the source packing when creating temp
 transfer image

Commit 4b249d2ee (mesa: Handle transferOps in texstore_rgba) introduced
proper transferops handling, but in updating the source to the newly
allocated temporary image neglected to reset the source packing. Set it
to the default which should be appropriate for the floats used.

Fixes: 4b249d2ee (mesa: Handle transferOps in texstore_rgba)
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91173
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/main/texstore.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 1525205981b..37c05690091 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -787,6 +787,7 @@ texstore_rgba(TEXSTORE_PARAMS)
       srcType = GL_FLOAT;
       srcRowStride = srcWidth * 4 * sizeof(float);
       srcMesaFormat = RGBA32_FLOAT;
+      srcPacking = &ctx->DefaultPacking;
    }
 
    src = (GLubyte *)

From 1087c566e3496d08fe70bc0725073e3022716dc5 Mon Sep 17 00:00:00 2001
From: Alexandre Courbot <acourbot@nvidia.com>
Date: Thu, 2 Jul 2015 11:36:55 +0900
Subject: [PATCH 0165/1208] nvc0: tune PREFER_BLIT_BASED_TEXTURE_TRANSFER
 capability

Prefer blit-based texture transfers only if the chip has dedicated VRAM
since it would translate to a copy into the same memory on shared-memory
chips.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
Reported-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 95e246b9866..3f52c85bc74 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -163,7 +163,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_CONSTANT_BUFFERS:
    case PIPE_CAP_USER_INDEX_BUFFERS:
    case PIPE_CAP_USER_VERTEX_BUFFERS:
-   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
    case PIPE_CAP_SAMPLE_SHADING:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
@@ -179,6 +178,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_COMPUTE:
       return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
 
    /* unsupported caps */
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:

From c3215ef204c0fdfc44230adbd423720169d44dcb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 2 Jul 2015 00:13:36 -0400
Subject: [PATCH 0166/1208] nv50/ir: don't emit src2 in immediate form

In the immediate form, src2 == dst, so it does not need to be emitted.
Otherwise it overlaps with the immediate value's low bits.

Fixes: 09ee907266 (nv50/ir: Fold IMM into MAD)
Cc: "10.6" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 6de8f45047a..67ea6df773c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -550,7 +550,7 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i)
 }
 
 // usual immediate form
-// - 1 to 3 sources where last is immediate (rir, gir)
+// - 1 to 3 sources where second is immediate (rir, gir)
 // - no address or predicate possible
 void
 CodeEmitterNV50::emitForm_IMM(const Instruction *i)
@@ -566,7 +566,7 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i)
    if (Target::operationSrcNr[i->op] > 1) {
       setSrc(i, 0, 0);
       setImmediate(i, 1);
-      setSrc(i, 2, 1);
+      // If there is another source, it has to be the same as the dest reg.
    } else {
       setImmediate(i, 0);
    }

From 9d408a41a3ab2fe456ebf2f7af7bad8f6c4bca17 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 29 Jun 2015 10:44:52 +0200
Subject: [PATCH 0167/1208] mesa/st: Add checks for signed/unsigned integer
 conversions in ReadPixels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These checks were in Mesa prior to commit fbba25bba, but they were
not necessary for the purpose that Mesa intended (check if we could
resolve ReadPixels via memcpy), so that commit took them away.

Unfortunately, it seems that some Gallium drivers rely on these
checks to make the decision of whether they should fallback to Mesa's
implementation of ReadPixels correctly. Michel Dänzer reported that
the following piglit test would fail on radeonsi after commit
fbba25bba:

spec@ext_texture_integer@fbo_integer_readpixels_sint_uint

This patch puts the checks back in Gallium, where they are needed.

Tested-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/mesa/state_tracker/st_cb_readpixels.c | 28 +++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index d95a608d32e..18ea43fa71a 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -43,6 +43,30 @@
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_texture.h"
 
+static boolean
+needs_integer_signed_unsigned_conversion(const struct gl_context *ctx,
+                                         GLenum format, GLenum type)
+{
+   struct gl_renderbuffer *rb =
+      _mesa_get_read_renderbuffer_for_format(ctx, format);
+
+   assert(rb);
+
+   GLenum srcType = _mesa_get_format_datatype(rb->Format);
+
+    if ((srcType == GL_INT &&
+        (type == GL_UNSIGNED_INT ||
+         type == GL_UNSIGNED_SHORT ||
+         type == GL_UNSIGNED_BYTE)) ||
+       (srcType == GL_UNSIGNED_INT &&
+        (type == GL_INT ||
+         type == GL_SHORT ||
+         type == GL_BYTE))) {
+      return TRUE;
+   }
+
+   return FALSE;
+}
 
 /**
  * This uses a blit to copy the read buffer to a texture format which matches
@@ -123,6 +147,10 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y,
       goto fallback;
    }
 
+   if (needs_integer_signed_unsigned_conversion(ctx, format, type)) {
+      goto fallback;
+   }
+
    /* Convert the source format to what is expected by ReadPixels
     * and see if it's supported. */
    src_format = util_format_linear(src->format);

From fe2b748a39ff676949fcefccf739aff967fc38c5 Mon Sep 17 00:00:00 2001
From: Mike Stroyan <mike@lunarg.com>
Date: Wed, 1 Jul 2015 10:16:28 -0600
Subject: [PATCH 0168/1208] i965: allocate at least 1 BLEND_STATE element

When there are no color buffer render targets, gen6 and gen7 still
use the first BLEND_STATE element to determine alpha test.
gen6_upload_blend_state was allocating zero elements when
ctx->Color.AlphaEnabled was false.
That left _3DSTATE_CC_STATE_POINTERS or _3DSTATE_BLEND_STATE_POINTERS
pointing to random data from some previous brw_state_batch().
That sometimes suppressed depth rendering when those bits
happened to mean COMPAREFUNC_NEVER.
This produced flickering shadows for dota2 reborn.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=80500
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/gen6_cc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 2bfa271b527..2b76e241d1a 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -51,7 +51,7 @@ gen6_upload_blend_state(struct brw_context *brw)
     * with render target 0, which will reference BLEND_STATE[0] for
     * alpha test enable.
     */
-   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
+   if (nr_draw_buffers == 0)
       nr_draw_buffers = 1;
 
    size = sizeof(*blend) * nr_draw_buffers;

From 197a19f9ed0ba12cc431542ac09f2af0a8bd0bce Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 1 Jul 2015 18:22:23 -0400
Subject: [PATCH 0169/1208] mesa/prog: relative offsets into constbufs are not
 constant

The optimization logic relies on being able to read out constbuf values
from program parameters. However that only works if there's no relative
addressing involved.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91173
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/program/prog_opt_constant_fold.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/program/prog_opt_constant_fold.c b/src/mesa/program/prog_opt_constant_fold.c
index 3811c0d8aa6..e2518e660e6 100644
--- a/src/mesa/program/prog_opt_constant_fold.c
+++ b/src/mesa/program/prog_opt_constant_fold.c
@@ -38,6 +38,8 @@ src_regs_are_constant(const struct prog_instruction *inst, unsigned num_srcs)
    for (i = 0; i < num_srcs; i++) {
       if (inst->SrcReg[i].File != PROGRAM_CONSTANT)
 	 return false;
+      if (inst->SrcReg[i].RelAddr)
+         return false;
    }
 
    return true;

From 89bd5ee64c5aa1b977f4ba832cf7772e81ee286d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 1 Jul 2015 16:00:08 -0700
Subject: [PATCH 0170/1208] nir: Don't allow copying SSA destinations

Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index f661249f9bb..78ff886218d 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -147,18 +147,18 @@ void nir_src_copy(nir_src *dest, const nir_src *src, void *mem_ctx)
 
 void nir_dest_copy(nir_dest *dest, const nir_dest *src, void *mem_ctx)
 {
-   dest->is_ssa = src->is_ssa;
-   if (src->is_ssa) {
-      dest->ssa = src->ssa;
+   /* Copying an SSA definition makes no sense whatsoever. */
+   assert(!src->is_ssa);
+
+   dest->is_ssa = false;
+
+   dest->reg.base_offset = src->reg.base_offset;
+   dest->reg.reg = src->reg.reg;
+   if (src->reg.indirect) {
+      dest->reg.indirect = ralloc(mem_ctx, nir_src);
+      nir_src_copy(dest->reg.indirect, src->reg.indirect, mem_ctx);
    } else {
-      dest->reg.base_offset = src->reg.base_offset;
-      dest->reg.reg = src->reg.reg;
-      if (src->reg.indirect) {
-         dest->reg.indirect = ralloc(mem_ctx, nir_src);
-         nir_src_copy(dest->reg.indirect, src->reg.indirect, mem_ctx);
-      } else {
-         dest->reg.indirect = NULL;
-      }
+      dest->reg.indirect = NULL;
    }
 }
 

From 7abc1e3286bc4729e144d3a247c2a275e46aaf53 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Thu, 2 Jul 2015 17:49:19 +0100
Subject: [PATCH 0171/1208] i965/fs: Don't disable SIMD16 when using the pixel
 interpolator

There was a comment saying that in SIMD16 mode the pixel interpolator
returns coords interleaved 8 channels at a time and that this requires
extra work to support. However, this interleaved format is exactly
what the PLN instruction requires so I don't think anything needs to
be done to support it apart from removing the line to disable it and
to ensure that the message lengths for the send message are correct.

I am more convinced that this is correct because as it says in the
comment this interleaved output is identical to what is given in the
thread payload. The code generated to apply the plane equation to
these coordinates is identical on SIMD16 and SIMD8 except that the
dispatch width is larger which implies no special unmangling is
needed.

Perhaps the confusion stems from the fact that the description of the
PLN instruction in the IVB PRM seems to imply that the src1 inputs are
not interleaved so it wouldn't work. However, in the HSW and BDW PRMs,
the pseudo-code is different and looks like it expects the interleaved
format. Mesa doesn't seem to generate different code on IVB to
uninterleave the payload registers and everything is working so I can
only assume that the PRM is wrong.

I tested the interpolateAt tests on HSW and did a full Piglit run on
IVB on there were no regressions.

Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index caf1300d71b..bd71404ef8d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1481,12 +1481,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_interp_var_at_centroid:
    case nir_intrinsic_interp_var_at_sample:
    case nir_intrinsic_interp_var_at_offset: {
-      /* in SIMD16 mode, the pixel interpolator returns coords interleaved
-       * 8 channels at a time, same as the barycentric coords presented in
-       * the FS payload. this requires a bit of extra work to support.
-       */
-      no16("interpolate_at_* not yet supported in SIMD16 mode.");
-
       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
       /* For most messages, we need one reg of ignored data; the hardware
@@ -1551,7 +1545,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                            bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
             }
 
-            mlen = 2;
+            mlen = 2 * dispatch_width / 8;
             inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
                             fs_reg(0u));
          }
@@ -1563,7 +1557,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       }
 
       inst->mlen = mlen;
-      inst->regs_written = 2; /* 2 floats per slot returned */
+      /* 2 floats per slot returned */
+      inst->regs_written = 2 * dispatch_width / 8;
       inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
                                INTERP_QUALIFIER_NOPERSPECTIVE;
 

From fc73f8ab8cd3975993546b5e0312d595b76d03be Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 19:04:39 -0400
Subject: [PATCH 0172/1208] tgsi: update docs for ArrayID usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/docs/source/tgsi.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 89ca172080e..4e869e72b24 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2603,6 +2603,7 @@ not relative to the specified declaration
 If no ArrayID is specified with an indirect addressing operand the whole
 register file might be accessed by this operand. This is strongly discouraged
 and will prevent packing of scalar/vec2 arrays and effective alias analysis.
+This is only legal for TEMP and CONST register files.
 
 Declaration Semantic
 ^^^^^^^^^^^^^^^^^^^^^^^^

From 29addf50e038d7323a7ac8093d93422c28ad8635 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 19:11:53 -0400
Subject: [PATCH 0173/1208] gallium/ttn: IN/OUT are only array if ArrayID != 0

Fixes issue with gallium HUD.  See this thread for details:
http://lists.freedesktop.org/archives/mesa-dev/2015-June/087140.html

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 157 +++++++++++++-----------
 1 file changed, 88 insertions(+), 69 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index e1647d53a40..bf7eb2fde02 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -184,7 +184,8 @@ ttn_emit_declaration(struct ttn_compile *c)
          c->samp_types[decl->Range.First + i] = type;
       }
    } else {
-      nir_variable *var;
+      bool is_array = (array_size > 1);
+
       assert(file == TGSI_FILE_INPUT ||
              file == TGSI_FILE_OUTPUT ||
              file == TGSI_FILE_CONSTANT);
@@ -193,76 +194,94 @@ ttn_emit_declaration(struct ttn_compile *c)
       if ((file == TGSI_FILE_CONSTANT) && decl->Declaration.Dimension)
          return;
 
-      var = rzalloc(b->shader, nir_variable);
-      var->data.driver_location = decl->Range.First;
-
-      var->type = glsl_vec4_type();
-      if (array_size > 1)
-         var->type = glsl_array_type(var->type, array_size);
-
-      switch (file) {
-      case TGSI_FILE_INPUT:
-         var->data.read_only = true;
-         var->data.mode = nir_var_shader_in;
-         var->name = ralloc_asprintf(var, "in_%d", decl->Range.First);
-
-         /* We should probably translate to a VERT_ATTRIB_* or VARYING_SLOT_*
-          * instead, but nothing in NIR core is looking at the value
-          * currently, and this is less change to drivers.
-          */
-         var->data.location = decl->Semantic.Name;
-         var->data.index = decl->Semantic.Index;
-
-         /* We definitely need to translate the interpolation field, because
-          * nir_print will decode it.
-          */
-         switch (decl->Interp.Interpolate) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            var->data.interpolation = INTERP_QUALIFIER_FLAT;
-            break;
-         case TGSI_INTERPOLATE_LINEAR:
-            var->data.interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
-            break;
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
-            break;
-         }
-
-         exec_list_push_tail(&b->shader->inputs, &var->node);
-         break;
-      case TGSI_FILE_OUTPUT: {
-         /* Since we can't load from outputs in the IR, we make temporaries
-          * for the outputs and emit stores to the real outputs at the end of
-          * the shader.
-          */
-         nir_register *reg = nir_local_reg_create(b->impl);
-         reg->num_components = 4;
-         if (array_size > 1)
-            reg->num_array_elems = array_size;
-
-         var->data.mode = nir_var_shader_out;
-         var->name = ralloc_asprintf(var, "out_%d", decl->Range.First);
-
-         var->data.location = decl->Semantic.Name;
-         var->data.index = decl->Semantic.Index;
-
-         for (i = 0; i < array_size; i++) {
-            c->output_regs[decl->Range.First + i].offset = i;
-            c->output_regs[decl->Range.First + i].reg = reg;
-         }
-
-         exec_list_push_tail(&b->shader->outputs, &var->node);
+      if ((file == TGSI_FILE_INPUT) || (file == TGSI_FILE_OUTPUT)) {
+         is_array = (is_array && decl->Declaration.Array &&
+                     (decl->Array.ArrayID != 0));
       }
-         break;
-      case TGSI_FILE_CONSTANT:
-         var->data.mode = nir_var_uniform;
-         var->name = ralloc_asprintf(var, "uniform_%d", decl->Range.First);
 
-         exec_list_push_tail(&b->shader->uniforms, &var->node);
-         break;
-      default:
-         unreachable("bad declaration file");
-         return;
+      for (i = 0; i < array_size; i++) {
+         unsigned idx = decl->Range.First + i;
+         nir_variable *var = rzalloc(b->shader, nir_variable);
+
+         var->data.driver_location = idx;
+
+         var->type = glsl_vec4_type();
+         if (is_array)
+            var->type = glsl_array_type(var->type, array_size);
+
+         switch (file) {
+         case TGSI_FILE_INPUT:
+            var->data.read_only = true;
+            var->data.mode = nir_var_shader_in;
+            var->name = ralloc_asprintf(var, "in_%d", idx);
+
+            /* We should probably translate to a VERT_ATTRIB_* or VARYING_SLOT_*
+             * instead, but nothing in NIR core is looking at the value
+             * currently, and this is less change to drivers.
+             */
+            var->data.location = decl->Semantic.Name;
+            var->data.index = decl->Semantic.Index;
+
+            /* We definitely need to translate the interpolation field, because
+             * nir_print will decode it.
+             */
+            switch (decl->Interp.Interpolate) {
+            case TGSI_INTERPOLATE_CONSTANT:
+               var->data.interpolation = INTERP_QUALIFIER_FLAT;
+               break;
+            case TGSI_INTERPOLATE_LINEAR:
+               var->data.interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
+               break;
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
+               break;
+            }
+
+            exec_list_push_tail(&b->shader->inputs, &var->node);
+            break;
+         case TGSI_FILE_OUTPUT: {
+            /* Since we can't load from outputs in the IR, we make temporaries
+             * for the outputs and emit stores to the real outputs at the end of
+             * the shader.
+             */
+            nir_register *reg = nir_local_reg_create(b->impl);
+            reg->num_components = 4;
+            if (is_array)
+               reg->num_array_elems = array_size;
+
+            var->data.mode = nir_var_shader_out;
+            var->name = ralloc_asprintf(var, "out_%d", idx);
+
+            var->data.location = decl->Semantic.Name;
+            var->data.index = decl->Semantic.Index;
+
+            if (is_array) {
+               unsigned j;
+               for (j = 0; j < array_size; j++) {
+                  c->output_regs[idx + j].offset = i + j;
+                  c->output_regs[idx + j].reg = reg;
+               }
+            } else {
+               c->output_regs[idx].offset = i;
+               c->output_regs[idx].reg = reg;
+            }
+
+            exec_list_push_tail(&b->shader->outputs, &var->node);
+         }
+            break;
+         case TGSI_FILE_CONSTANT:
+            var->data.mode = nir_var_uniform;
+            var->name = ralloc_asprintf(var, "uniform_%d", idx);
+
+            exec_list_push_tail(&b->shader->uniforms, &var->node);
+            break;
+         default:
+            unreachable("bad declaration file");
+            return;
+         }
+
+         if (is_array)
+            break;
       }
 
    }

From 959b47262b339ad6d1a072c17a1abe9735ead41d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 26 Jun 2015 15:05:32 -0400
Subject: [PATCH 0174/1208] nir/lower_phis_to_scalar: undef is trivially
 scalarizable

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir_lower_phis_to_scalar.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c b/src/glsl/nir/nir_lower_phis_to_scalar.c
index a57d253975d..739170d61fd 100644
--- a/src/glsl/nir/nir_lower_phis_to_scalar.c
+++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
@@ -75,6 +75,7 @@ is_phi_src_scalarizable(nir_phi_src *src,
       return should_lower_phi(nir_instr_as_phi(src_instr), state);
 
    case nir_instr_type_load_const:
+   case nir_instr_type_ssa_undef:
       /* These are trivially scalarizable */
       return true;
 

From 0a155538eb7e7870b99fb8b3fd8e2a268361d2c8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 27 Jun 2015 17:38:57 -0400
Subject: [PATCH 0175/1208] gallium/ttn: mark location specially in nir for
 color0-writes-all

We need to distinguish a shader that has separate writes to each MRT
from one which is supposed to write the data from MRT 0 to all the MRTs.
In TGSI this is done with a property. NIR doesn't have that, so encode
it as a funny location and decode on the other end.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c              | 7 ++++++-
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 4 ++++
 src/gallium/drivers/vc4/vc4_program.c                | 6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index bf7eb2fde02..4130697e2a7 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -253,7 +253,12 @@ ttn_emit_declaration(struct ttn_compile *c)
             var->name = ralloc_asprintf(var, "out_%d", idx);
 
             var->data.location = decl->Semantic.Name;
-            var->data.index = decl->Semantic.Index;
+            if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
+                decl->Semantic.Index == 0 &&
+                c->scan->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+               var->data.index = -1;
+            else
+               var->data.index = decl->Semantic.Index;
 
             if (is_array) {
                unsigned j;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 3b36114a5ba..fa13c4076dc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2110,6 +2110,10 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 			so->writes_pos = true;
 			break;
 		case TGSI_SEMANTIC_COLOR:
+			if (semantic_index == -1) {
+				semantic_index = 0;
+				so->color0_mrt = 1;
+			}
 			break;
 		default:
 			compile_error(ctx, "unknown FS semantic name: %s\n",
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 7b39a03f01a..a7aa3172a75 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1783,6 +1783,12 @@ ntq_setup_outputs(struct vc4_compile *c)
 
                 assert(array_len == 1);
 
+                /* NIR hack to pass through
+                 * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */
+                if (semantic_name == TGSI_SEMANTIC_COLOR &&
+                    semantic_index == -1)
+                        semantic_index = 0;
+
                 for (int i = 0; i < 4; i++) {
                         add_output(c,
                                    loc + i,

From 6b9f5cd5f7b25e9e03104fe279df74817f69fe87 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 2 Jul 2015 13:52:38 -0400
Subject: [PATCH 0176/1208] freedreno/ir3: fix indirects tracking

cp would update instr->address but not update the indirects array
resulting in sched getting confused when it had to 'spill' the address
register.  Add an ir3_instr_set_address() helper to set instr->address
and also update ir->indirects, and update all places that were writing
instr->address to use helper instead.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c              | 11 +++++++++++
 src/gallium/drivers/freedreno/ir3/ir3.h              |  5 +++++
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c |  9 +++------
 src/gallium/drivers/freedreno/ir3/ir3_cp.c           |  6 +++---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c        |  2 +-
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 1da6cf0477e..a0cb74498ec 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -706,6 +706,17 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 	return reg;
 }
 
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr)
+{
+	if (instr->address != addr) {
+		struct ir3 *ir = instr->block->shader;
+		instr->address = addr;
+		array_insert(ir->indirects, instr);
+	}
+}
+
 void
 ir3_block_clear_mark(struct ir3_block *block)
 {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index bc0144568a5..f11d8eda5f2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -285,6 +285,8 @@ struct ir3_instruction {
 
 	/* an instruction can reference at most one address register amongst
 	 * it's src/dst registers.  Beyond that, you need to insert mov's.
+	 *
+	 * NOTE: do not write this directly, use ir3_instr_set_address()
 	 */
 	struct ir3_instruction *address;
 
@@ -420,6 +422,9 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags);
 
+void ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr);
+
 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 {
 	if (instr->flags & IR3_INSTR_MARK)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index fa13c4076dc..8d8d4434d9e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -637,9 +637,8 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
 	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
-	mov->address = address;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
@@ -677,9 +676,8 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	src->instr = collect;
 	src->size  = arrsz;
 	src->offset = n;
-	mov->address = address;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
@@ -700,10 +698,9 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	dst->size  = arrsz;
 	dst->offset = n;
 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-	mov->address = address;
 	mov->fanin = collect;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 8c7c80f7aae..f4c825b2ab6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -291,7 +291,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 			instr->regs[n+1] = src_reg;
 
 			if (src_reg->flags & IR3_REG_RELATIV)
-				instr->address = reg->instr->address;
+				ir3_instr_set_address(instr, reg->instr->address);
 
 			return;
 		}
@@ -300,7 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 				!conflicts(instr->address, reg->instr->address)) {
 			src_reg->flags = new_flags;
 			instr->regs[n+1] = src_reg;
-			instr->address = reg->instr->address;
+			ir3_instr_set_address(instr, reg->instr->address);
 
 			return;
 		}
@@ -389,7 +389,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
 	}
 
 	if (instr->address)
-		instr->address = instr_cp(instr->address, NULL);
+		ir3_instr_set_address(instr, instr_cp(instr->address, NULL));
 
 	return instr;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 49a4426d163..01d72301dcc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -312,7 +312,7 @@ split_addr(struct ir3_sched_ctx *ctx)
 				/* original addr is scheduled, but new one isn't: */
 				new_addr->flags &= ~IR3_INSTR_MARK;
 			}
-			indirect->address = new_addr;
+			ir3_instr_set_address(indirect, new_addr);
 		}
 	}
 

From 2215ff2a5d5f1df5791399e1ff78b56bf06e9102 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 2 Jul 2015 14:59:08 -0400
Subject: [PATCH 0177/1208] freedreno/ir3: sched fixes for addr register usage

A handful of fixes and cleanups:

1) If we split addr/pred, we need the newly created instruction to
   end up in the unscheduled_list
2) Avoid scheduling a write to the address register if there is no
   instruction using the address register that is otherwise ready
   to schedule.  Note that I currently don't bother with the same
   logic for predicate register, since the only instructions using
   predicate (br/kill) don't take any other src registers, so this
   situation should not arise.
3) few other cosmetic cleanups

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 77 ++++++++++++++++---
 1 file changed, 65 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 01d72301dcc..414d14e0f0d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -80,12 +80,12 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 	list_delinit(&instr->node);
 
 	if (writes_addr(instr)) {
-		assert(ctx->addr == NULL);
+		debug_assert(ctx->addr == NULL);
 		ctx->addr = instr;
 	}
 
 	if (writes_pred(instr)) {
-		assert(ctx->pred == NULL);
+		debug_assert(ctx->pred == NULL);
 		ctx->pred = instr;
 	}
 
@@ -180,13 +180,13 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 * free:
 	 */
 	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
+		debug_assert(ctx->addr != instr);
 		notes->addr_conflict = true;
 		return true;
 	}
 
 	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
+		debug_assert(ctx->pred != instr);
 		notes->pred_conflict = true;
 		return true;
 	}
@@ -261,6 +261,20 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return 0;
 }
 
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *other_src;
+	foreach_ssa_src(other_src, instr) {
+		/* if dependency not scheduled, we aren't ready yet: */
+		if ((src != other_src) && !is_scheduled(other_src)) {
+			return false;
+		}
+	}
+	return true;
+}
+
 /* move eligible instructions to the priority list: */
 static unsigned
 add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
@@ -272,6 +286,29 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		int e = instr_eligibility(ctx, notes, instr);
 		if (e < 0)
 			continue;
+
+		/* For instructions that write address register we need to
+		 * make sure there is at least one instruction that uses the
+		 * addr value which is otherwise ready.
+		 *
+		 * TODO if any instructions use pred register and have other
+		 * src args, we would need to do the same for writes_pred()..
+		 */
+		if (unlikely(writes_addr(instr))) {
+			struct ir3 *ir = instr->block->shader;
+			bool ready = false;
+			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+				struct ir3_instruction *indirect = ir->indirects[i];
+				if (indirect->address != instr)
+					continue;
+				ready = could_sched(indirect, instr);
+			}
+
+			/* nothing could be scheduled, so keep looking: */
+			if (!ready)
+				continue;
+		}
+
 		min_delay = MIN2(min_delay, e);
 		if (e == 0) {
 			/* remove from unscheduled list and into priority queue: */
@@ -287,20 +324,22 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
  * instructions which depend on the current address register
  * to a clone of the instruction which wrote the address reg.
  */
-static void
+static struct ir3_instruction *
 split_addr(struct ir3_sched_ctx *ctx)
 {
-	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3 *ir;
 	struct ir3_instruction *new_addr = NULL;
 	unsigned i;
 
 	debug_assert(ctx->addr);
 
+	ir = ctx->addr->block->shader;
+
 	for (i = 0; i < ir->indirects_count; i++) {
 		struct ir3_instruction *indirect = ir->indirects[i];
 
 		/* skip instructions already scheduled: */
-		if (indirect->flags & IR3_INSTR_MARK)
+		if (is_scheduled(indirect))
 			continue;
 
 		/* remap remaining instructions using current addr
@@ -318,26 +357,30 @@ split_addr(struct ir3_sched_ctx *ctx)
 
 	/* all remaining indirects remapped to new addr: */
 	ctx->addr = NULL;
+
+	return new_addr;
 }
 
 /* "spill" the predicate register by remapping any unscheduled
  * instructions which depend on the current predicate register
  * to a clone of the instruction which wrote the address reg.
  */
-static void
+static struct ir3_instruction *
 split_pred(struct ir3_sched_ctx *ctx)
 {
-	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3 *ir;
 	struct ir3_instruction *new_pred = NULL;
 	unsigned i;
 
 	debug_assert(ctx->pred);
 
+	ir = ctx->pred->block->shader;
+
 	for (i = 0; i < ir->predicates_count; i++) {
 		struct ir3_instruction *predicated = ir->predicates[i];
 
 		/* skip instructions already scheduled: */
-		if (predicated->flags & IR3_INSTR_MARK)
+		if (is_scheduled(predicated))
 			continue;
 
 		/* remap remaining instructions using current pred
@@ -358,6 +401,8 @@ split_pred(struct ir3_sched_ctx *ctx)
 
 	/* all remaining predicated remapped to new pred: */
 	ctx->pred = NULL;
+
+	return new_pred;
 }
 
 static void
@@ -407,20 +452,28 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
 			schedule(ctx, instr);
 		} else if (delay == ~0) {
+			struct ir3_instruction *new_instr = NULL;
+
 			/* nothing available to schedule.. if we are blocked on
 			 * address/predicate register conflict, then break the
 			 * deadlock by cloning the instruction that wrote that
 			 * reg:
 			 */
 			if (notes.addr_conflict) {
-				split_addr(ctx);
+				new_instr = split_addr(ctx);
 			} else if (notes.pred_conflict) {
-				split_pred(ctx);
+				new_instr = split_pred(ctx);
 			} else {
 				debug_assert(0);
 				ctx->error = true;
 				return;
 			}
+
+			if (new_instr) {
+				list_del(&new_instr->node);
+				list_addtail(&new_instr->node, &unscheduled_list);
+			}
+
 		} else {
 			/* and if we run out of instructions that can be scheduled,
 			 * then it is time for nop's:

From a84505c71920f2c70bc8d83cee3e223cd2d976ad Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 2 Jul 2015 15:38:34 -0400
Subject: [PATCH 0178/1208] freedreno/ir3: don't be confused by eliminated
 indirects

If an instruction using address register value gets eliminated, we need
to remove it from the indirects list, otherwise it causes mayhem in
sched for scheduling address register usage.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_depth.c | 9 +++++++++
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 3a108243479..0f346b26f4a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -167,6 +167,15 @@ ir3_depth(struct ir3 *ir)
 		remove_unused_by_block(block);
 	}
 
+	/* note that we can end up with unused indirects, but we should
+	 * not end up with unused predicates.
+	 */
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *instr = ir->indirects[i];
+		if (instr->depth == DEPTH_UNUSED)
+			ir->indirects[i] = NULL;
+	}
+
 	/* cleanup unused inputs: */
 	for (i = 0; i < ir->ninputs; i++) {
 		struct ir3_instruction *in = ir->inputs[i];
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 414d14e0f0d..0cb1589eb4d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -299,6 +299,8 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 			bool ready = false;
 			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
 				struct ir3_instruction *indirect = ir->indirects[i];
+				if (!indirect)
+					continue;
 				if (indirect->address != instr)
 					continue;
 				ready = could_sched(indirect, instr);
@@ -338,6 +340,9 @@ split_addr(struct ir3_sched_ctx *ctx)
 	for (i = 0; i < ir->indirects_count; i++) {
 		struct ir3_instruction *indirect = ir->indirects[i];
 
+		if (!indirect)
+			continue;
+
 		/* skip instructions already scheduled: */
 		if (is_scheduled(indirect))
 			continue;

From b193f2b9b6ae4d071e2cdef62d4398fec5d9aad8 Mon Sep 17 00:00:00 2001
From: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
Date: Wed, 10 Jun 2015 14:42:31 +0200
Subject: [PATCH 0179/1208] egl: implement EGL_KHR_gl_texture_3D_image

Most of the code has been in place already.
---
 src/egl/drivers/dri2/egl_dri2.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 223cc5c5fbf..400ee635e77 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -1566,9 +1566,15 @@ dri2_create_image_khr_texture(_EGLDisplay *disp, _EGLContext *ctx,
       gl_target = GL_TEXTURE_2D;
       break;
    case EGL_GL_TEXTURE_3D_KHR:
-      depth = attrs.GLTextureZOffset;
-      gl_target = GL_TEXTURE_3D;
-      break;
+      if (disp->Extensions.KHR_gl_texture_3D_image) {
+         depth = attrs.GLTextureZOffset;
+         gl_target = GL_TEXTURE_3D;
+         break;
+      }
+      else {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
@@ -1868,6 +1874,14 @@ dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
       return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+   case EGL_GL_TEXTURE_3D_KHR:
+      if (disp->Extensions.KHR_gl_texture_3D_image) {
+         return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+      }
+      else {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
    case EGL_GL_RENDERBUFFER_KHR:
       return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
 #ifdef HAVE_LIBDRM

From 32aa1d769de070c4e8756922571c35deaf12a40a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 9 Jun 2015 23:08:57 +0200
Subject: [PATCH 0180/1208] egl: sort extension lists alphabetically

and add the missing KHR_gl_colorspace case.
---
 src/egl/main/eglapi.c     | 65 +++++++++++++++++++-------------------
 src/egl/main/egldisplay.h | 66 +++++++++++++++++++--------------------
 src/egl/main/eglglobals.c |  4 +--
 3 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 105e919683a..824e51ea55e 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -381,48 +381,47 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
 
    char *exts = dpy->ExtensionsString;
 
-   _EGL_CHECK_EXTENSION(MESA_drm_display);
-   _EGL_CHECK_EXTENSION(MESA_drm_image);
-   _EGL_CHECK_EXTENSION(MESA_configless_context);
-
-   _EGL_CHECK_EXTENSION(WL_bind_wayland_display);
-   _EGL_CHECK_EXTENSION(WL_create_wayland_buffer_from_image);
-
-   _EGL_CHECK_EXTENSION(KHR_image_base);
-   _EGL_CHECK_EXTENSION(KHR_image_pixmap);
-   if (dpy->Extensions.KHR_image_base && dpy->Extensions.KHR_image_pixmap)
-      _eglAppendExtension(&exts, "EGL_KHR_image");
-
-   _EGL_CHECK_EXTENSION(KHR_vg_parent_image);
-   _EGL_CHECK_EXTENSION(KHR_get_all_proc_addresses);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_2D_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_cubemap_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_renderbuffer_image);
-
-   _EGL_CHECK_EXTENSION(KHR_reusable_sync);
-   _EGL_CHECK_EXTENSION(KHR_fence_sync);
-   _EGL_CHECK_EXTENSION(KHR_wait_sync);
-   _EGL_CHECK_EXTENSION(KHR_cl_event2);
-
-   _EGL_CHECK_EXTENSION(KHR_surfaceless_context);
-   _EGL_CHECK_EXTENSION(KHR_create_context);
-
-   _EGL_CHECK_EXTENSION(NOK_swap_region);
-   _EGL_CHECK_EXTENSION(NOK_texture_from_pixmap);
-
+   /* Please keep these sorted alphabetically. */
    _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
 
    _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
 
-   _EGL_CHECK_EXTENSION(EXT_create_context_robustness);
    _EGL_CHECK_EXTENSION(EXT_buffer_age);
-   _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage);
+   _EGL_CHECK_EXTENSION(EXT_create_context_robustness);
    _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import);
+   _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage);
+
+   _EGL_CHECK_EXTENSION(KHR_cl_event2);
+   _EGL_CHECK_EXTENSION(KHR_create_context);
+   _EGL_CHECK_EXTENSION(KHR_fence_sync);
+   _EGL_CHECK_EXTENSION(KHR_get_all_proc_addresses);
+   _EGL_CHECK_EXTENSION(KHR_gl_colorspace);
+   _EGL_CHECK_EXTENSION(KHR_gl_renderbuffer_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_2D_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_cubemap_image);
+   if (dpy->Extensions.KHR_image_base && dpy->Extensions.KHR_image_pixmap)
+      _eglAppendExtension(&exts, "EGL_KHR_image");
+   _EGL_CHECK_EXTENSION(KHR_image_base);
+   _EGL_CHECK_EXTENSION(KHR_image_pixmap);
+   _EGL_CHECK_EXTENSION(KHR_reusable_sync);
+   _EGL_CHECK_EXTENSION(KHR_surfaceless_context);
+   _EGL_CHECK_EXTENSION(KHR_vg_parent_image);
+   _EGL_CHECK_EXTENSION(KHR_wait_sync);
+
+   _EGL_CHECK_EXTENSION(MESA_configless_context);
+   _EGL_CHECK_EXTENSION(MESA_drm_display);
+   _EGL_CHECK_EXTENSION(MESA_drm_image);
+   _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export);
+
+   _EGL_CHECK_EXTENSION(NOK_swap_region);
+   _EGL_CHECK_EXTENSION(NOK_texture_from_pixmap);
 
    _EGL_CHECK_EXTENSION(NV_post_sub_buffer);
 
-   _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export);
+   _EGL_CHECK_EXTENSION(WL_bind_wayland_display);
+   _EGL_CHECK_EXTENSION(WL_create_wayland_buffer_from_image);
+
 #undef _EGL_CHECK_EXTENSION
 }
 
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 0b50a36a098..8d3393564bf 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -91,46 +91,44 @@ struct _egl_resource
  */
 struct _egl_extensions
 {
-   EGLBoolean MESA_drm_display;
-   EGLBoolean MESA_drm_image;
-   EGLBoolean MESA_configless_context;
-
-   EGLBoolean WL_bind_wayland_display;
-   EGLBoolean WL_create_wayland_buffer_from_image;
-
-   EGLBoolean KHR_image_base;
-   EGLBoolean KHR_image_pixmap;
-   EGLBoolean KHR_vg_parent_image;
-   EGLBoolean KHR_get_all_proc_addresses;
-   EGLBoolean KHR_gl_colorspace;
-   EGLBoolean KHR_gl_texture_2D_image;
-   EGLBoolean KHR_gl_texture_cubemap_image;
-   EGLBoolean KHR_gl_texture_3D_image;
-   EGLBoolean KHR_gl_renderbuffer_image;
-
-   EGLBoolean KHR_reusable_sync;
-   EGLBoolean KHR_fence_sync;
-   EGLBoolean KHR_wait_sync;
-   EGLBoolean KHR_cl_event2;
-
-   EGLBoolean KHR_surfaceless_context;
-   EGLBoolean KHR_create_context;
-
-   EGLBoolean NOK_swap_region;
-   EGLBoolean NOK_texture_from_pixmap;
-
+   /* Please keep these sorted alphabetically. */
    EGLBoolean ANDROID_image_native_buffer;
 
    EGLBoolean CHROMIUM_sync_control;
 
+   EGLBoolean EXT_buffer_age;
+   EGLBoolean EXT_create_context_robustness;
+   EGLBoolean EXT_image_dma_buf_import;
+   EGLBoolean EXT_swap_buffers_with_damage;
+
+   EGLBoolean KHR_cl_event2;
+   EGLBoolean KHR_create_context;
+   EGLBoolean KHR_fence_sync;
+   EGLBoolean KHR_get_all_proc_addresses;
+   EGLBoolean KHR_gl_colorspace;
+   EGLBoolean KHR_gl_renderbuffer_image;
+   EGLBoolean KHR_gl_texture_2D_image;
+   EGLBoolean KHR_gl_texture_3D_image;
+   EGLBoolean KHR_gl_texture_cubemap_image;
+   EGLBoolean KHR_image_base;
+   EGLBoolean KHR_image_pixmap;
+   EGLBoolean KHR_reusable_sync;
+   EGLBoolean KHR_surfaceless_context;
+   EGLBoolean KHR_vg_parent_image;
+   EGLBoolean KHR_wait_sync;
+
+   EGLBoolean MESA_configless_context;
+   EGLBoolean MESA_drm_display;
+   EGLBoolean MESA_drm_image;
+   EGLBoolean MESA_image_dma_buf_export;
+
+   EGLBoolean NOK_swap_region;
+   EGLBoolean NOK_texture_from_pixmap;
+
    EGLBoolean NV_post_sub_buffer;
 
-   EGLBoolean EXT_create_context_robustness;
-   EGLBoolean EXT_buffer_age;
-   EGLBoolean EXT_swap_buffers_with_damage;
-   EGLBoolean EXT_image_dma_buf_import;
-
-   EGLBoolean MESA_image_dma_buf_export;
+   EGLBoolean WL_bind_wayland_display;
+   EGLBoolean WL_create_wayland_buffer_from_image;
 };
 
 
diff --git a/src/egl/main/eglglobals.c b/src/egl/main/eglglobals.c
index 884cff0c36b..938d9537891 100644
--- a/src/egl/main/eglglobals.c
+++ b/src/egl/main/eglglobals.c
@@ -53,10 +53,10 @@ struct _egl_global _eglGlobal =
    /* ClientExtensionsString */
    "EGL_EXT_client_extensions"
    " EGL_EXT_platform_base"
-   " EGL_EXT_platform_x11"
    " EGL_EXT_platform_wayland"
-   " EGL_MESA_platform_gbm"
+   " EGL_EXT_platform_x11"
    " EGL_KHR_client_get_all_proc_addresses"
+   " EGL_MESA_platform_gbm"
 };
 
 

From 9e127325ef461a11345df7ba6884e77c7168ab37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 10 Jun 2015 02:53:33 +0200
Subject: [PATCH 0181/1208] mesa: fix sRGB rendering for GLES1

---
 src/mesa/main/blend.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index d869fa2aa09..d36530541ec 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -930,12 +930,10 @@ void _mesa_init_color( struct gl_context * ctx )
    ctx->Color._ClampFragmentColor = GL_FALSE;
    ctx->Color.ClampReadColor = GL_FIXED_ONLY_ARB;
 
-   if (ctx->API == API_OPENGLES2) {
-      /* GLES 3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled. */
-      ctx->Color.sRGBEnabled = GL_TRUE;
-   } else {
-      ctx->Color.sRGBEnabled = GL_FALSE;
-   }
+   /* GLES 1/2/3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled
+    * if EGL_KHR_gl_colorspace has been used to request sRGB.
+    */
+   ctx->Color.sRGBEnabled = _mesa_is_gles(ctx);
 }
 
 /*@}*/

From a34e8714491022a2efde8a44972ac582f098b7ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 10 Jun 2015 02:50:42 +0200
Subject: [PATCH 0182/1208] dri/common: allow BGRX sRGB visuals

---
 src/mesa/drivers/dri/common/utils.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index 70d34e8ce55..b51b263fe46 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -213,6 +213,7 @@ driCreateConfigs(mesa_format format,
       masks = masks_table[0];
       break;
    case MESA_FORMAT_B8G8R8X8_UNORM:
+   case MESA_FORMAT_B8G8R8X8_SRGB:
       masks = masks_table[1];
       break;
    case MESA_FORMAT_B8G8R8A8_UNORM:

From 914365c0eb039f66370cff166428c703e02ad510 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 29 Apr 2015 15:27:50 +0200
Subject: [PATCH 0183/1208] r600g,radeonsi: implement get_device_reset_status

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/gallium/drivers/r600/r600_pipe.c          |  4 +++-
 src/gallium/drivers/radeon/r600_pipe_common.c | 20 +++++++++++++++++++
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeon/radeon_winsys.h    |  5 +++--
 src/gallium/drivers/radeonsi/si_pipe.c        |  4 +++-
 .../winsys/radeon/drm/radeon_drm_winsys.c     |  8 ++++++++
 6 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index e122b607b86..143e98ea0af 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -270,6 +270,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 		return 1;
 
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+		return rscreen->b.info.drm_major == 2 && rscreen->b.info.drm_minor >= 43;
+
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !R600_BIG_ENDIAN && rscreen->b.info.has_userptr;
 
@@ -332,7 +335,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 775cf53ba88..5dd28df5d5f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -196,6 +196,19 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
 	rctx->rings.dma.flushing = false;
 }
 
+static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	unsigned latest = rctx->ws->query_value(rctx->ws,
+						RADEON_GPU_RESET_COUNTER);
+
+	if (rctx->gpu_reset_counter == latest)
+		return PIPE_NO_RESET;
+
+	rctx->gpu_reset_counter = latest;
+	return PIPE_UNKNOWN_CONTEXT_RESET;
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
 			      struct r600_common_screen *rscreen)
 {
@@ -222,6 +235,13 @@ bool r600_common_context_init(struct r600_common_context *rctx,
         rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
 
+	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
+		rctx->b.get_device_reset_status = r600_get_reset_status;
+		rctx->gpu_reset_counter =
+			rctx->ws->query_value(rctx->ws,
+					      RADEON_GPU_RESET_COUNTER);
+	}
+
 	LIST_INITHEAD(&rctx->texture_buffers);
 
 	r600_init_context_texture_functions(rctx);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 51fd016229c..2b27e58e0e2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -356,6 +356,7 @@ struct r600_common_context {
 	enum chip_class			chip_class;
 	struct r600_rings		rings;
 	unsigned			initial_gfx_cs_size;
+	unsigned			gpu_reset_counter;
 
 	struct u_upload_mgr		*uploader;
 	struct u_suballocator		*allocator_so_filled_size;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 3bfbb6d75b7..48342c063c1 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -169,9 +169,10 @@ enum radeon_value_id {
     RADEON_NUM_BYTES_MOVED,
     RADEON_VRAM_USAGE,
     RADEON_GTT_USAGE,
-    RADEON_GPU_TEMPERATURE,
+    RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
     RADEON_CURRENT_SCLK,
-    RADEON_CURRENT_MCLK
+    RADEON_CURRENT_MCLK,
+    RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
 };
 
 enum radeon_bo_priority {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 480a3010d31..77b8d7d3a6a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -257,6 +257,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr;
 
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+		return sscreen->b.info.drm_major == 2 && sscreen->b.info.drm_minor >= 43;
+
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 		/* 2D tiling on CIK is supported since DRM 2.35.0 */
 		return sscreen->b.chip_class < CIK ||
@@ -293,7 +296,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index d8bb353df9d..e7b77283b8a 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -59,6 +59,10 @@
 
 #define RADEON_INFO_VA_UNMAP_WORKING	0x25
 
+#ifndef RADEON_INFO_GPU_RESET_COUNTER
+#define RADEON_INFO_GPU_RESET_COUNTER   0x26
+#endif
+
 static struct util_hash_table *fd_tab = NULL;
 pipe_static_mutex(fd_tab_mutex);
 
@@ -567,6 +571,10 @@ static uint64_t radeon_query_value(struct radeon_winsys *rws,
         radeon_get_drm_value(ws->fd, RADEON_INFO_CURRENT_GPU_MCLK,
                              "current-gpu-mclk", (uint32_t*)&retval);
         return retval;
+    case RADEON_GPU_RESET_COUNTER:
+        radeon_get_drm_value(ws->fd, RADEON_INFO_GPU_RESET_COUNTER,
+                             "gpu-reset-counter", (uint32_t*)&retval);
+        return retval;
     }
     return 0;
 }

From 7744687ddb7f1b223da6a862c282173123921023 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 29 Apr 2015 17:57:46 +0200
Subject: [PATCH 0184/1208] docs/relnotes: document create_context_robustness
 extensions

---
 docs/relnotes/10.7.0.html | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 2484243f7eb..7f55b067a4a 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -45,12 +45,14 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
-<li>GL_ARB_framebuffer_no_attachments on i965</li>
-<li>GL_ARB_shader_stencil_export on llvmpipe</li>
-<li>GL_ARB_viewport_array on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
+<li>GL_ARB_framebuffer_no_attachments on i965</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe</li>
+<li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe</li>
+<li>GL_ARB_viewport_array on radeonsi</li>
+<li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
+<li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
 </ul>
 
 <h2>Bug fixes</h2>

From 97ec2c694fe568e375ec7a2b85c1acb1e4666b54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 3 Jul 2015 16:20:32 +0200
Subject: [PATCH 0185/1208] r600g: disable single-sample fast color clear due
 to hangs

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73528
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=82186

Cc: 10.4 10.5 10.6 <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/r600/r600_blit.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 01262a59e90..8e553a8e3b9 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -393,7 +393,12 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer.state;
 
-	if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) {
+	/* Single-sample fast color clear is broken on r600g:
+	 *   https://bugs.freedesktop.org/show_bug.cgi?id=73528
+	 *   https://bugs.freedesktop.org/show_bug.cgi?id=82186
+	 */
+	if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN &&
+	    rctx->framebuffer.nr_samples > 1) {
 		evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom,
 					      &buffers, color);
 	}

From 28dda47ae4d974e3e032d60e8e0965c8c068c6d8 Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Sun, 28 Jun 2015 03:02:31 +0200
Subject: [PATCH 0186/1208] winsys/radeon: Use dup fd as key in drm-winsys hash
 table to fix ZaphodHeads.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same problem and fix as for nouveau's ZaphodHeads trouble.

See patch ...

"nouveau: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads."

... for reference.

Cc: "10.3 10.4 10.5 10.6" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index e7b77283b8a..41f8826c39d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -492,6 +492,10 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
     if (ws->gen >= DRV_R600) {
         radeon_surface_manager_free(ws->surf_man);
     }
+
+    if (ws->fd)
+        close(ws->fd);
+
     FREE(rws);
 }
 
@@ -708,7 +712,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         return NULL;
     }
 
-    ws->fd = fd;
+    ws->fd = dup(fd);
 
     if (!do_winsys_init(ws))
         goto fail;
@@ -724,7 +728,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         goto fail;
 
     if (ws->gen >= DRV_R600) {
-        ws->surf_man = radeon_surface_manager_new(fd);
+        ws->surf_man = radeon_surface_manager_new(ws->fd);
         if (!ws->surf_man)
             goto fail;
     }
@@ -765,7 +769,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         return NULL;
     }
 
-    util_hash_table_set(fd_tab, intptr_to_pointer(fd), ws);
+    util_hash_table_set(fd_tab, intptr_to_pointer(ws->fd), ws);
 
     /* We must unlock the mutex once the winsys is fully initialized, so that
      * other threads attempting to create the winsys from the same fd will
@@ -782,6 +786,9 @@ fail:
         ws->kman->destroy(ws->kman);
     if (ws->surf_man)
         radeon_surface_manager_free(ws->surf_man);
+    if (ws->fd)
+        close(ws->fd);
+
     FREE(ws);
     return NULL;
 }

From 83984f134b4a1e2829cb238c404bc82c98be6082 Mon Sep 17 00:00:00 2001
From: Erik Faye-Lund <kusmabite@gmail.com>
Date: Fri, 3 Jul 2015 09:46:01 +0200
Subject: [PATCH 0187/1208] glsl: add a missing call to _mesa_locale_init

After c61bc6e ("util: port _mesa_strto[df] to C"), "make check"
fails due to a missing _mesa_locale_init. Fixup this oversight,
by moving the stand-alone compiler initializer inside
initialize_context_to_defaults().

Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Erik Faye-Lund <kusmabite@gmail.com>
---
 src/glsl/main.cpp                   | 3 ---
 src/glsl/standalone_scaffolding.cpp | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/main.cpp b/src/glsl/main.cpp
index 58651df10a0..23412980dce 100644
--- a/src/glsl/main.cpp
+++ b/src/glsl/main.cpp
@@ -38,7 +38,6 @@
 #include "program/hash_table.h"
 #include "loop_analysis.h"
 #include "standalone_scaffolding.h"
-#include "util/strtod.h"
 
 static int glsl_version = 330;
 
@@ -47,8 +46,6 @@ initialize_context(struct gl_context *ctx, gl_api api)
 {
    initialize_context_to_defaults(ctx, api);
 
-   _mesa_locale_init();
-
    /* The standalone compiler needs to claim support for almost
     * everything in order to compile the built-in functions.
     */
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 00db61e409b..172c6f41570 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -33,6 +33,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "util/ralloc.h"
+#include "util/strtod.h"
 
 void
 _mesa_warning(struct gl_context *ctx, const char *fmt, ...)
@@ -191,4 +192,6 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
 
    for (int sh = 0; sh < MESA_SHADER_STAGES; ++sh)
       memcpy(&ctx->Const.ShaderCompilerOptions[sh], &options, sizeof(options));
+
+   _mesa_locale_init();
 }

From f70719cc4b64e12310dfe8825a8e2d4bce970673 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 3 Jul 2015 19:09:09 -0400
Subject: [PATCH 0188/1208] nv50/ir: UCMP arguments are float, so make sure
 modifiers are applied

The first argument to UCMP needs to be compared against 0, but the
latter arguments are treated as float and need to be able to properly
apply neg/abs arguments. Adjust the inferSrcType function accordingly.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 9839a0e7bce..018a1ec6d1d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -434,7 +434,6 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_USLT:
    case TGSI_OPCODE_USNE:
    case TGSI_OPCODE_USHR:
-   case TGSI_OPCODE_UCMP:
    case TGSI_OPCODE_ATOMUADD:
    case TGSI_OPCODE_ATOMXCHG:
    case TGSI_OPCODE_ATOMCAS:
@@ -2570,6 +2569,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       }
       break;
    case TGSI_OPCODE_UCMP:
+      srcTy = TYPE_U32;
+      /* fallthrough */
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
          src0 = fetchSrc(0, c);

From 9565e345285c71af064e2bb5e0ee762655310802 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 27 May 2015 12:02:40 +1000
Subject: [PATCH 0189/1208] glsl: allow precision qualifiers for AoA

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/ast_to_hir.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 8cb46beab1e..de6a86de07e 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3881,9 +3881,7 @@ ast_declarator_list::hir(exec_list *instructions,
        * an array of that type.
        */
       if (!(this->type->qualifier.precision == ast_precision_none
-          || precision_qualifier_allowed(var->type)
-          || (var->type->is_array()
-	      && precision_qualifier_allowed(var->type->fields.array)))) {
+          || precision_qualifier_allowed(var->type->without_array()))) {
 
          _mesa_glsl_error(&loc, state,
                           "precision qualifiers apply only to floating point"

From 7ecb11c81c1e2fc816b36c82657ab139eb1d84b6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 22 Feb 2015 23:35:43 +1100
Subject: [PATCH 0190/1208] glsl: update assert to support arrays of arrays

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/glsl_types.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index c6223808bb8..281ff51f3e9 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -1086,7 +1086,8 @@ glsl_type::std140_base_alignment(bool row_major) const
 	  this->fields.array->is_matrix()) {
 	 return MAX2(this->fields.array->std140_base_alignment(row_major), 16);
       } else {
-	 assert(this->fields.array->is_record());
+	 assert(this->fields.array->is_record() ||
+                this->fields.array->is_array());
 	 return this->fields.array->std140_base_alignment(row_major);
       }
    }

From 939dc2850645786b4ff76aa162e44eb9f77be805 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 14 Mar 2015 12:40:20 +1100
Subject: [PATCH 0191/1208] glsl: update types for unsized arrays of members

Assigns a new array type based on the max access of
unsized array members. This is to support arrays of arrays.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/linker.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 71a45e8db9c..8d4b40e4f0c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1250,8 +1250,7 @@ public:
                resize_interface_members(var->type->fields.array,
                                         var->get_max_ifc_array_access());
             var->change_interface_type(new_type);
-            var->type =
-               glsl_type::get_array_instance(new_type, var->type->length);
+            var->type = update_interface_members_array(var->type, new_type);
          }
       } else if (const glsl_type *ifc_type = var->get_interface_type()) {
          /* Store a pointer to the variable in the unnamed_interfaces
@@ -1299,6 +1298,21 @@ private:
       }
    }
 
+   static const glsl_type *
+   update_interface_members_array(const glsl_type *type,
+                                  const glsl_type *new_interface_type)
+   {
+      const glsl_type *element_type = type->fields.array;
+      if (element_type->is_array()) {
+         const glsl_type *new_array_type =
+            update_interface_members_array(element_type, new_interface_type);
+         return glsl_type::get_array_instance(new_array_type, type->length);
+      } else {
+         return glsl_type::get_array_instance(new_interface_type,
+                                              type->length);
+      }
+   }
+
    /**
     * Determine whether the given interface type contains unsized arrays (if
     * it doesn't, array_sizing_visitor doesn't need to process it).

From 24bf11e9c7846a2ad9624e421c85aaa1d4411cd9 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 4 Jul 2015 12:09:10 +0100
Subject: [PATCH 0192/1208] Add release notes for the 10.5.9 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 7f40d083748f3a8276e08a2fa0ae7149269ea379)
---
 docs/relnotes/10.5.9.html | 139 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 docs/relnotes/10.5.9.html

diff --git a/docs/relnotes/10.5.9.html b/docs/relnotes/10.5.9.html
new file mode 100644
index 00000000000..dd71feda4ba
--- /dev/null
+++ b/docs/relnotes/10.5.9.html
@@ -0,0 +1,139 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.9 Release Notes / July 04, 2015</h1>
+
+<p>
+Mesa 10.5.9 is a bug fix release which fixes bugs found since the 10.5.8 release.
+</p>
+<p>
+Mesa 10.5.9 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84225">Bug 84225</a> - Allow constant-index-expression sampler array indexing with GLSL-ES &lt; 300</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88999">Bug 88999</a> - [SKL] Compiz crashes after opening unity dash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89118">Bug 89118</a> - [SKL Bisected]many Ogles3conform cases core dumped</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90537">Bug 90537</a> - radeonsi bo/va conflict on RADEON_GEM_VA (rscreen-&gt;ws-&gt;buffer_from_handle returns NULL)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90839">Bug 90839</a> - [10.5.5/10.6 regression, bisected] PBO glDrawPixels no longer using blit fastpath</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90873">Bug 90873</a> - Kernel hang, TearFree On, Mate desktop environment</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91056">Bug 91056</a> - The Bard's Tale (2005, native)  has rendering issues</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91117">Bug 91117</a> - Nimbus (running in wine) has rendering issues, objects are semi-transparent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91124">Bug 91124</a> - Civilization V (in Wine) has rendering issues: text missing, menu bar corrupted</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Ben Widawsky (2):</p>
+<ul>
+  <li>i965/gen9: Implement Push Constant Buffer workaround</li>
+  <li>i965/skl: Use 1 register for uniform pull constant payload</li>
+</ul>
+
+<p>Boyan Ding (1):</p>
+<ul>
+  <li>egl/x11: Remove duplicate call to dri2_x11_add_configs_for_visuals</li>
+</ul>
+
+<p>Chris Wilson (3):</p>
+<ul>
+  <li>i965: Fix HW blitter pitch limits</li>
+  <li>i915: Blit RGBX&lt;-&gt;RGBA drawpixels</li>
+  <li>i965: Export format comparison for blitting between miptrees</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.5.8 release</li>
+  <li>configure: warn about shared_glapi &amp; xlib-glx only when both are set</li>
+  <li>configure: error out when building backend-less libEGL</li>
+  <li>configure: error out when building libEGL without shared-glapi</li>
+  <li>gbm: do not (over)link against libglapi.so</li>
+  <li>Update version to 10.5.9</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>gbm: dlopen libglapi so gbm_create_device works</li>
+</ul>
+
+<p>Ilia Mirkin (8):</p>
+<ul>
+  <li>glsl: add version checks to conditionals for builtin variable enablement</li>
+  <li>mesa: add GL_PROGRAM_PIPELINE support in KHR_debug calls</li>
+  <li>glsl: binding point is a texture unit, which is a combined space</li>
+  <li>nvc0: always put all tfb bufs into bufctx</li>
+  <li>nv50,nvc0: make sure to pushbuf_refn before putting bo into pushbuf_data</li>
+  <li>nv50/ir: propagate modifier to right arg when const-folding mad</li>
+  <li>nv50/ir: fix emission of address reg in 3rd source</li>
+  <li>nv50/ir: copy joinAt when splitting both before and after</li>
+</ul>
+
+<p>Mario Kleiner (2):</p>
+<ul>
+  <li>nouveau: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+  <li>winsys/radeon: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>winsys/radeon: Unmap GPU VM address range when destroying BO</li>
+</ul>
+
+<p>Tapani Pälli (6):</p>
+<ul>
+  <li>glsl: Allow dynamic sampler array indexing with GLSL ES &lt; 3.00</li>
+  <li>mesa/glsl: new compiler option EmitNoIndirectSampler</li>
+  <li>i915: use EmitNoIndirectSampler</li>
+  <li>mesa/st: use EmitNoIndirectSampler if !ARB_gpu_shader5</li>
+  <li>i965: use EmitNoIndirectSampler for gen &lt; 7</li>
+  <li>glsl: validate sampler array indexing for 'constant-index-expression'</li>
+</ul>
+
+
+</div>
+</body>
+</html>

From c427daa23ef4879550ed3b756d6a901475432c32 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 4 Jul 2015 12:48:39 +0100
Subject: [PATCH 0193/1208] docs: Add sha256sums for the 10.5.9 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 4a0bd3dcff3c07965828e648e14d89314d262169)
---
 docs/relnotes/10.5.9.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/relnotes/10.5.9.html b/docs/relnotes/10.5.9.html
index dd71feda4ba..a1d11c3b70d 100644
--- a/docs/relnotes/10.5.9.html
+++ b/docs/relnotes/10.5.9.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+0c081b59572ee9732e7438d34adc3817fe8cc8d4b58abc0e71fd4b4c904945cb  mesa-10.5.9.tar.gz
+71c69f31d3dbc35cfa79950e58a01d27030378d8c7ef1259a0b31d4d0487f4ec  mesa-10.5.9.tar.xz
 </pre>
 
 

From ff0a41b5d524d7f10494e0c9006389d184ed6330 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 4 Jul 2015 12:53:22 +0100
Subject: [PATCH 0194/1208] docs: add news item and link release notes for mesa
 10.5.9

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 docs/index.html    | 10 ++++++++++
 docs/relnotes.html |  1 +
 2 files changed, 11 insertions(+)

diff --git a/docs/index.html b/docs/index.html
index 29e62791131..4aa511dad1b 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,16 @@
 
 <h1>News</h1>
 
+<h2>July 04, 2015</h2>
+<p>
+<a href="relnotes/10.5.9.html">Mesa 10.5.9</a> is released.
+This is a bug-fix release.
+<br>
+NOTE: It is anticipated that 10.5.9 will be the final release in the 10.5
+series. Users of 10.5 are encouraged to migrate to the 10.6 series in order
+to obtain future fixes.
+</p>
+
 <h2>June 29, 2015</h2>
 <p>
 <a href="relnotes/10.6.1.html">Mesa 10.6.1</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 3e2d13c84ed..35ae4c7ade0 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.5.9.html">10.5.9 release notes</a>
 <li><a href="relnotes/10.6.1.html">10.6.1 release notes</a>
 <li><a href="relnotes/10.5.8.html">10.5.8 release notes</a>
 <li><a href="relnotes/10.6.0.html">10.6.0 release notes</a>

From d3f4f6b2e9380a91ab61b93c55ab36106345e7b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 24 Jun 2015 11:58:50 +0200
Subject: [PATCH 0195/1208] radeonsi: fix a hang with DrawTransformFeedback on
 4 SE chips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: 10.6 10.5 <mesa-stable@lists.freedesktop.org>
Acked-by: Christian König <christain.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2e77d85a80d..e85ed15deb7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -139,6 +139,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		    (info->indirect || info->instance_count > 1))
 			wd_switch_on_eop = true;
 
+		/* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */
+		if (info->count_from_stream_output)
+			wd_switch_on_eop = true;
+
 		/* If the WD switch is false, the IA switch must be false too. */
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}

From d50598fbad16bfb2b46800b664d382f42af64db0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 13:19:45 +0200
Subject: [PATCH 0196/1208] gallium/docs: remove out-of-date document about
 D3D11 features

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/docs/d3d11ddi.txt | 462 ----------------------------------
 1 file changed, 462 deletions(-)
 delete mode 100644 src/gallium/docs/d3d11ddi.txt

diff --git a/src/gallium/docs/d3d11ddi.txt b/src/gallium/docs/d3d11ddi.txt
deleted file mode 100644
index a7036481411..00000000000
--- a/src/gallium/docs/d3d11ddi.txt
+++ /dev/null
@@ -1,462 +0,0 @@
-This document compares the D3D10/D3D11 device driver interface with Gallium.
-It is written from the perspective of a developer implementing a D3D10/D3D11 driver as a Gallium state tracker.
-
-Note that naming and other cosmetic differences are not noted, since they don't really matter and would severely clutter the document.
-Gallium/OpenGL terminology is used in preference to D3D terminology.
-
-NOTE: this document tries to be complete but most likely isn't fully complete and also not fully correct: please submit patches if you spot anything incorrect
-
-Also note that this is specifically for the DirectX 10/11 Windows Vista/7 DDI interfaces.
-DirectX 9 has both user-mode (for Vista) and kernel mode (pre-Vista) interfaces, but they are significantly different from Gallium due to the presence of a lot of fixed function functionality.
-
-The user-visible DirectX 10/11 interfaces are distinct from the kernel DDI, but they match very closely.
-
-* Accessing Microsoft documentation
-
-See http://msdn.microsoft.com/en-us/library/dd445501.aspx ("D3D11DDI_DEVICEFUNCS") for D3D documentation.
-
-Also see http://download.microsoft.com/download/f/2/d/f2d5ee2c-b7ba-4cd0-9686-b6508b5479a1/direct3d10_web.pdf ("The Direct3D 10 System" by David Blythe) for an introduction to Direct3D 10 and the rationale for its design.
-
-The Windows Driver Kit contains the actual headers, as well as shader bytecode documentation.
-
-To get the headers from Linux, run the following, in a dedicated directory:
-wget http://download.microsoft.com/download/4/A/2/4A25C7D5-EFBE-4182-B6A9-AE6850409A78/GRMWDK_EN_7600_1.ISO
-sudo mount -o loop GRMWDK_EN_7600_1.ISO /mnt/tmp
-cabextract -x /mnt/tmp/wdk/headers_cab001.cab
-rename 's/^_(.*)_[0-9]*$/$1/' *
-sudo umount /mnt/tmp
-
-d3d10umddi.h contains the DDI interface analyzed in this document: note that it is much easier to read this online on MSDN.
-d3d{10,11}TokenizedProgramFormat.hpp contains the shader bytecode definitions: this is not available on MSDN.
-d3d9types.h contains DX9 shader bytecode, and DX9 types
-d3dumddi.h contains the DirectX 9 DDI interface
-
-* Glossary
-
-BC1: DXT1
-BC2: DXT3
-BC3: DXT5
-BC5: RGTC1
-BC6H: BPTC float
-BC7: BPTC
-CS = compute shader: OpenCL-like shader
-DS = domain shader: tessellation evaluation shader
-HS = hull shader: tessellation control shader
-IA = input assembler: primitive assembly
-Input layout: vertex elements
-OM = output merger: blender
-PS = pixel shader: fragment shader
-Primitive topology: primitive type
-Resource: buffer or texture
-Shader resource (view): sampler view
-SO = stream out: transform feedback
-Unordered access view: view supporting random read/write access (usually from compute shaders)
-
-* Legend
-
--: features D3D11 has and Gallium lacks
-+: features Gallium has and D3D11 lacks
-!: differences between D3D11 and Gallium
-*: possible improvements to Gallium
->: references to comparisons of special enumerations
-#: comment
-
-* Gallium functions with no direct D3D10/D3D11 equivalent
-
-clear
-	+ Gallium supports clearing both render targets and depth/stencil with a single call
-
-fence_signalled
-fence_finish
-	+ D3D10/D3D11 don't appear to support explicit fencing; queries can often substitute though, and flushing is supported
-
-set_clip_state
-	+ Gallium supports fixed function user clip planes, D3D10/D3D11 only support using the vertex shader for them
-
-set_polygon_stipple
-	+ Gallium supports polygon stipple
-
-clearRT/clearDS
-	+ Gallium supports subrectangle fills of surfaces, D3D10 only supports full clears of views
-
-* DirectX 10/11 DDI functions and Gallium equivalents
-
-AbandonCommandList (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CalcPrivateBlendStateSize
-CalcPrivateDepthStencilStateSize
-CalcPrivateDepthStencilViewSize
-CalcPrivateElementLayoutSize
-CalcPrivateGeometryShaderWithStreamOutput
-CalcPrivateOpenedResourceSize
-CalcPrivateQuerySize
-CalcPrivateRasterizerStateSize
-CalcPrivateRenderTargetViewSize
-CalcPrivateResourceSize
-CalcPrivateSamplerSize
-CalcPrivateShaderResourceViewSize
-CalcPrivateShaderSize
-CalcDeferredContextHandleSize (D3D11 only)
-CalcPrivateCommandListSize (D3D11 only)
-CalcPrivateDeferredContextSize (D3D11 only)
-CalcPrivateTessellationShaderSize (D3D11 only)
-CalcPrivateUnorderedAccessViewSize (D3D11 only)
-	! D3D11 allocates private objects itself, using the size computed here
-	* Gallium could do something similar to be able to put the private data inline into state tracker objects: this would allow them to fit in the same cacheline and improve performance
-
-CheckDeferredContextHandleSizes (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CheckFormatSupport -> screen->is_format_supported
-	! Gallium passes usages to this function, D3D11 returns them
-	- Gallium does not differentiate between blendable and non-blendable render targets
-	! Gallium includes sample count directly, D3D11 uses additional query 
-
-CheckMultisampleQualityLevels
-	! is merged with is_format_supported
-
-CommandListExecute (D3D11 only)
-	- Gallium does not support command lists
-
-CopyStructureCount (D3D11 only)
-	- Gallium does not support unordered access views (views that can be written to arbitrarily from compute shaders)
-
-ClearDepthStencilView -> clear_depth_stencil
-ClearRenderTargetView -> clear_render_target
-	# D3D11 is not totally clear about whether this applies to any view or only a "currently-bound view"
-	+ Gallium allows to clear both depth/stencil and render target(s) in a single operation
-	+ Gallium supports double-precision depth values (but not rgba values!)
-	* May want to also support double-precision rgba or use "float" for "depth"
-
-ClearUnorderedAccessViewFloat (D3D11 only)
-ClearUnorderedAccessViewUint (D3D11 only)
-	- Gallium does not support unordered access views (views that can be written to arbitrarily from compute shaders)
-
-CreateBlendState (extended in D3D10.1) -> create_blend_state
-	# D3D10 does not support per-RT blend modes (but per-RT blending), only D3D10.1 does
-	+ Gallium supports logic ops
-	+ Gallium supports dithering
-	+ Gallium supports using the broadcast alpha component of the blend constant color
-
-CreateCommandList (D3D11 only)
-	- Gallium does not support command lists
-
-CreateComputeShader (D3D11 only)
-	- Gallium does not support compute shaders
-
-CreateDeferredContext (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CreateDomainShader (D3D11 only)
-	- Gallium does not support domain shaders
-
-CreateHullShader (D3D11 only)
-	- Gallium does not support hull shaders
-
-CreateUnorderedAccessView (D3D11 only)
-	- Gallium does not support unordered access views
-
-CreateDepthStencilState -> create_depth_stencil_alpha_state
-	! D3D11 has both a global stencil enable, and front/back enables; Gallium has only front/back enables
-	+ Gallium has per-face writemask/valuemasks, D3D11 uses the same value for back and front
-	+ Gallium supports the alpha test, which D3D11 lacks
-
-CreateDepthStencilView -> create_surface
-CreateRenderTargetView -> create_surface
-	! Gallium merges depthstencil and rendertarget views into pipe_surface
-	- lack of render-to-buffer support
-	+ Gallium supports using 3D texture zslices as a depth/stencil buffer (in theory)
-
-CreateElementLayout -> create_vertex_elements_state
-	! D3D11 allows sparse vertex elements (via InputRegister); in Gallium they must be specified sequentially
-	! D3D11 has an extra flag (InputSlotClass) that is the same as instance_divisor == 0
-
-CreateGeometryShader -> create_gs_state
-CreateGeometryShaderWithStreamOutput -> create_gs_state + create_stream_output_state
-CreatePixelShader -> create_fs_state
-CreateVertexShader -> create_vs_state
-	> bytecode is different (see D3d10tokenizedprogramformat.hpp)
-	! D3D11 describes input/outputs separately from bytecode; Gallium has the tgsi_scan.c module to extract it from TGSI
-	@ TODO: look into DirectX 10/11 semantics specification and bytecode
-
-CheckCounter
-CheckCounterInfo
-CreateQuery -> create_query
-	! D3D11 implements fences with "event" queries
-	* others are performance counters, we may want them but they are not critical
-
-CreateRasterizerState
-	+ Gallium, like OpenGL, supports PIPE_POLYGON_MODE_POINT
-	+ Gallium, like OpenGL, supports per-face polygon fill modes
-	+ Gallium, like OpenGL, supports culling everything
-	+ Gallium, like OpenGL, supports two-side lighting; D3D11 only has the facing attribute
-	+ Gallium, like OpenGL, supports per-fill-mode polygon offset enables
-	+ Gallium, like OpenGL, supports polygon smoothing
-	+ Gallium, like OpenGL, supports polygon stipple
-	+ Gallium, like OpenGL, supports point smoothing
-	+ Gallium, like OpenGL, supports point sprites
-	+ Gallium supports specifying point quad rasterization
-	+ Gallium, like OpenGL, supports per-point point size
-	+ Gallium, like OpenGL, supports line smoothing
-	+ Gallium, like OpenGL, supports line stipple
-	+ Gallium supports line last pixel rule specification
-	+ Gallium, like OpenGL, supports provoking vertex convention
-	+ Gallium supports D3D9 rasterization rules
-	+ Gallium supports fixed line width
-	+ Gallium supports fixed point size
-
-CreateResource -> texture_create or buffer_create
-	! D3D11 passes the dimensions of all mipmap levels to the create call, while Gallium has an implicit floor(x/2) rule
-	# Note that hardware often has the implicit rule, so the D3D11 interface seems to make little sense
-	# Also, the D3D11 API does not allow the user to specify mipmap sizes, so this really seems a dubious decision on Microsoft's part
-	- D3D11 supports specifying initial data to write in the resource
-	- Gallium does not support unordered access buffers
-	! D3D11 specifies mapping flags (i.e. read/write/discard);:it's unclear what they are used for here
-	- D3D11 supports odd things in the D3D10_DDI_RESOURCE_MISC_FLAG enum (D3D10_DDI_RESOURCE_MISC_DISCARD_ON_PRESENT, D3D11_DDI_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS, D3D11_DDI_RESOURCE_MISC_BUFFER_STRUCTURED)
-	- Gallium does not support indirect draw call parameter buffers
-	! D3D11 supports specifying hardware modes and other stuff here for scanout resources
-	! D3D11 implements cube maps as 2D array textures
-
-CreateSampler
-	- D3D11 supports a monochrome convolution filter for "text filtering"
-	+ Gallium supports non-normalized coordinates
-	+ Gallium supports CLAMP, MIRROR_CLAMP and MIRROR_CLAMP_TO_BORDER
-	+ Gallium supports setting min/max/mip filters and anisotropy independently
-
-CreateShaderResourceView (extended in D3D10.1) -> create_sampler_view
-	+ Gallium supports specifying a swizzle
-	! D3D11 implements "cube views" as views into a 2D array texture
-
-CsSetConstantBuffers (D3D11 only)
-CsSetSamplers (D3D11 only)
-CsSetShader (D3D11 only)
-CsSetShaderResources (D3D11 only)
-CsSetShaderWithIfaces (D3D11 only)
-CsSetUnorderedAccessViews (D3D11 only)
-	- Gallium does not support compute shaders
-
-DestroyBlendState
-DestroyCommandList (D3D11 only)
-DestroyDepthStencilState
-DestroyDepthStencilView
-DestroyDevice
-DestroyElementLayout
-DestroyQuery
-DestroyRasterizerState
-DestroyRenderTargetView
-DestroyResource
-DestroySampler
-DestroyShader
-DestroyShaderResourceView
-DestroyUnorderedAccessView (D3D11 only)
-	# these are trivial
-
-Dispatch (D3D11 only)
-	- Gallium does not support compute shaders
-
-DispatchIndirect (D3D11 only)
-	- Gallium does not support compute shaders
-
-Draw -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawAuto -> draw_auto
-
-DrawIndexed -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-	+ D3D11 lacks explicit range, which is required for OpenGL
-
-DrawIndexedInstanced -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawIndexedInstancedIndirect (D3D11 only)
-	# this allows to use an hardware buffer to specify the parameters for multiple draw_vbo calls
-	- Gallium does not support draw call parameter buffers and indirect draw
-
-DrawInstanced -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawInstancedIndirect (D3D11 only)
-	# this allows to use an hardware buffer to specify the parameters for multiple draw_vbo calls
-	- Gallium does not support draw call parameter buffers and indirect draws
-
-DsSetConstantBuffers (D3D11 only)
-DsSetSamplers (D3D11 only)
-DsSetShader (D3D11 only)
-DsSetShaderResources (D3D11 only)
-DsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support domain shaders
-
-Flush -> flush
-	! Gallium supports fencing, D3D11 just has a dumb glFlush-like function
-
-GenMips
-	- Gallium lacks a mipmap generation interface, and does this manually with the 3D engine
-	* it may be useful to add a mipmap generation interface, since the hardware (especially older cards) may have a better way than using the 3D engine
-
-GsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_GEOMETRY, i, phBuffers[i])
-
-GsSetSamplers
-	- Gallium does not support sampling in geometry shaders
-
-GsSetShader -> bind_gs_state
-
-GsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-GsSetShaderResources
-	- Gallium does not support sampling in geometry shaders
-
-HsSetConstantBuffers (D3D11 only)
-HsSetSamplers (D3D11 only)
-HsSetShader (D3D11 only)
-HsSetShaderResources (D3D11 only)
-HsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support hull shaders
-
-IaSetIndexBuffer -> set_index_buffer
-	+ Gallium supports 8-bit indices
-	# the D3D11 interface allows index-size-unaligned byte offsets into the index buffer; most drivers will abort with an assertion
-
-IaSetInputLayout -> bind_vertex_elements_state
-
-IaSetTopology
-	! Gallium passes the topology = primitive type to the draw calls
-	* may want to add an interface for this
-	- Gallium lacks support for DirectX 11 tessellated primitives
-	+ Gallium supports line loops, triangle fans, quads, quad strips and polygons
-
-IaSetVertexBuffers -> set_vertex_buffers
-	- Gallium only allows setting all vertex buffers at once, while D3D11 supports setting a subset
-
-OpenResource -> texture_from_handle
-
-PsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_FRAGMENT, i, phBuffers[i])
-	* may want to split into fragment/vertex-specific versions
-
-PsSetSamplers -> bind_fragment_sampler_states
-	* may want to allow binding subsets instead of all at once
-
-PsSetShader -> bind_fs_state
-
-PsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-PsSetShaderResources -> set_sampler_views
-	* may want to allow binding subsets instead of all at once
-
-QueryBegin -> begin_query
-
-QueryEnd -> end_query
-
-QueryGetData -> get_query_result
-	- D3D11 supports reading an arbitrary data chunk for query results, Gallium only supports reading a 64-bit integer
-	+ D3D11 doesn't seem to support actually waiting for the query result (?!)
-	- D3D11 supports optionally not flushing command buffers here and instead returning DXGI_DDI_ERR_WASSTILLDRAWING
-
-RecycleCommandList (D3D11 only)
-RecycleCreateCommandList (D3D11 only)
-RecycleDestroyCommandList (D3D11 only)
-	- Gallium does not support command lists
-
-RecycleCreateDeferredContext (D3D11 only)
-	- Gallium does not support deferred contexts
-
-RelocateDeviceFuncs
-	- Gallium does not support moving pipe_context, while D3D11 seems to, using this
-
-ResetPrimitiveID (D3D10.1+ only, #ifdef D3D10PSGP)
-	# used to do vertex processing on the GPU on Intel G45 chipsets when it is faster this way (see www.intel.com/Assets/PDF/whitepaper/322931.pdf)
-	# presumably this resets the primitive id system value
-	- Gallium does not support vertex pipeline bypass anymore
-
-ResourceCopy
-ResourceCopyRegion
-ResourceConvert (D3D10.1+ only)
-ResourceConvertRegion (D3D10.1+ only)
-	-> resource_copy_region
-
-ResourceIsStagingBusy ->
-	- Gallium lacks this
-	+ Gallium can use fences
-
-ResourceReadAfterWriteHazard
-	- Gallium lacks this
-
-ResourceResolveSubresource -> blit
-
-ResourceMap
-ResourceUnmap
-DynamicConstantBufferMapDiscard
-DynamicConstantBufferUnmap
-DynamicIABufferMapDiscard
-DynamicIABufferMapNoOverwrite
-DynamicIABufferUnmap
-DynamicResourceMapDiscard
-DynamicResourceUnmap
-StagingResourceMap
-StagingResourceUnmap
-	-> transfer functions
-	! Gallium and D3D have different semantics for transfers
-	* D3D separates vertex/index buffers from constant buffers
-	! D3D separates some buffer flags into specialized calls
-
-ResourceUpdateSubresourceUP -> transfer functionality, transfer_inline_write in gallium-resources
-DefaultConstantBufferUpdateSubresourceUP -> transfer functionality, transfer_inline_write in gallium-resources
-
-SetBlendState -> bind_blend_state, set_blend_color and set_sample_mask
-	! D3D11 fuses bind_blend_state, set_blend_color and set_sample_mask in a single function
-
-SetDepthStencilState -> bind_depth_stencil_alpha_state and set_stencil_ref
-	! D3D11 fuses bind_depth_stencil_alpha_state and set_stencil_ref in a single function
-
-SetPredication -> render_condition
-	# here both D3D11 and Gallium seem very limited (hardware is too, probably though)
-	# ideally, we should support nested conditional rendering, as well as more complex tests (checking for an arbitrary range, after an AND with arbitrary mask )
-	# of couse, hardware support is probably as limited as OpenGL/D3D11
-	+ Gallium, like NV_conditional_render, supports by-region and wait flags
-	- D3D11 supports predication conditional on being equal any value (along with occlusion predicates); Gallium only supports on non-zero
-
-SetRasterizerState -> bind_rasterizer_state
-
-SetRenderTargets (extended in D3D11) -> set_framebuffer_state
-	! Gallium passed a width/height here, D3D11 does not
-	! Gallium lacks ClearTargets (but this is redundant and the driver can trivially compute this if desired)
-	- Gallium does not support unordered access views
-	- Gallium does not support geometry shader selection of texture array image / 3D texture zslice
-
-SetResourceMinLOD (D3D11 only) -> pipe_sampler_view::tex::first_level
-
-SetScissorRects
-	- Gallium lacks support for multiple geometry-shader-selectable scissor rectangles D3D11 has
-
-SetTextFilterSize
-	- Gallium lacks support for text filters
-
-SetVertexPipelineOutput (D3D10.1+ only)
-	# used to do vertex processing on the GPU on Intel G45 chipsets when it is faster this way (see www.intel.com/Assets/PDF/whitepaper/322931.pdf)
-	- Gallium does not support vertex pipeline bypass anymore
-
-SetViewports
-	- Gallium lacks support for multiple geometry-shader-selectable viewports D3D11 has
-
-ShaderResourceViewReadAfterWriteHazard
-	- Gallium lacks support for this
-	+ Gallium has texture_barrier
-
-SoSetTargets -> set_stream_output_buffers
-
-VsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_VERTEX, i, phBuffers[i])
-	* may want to split into fragment/vertex-specific versions
-
-VsSetSamplers -> bind_vertex_sampler_states
-	* may want to allow binding subsets instead of all at once
-
-VsSetShader -> bind_vs_state
-
-VsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-VsSetShaderResources  -> set_sampler_views
-	* may want to allow binding subsets instead of all at once

From 3da1c7919d0dffee3887f390fcf29893016e3043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 13:13:16 +0200
Subject: [PATCH 0197/1208] gallium: handle fence_finish timeout in various
 drivers

I copied what fence_signalled does.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/freedreno/freedreno_fence.c | 3 +++
 src/gallium/drivers/i915/i915_screen.c          | 3 +++
 src/gallium/drivers/llvmpipe/lp_screen.c        | 3 +++
 src/gallium/drivers/nouveau/nouveau_screen.c    | 3 +++
 src/gallium/drivers/svga/svga_screen.c          | 3 +++
 5 files changed, 15 insertions(+)

diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 375e58f7022..04a9feacd58 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -69,6 +69,9 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *fence,
 		uint64_t timeout)
 {
+	if (!timeout)
+		return fd_screen_fence_signalled(screen, fence);
+
 	if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
 		return false;
 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 0590da07b9a..10508ced2be 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -478,6 +478,9 @@ i915_fence_finish(struct pipe_screen *screen,
 {
    struct i915_screen *is = i915_screen(screen);
 
+   if (!timeout)
+      return is->iws->fence_signalled(is->iws, fence) == 1;
+
    return is->iws->fence_finish(is->iws, fence) == 1;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 47f1897c732..d3392e7d63d 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -550,6 +550,9 @@ llvmpipe_fence_finish(struct pipe_screen *screen,
 {
    struct lp_fence *f = (struct lp_fence *) fence_handle;
 
+   if (!timeout)
+      return lp_fence_signalled(f);
+
    lp_fence_wait(f);
    return TRUE;
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index c6e5074db19..9f77db082ea 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -79,6 +79,9 @@ nouveau_screen_fence_finish(struct pipe_screen *screen,
 			    struct pipe_fence_handle *pfence,
                             uint64_t timeout)
 {
+	if (!timeout)
+		return nouveau_fence_signalled(nouveau_fence(pfence));
+
 	return nouveau_fence_wait(nouveau_fence(pfence));
 }
 
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 770f4933b4c..8eff3d5e337 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -560,6 +560,9 @@ svga_fence_finish(struct pipe_screen *screen,
 {
    struct svga_winsys_screen *sws = svga_screen(screen)->sws;
 
+   if (!timeout)
+      return sws->fence_signalled(sws, fence, 0) == 0;
+
    SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
             __FUNCTION__, fence);
 

From bd214f030f1cb102a7fe41f40f140d4de2b304c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 16:28:53 +0200
Subject: [PATCH 0198/1208] gallium: use fence_finish instead of
 fence_signalled in state trackers

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/state_trackers/clover/core/event.cpp | 2 +-
 src/gallium/state_trackers/nine/swapchain9.c     | 2 +-
 src/gallium/state_trackers/vdpau/presentation.c  | 2 +-
 src/gallium/state_trackers/xvmc/surface.c        | 2 +-
 src/mesa/state_tracker/st_cb_syncobj.c           | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/state_trackers/clover/core/event.cpp b/src/gallium/state_trackers/clover/core/event.cpp
index e1f9de07f83..d75b8397794 100644
--- a/src/gallium/state_trackers/clover/core/event.cpp
+++ b/src/gallium/state_trackers/clover/core/event.cpp
@@ -141,7 +141,7 @@ hard_event::status() const {
    else if (!_fence)
       return CL_QUEUED;
 
-   else if (!screen->fence_signalled(screen, _fence))
+   else if (!screen->fence_finish(screen, _fence, 0))
       return CL_SUBMITTED;
 
    else
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index c40bc602460..26164f0b118 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -726,7 +726,7 @@ bypass_rendering:
         BOOL still_draw = FALSE;
         fence = swap_fences_see_front(This);
         if (fence) {
-            still_draw = !This->screen->fence_signalled(This->screen, fence);
+            still_draw = !This->screen->fence_finish(This->screen, fence, 0);
             This->screen->fence_reference(This->screen, &fence, NULL);
         }
         if (still_draw)
diff --git a/src/gallium/state_trackers/vdpau/presentation.c b/src/gallium/state_trackers/vdpau/presentation.c
index 7f8dbed7ee2..e53303708b2 100644
--- a/src/gallium/state_trackers/vdpau/presentation.c
+++ b/src/gallium/state_trackers/vdpau/presentation.c
@@ -369,7 +369,7 @@ vlVdpPresentationQueueQuerySurfaceStatus(VdpPresentationQueue presentation_queue
    } else {
       pipe_mutex_lock(pq->device->mutex);
       screen = pq->device->vscreen->pscreen;
-      if (screen->fence_signalled(screen, surf->fence)) {
+      if (screen->fence_finish(screen, surf->fence, 0)) {
          screen->fence_reference(screen, &surf->fence, NULL);
          *status = VDP_PRESENTATION_QUEUE_STATUS_VISIBLE;
          pipe_mutex_unlock(pq->device->mutex);
diff --git a/src/gallium/state_trackers/xvmc/surface.c b/src/gallium/state_trackers/xvmc/surface.c
index f32e85bf489..15eae59ff6e 100644
--- a/src/gallium/state_trackers/xvmc/surface.c
+++ b/src/gallium/state_trackers/xvmc/surface.c
@@ -489,7 +489,7 @@ Status XvMCGetSurfaceStatus(Display *dpy, XvMCSurface *surface, int *status)
    *status = 0;
 
    if (surface_priv->fence)
-      if (!pipe->screen->fence_signalled(pipe->screen, surface_priv->fence))
+      if (!pipe->screen->fence_finish(pipe->screen, surface_priv->fence, 0))
          *status |= XVMC_RENDERING;
 
    return Success;
diff --git a/src/mesa/state_tracker/st_cb_syncobj.c b/src/mesa/state_tracker/st_cb_syncobj.c
index 6d875b851a2..d23c93d98d8 100644
--- a/src/mesa/state_tracker/st_cb_syncobj.c
+++ b/src/mesa/state_tracker/st_cb_syncobj.c
@@ -81,7 +81,7 @@ static void st_check_sync(struct gl_context *ctx, struct gl_sync_object *obj)
    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
    struct st_sync_object *so = (struct st_sync_object*)obj;
 
-   if (so->fence && screen->fence_signalled(screen, so->fence)) {
+   if (so->fence && screen->fence_finish(screen, so->fence, 0)) {
       screen->fence_reference(screen, &so->fence, NULL);
       so->b.StatusFlag = GL_TRUE;
    }

From 5a69929683b15d48e4f2fd47e2c816e716ab60ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 16:34:31 +0200
Subject: [PATCH 0199/1208] gallium: remove redundant
 pipe_context::fence_signalled

fence_finish(timeout=0) does the same thing

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../drivers/freedreno/freedreno_screen.c      |  1 -
 src/gallium/drivers/i915/i915_screen.c        | 10 --------
 src/gallium/drivers/ilo/ilo_screen.c          |  8 -------
 src/gallium/drivers/llvmpipe/lp_screen.c      | 13 ----------
 src/gallium/drivers/nouveau/nouveau_screen.c  |  8 -------
 src/gallium/drivers/r300/r300_screen.c        |  9 -------
 src/gallium/drivers/radeon/r600_pipe_common.c |  9 -------
 src/gallium/drivers/rbug/rbug_screen.c        | 12 ----------
 src/gallium/drivers/softpipe/sp_fence.c       | 10 --------
 src/gallium/drivers/svga/svga_screen.c        | 10 --------
 src/gallium/drivers/trace/tr_screen.c         | 24 -------------------
 src/gallium/drivers/vc4/vc4_fence.c           | 11 ---------
 src/gallium/include/pipe/p_screen.h           |  6 -----
 13 files changed, 131 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 7330cd9c788..473823f1390 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -546,7 +546,6 @@ fd_screen_create(struct fd_device *dev)
 	pscreen->get_timestamp = fd_screen_get_timestamp;
 
 	pscreen->fence_reference = fd_screen_fence_ref;
-	pscreen->fence_signalled = fd_screen_fence_signalled;
 	pscreen->fence_finish = fd_screen_fence_finish;
 
 	util_format_s3tc_init();
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 10508ced2be..dc0004366b1 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -462,15 +462,6 @@ i915_fence_reference(struct pipe_screen *screen,
    is->iws->fence_reference(is->iws, ptr, fence);
 }
 
-static boolean
-i915_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence)
-{
-   struct i915_screen *is = i915_screen(screen);
-
-   return is->iws->fence_signalled(is->iws, fence) == 1;
-}
-
 static boolean
 i915_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
@@ -568,7 +559,6 @@ i915_screen_create(struct i915_winsys *iws)
    is->base.context_create = i915_create_context;
 
    is->base.fence_reference = i915_fence_reference;
-   is->base.fence_signalled = i915_fence_signalled;
    is->base.fence_finish = i915_fence_finish;
 
    i915_init_screen_resource_functions(is);
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index faebb9279b3..eb6c0466eb9 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -673,13 +673,6 @@ ilo_screen_fence_finish(struct pipe_screen *screen,
    return signaled;
 }
 
-static boolean
-ilo_screen_fence_signalled(struct pipe_screen *screen,
-                           struct pipe_fence_handle *fence)
-{
-   return ilo_screen_fence_finish(screen, fence, 0);
-}
-
 /**
  * Create a fence for \p bo.  When \p bo is not NULL, it must be submitted
  * before waited on or checked.
@@ -746,7 +739,6 @@ ilo_screen_create(struct intel_winsys *ws)
    is->base.flush_frontbuffer = NULL;
 
    is->base.fence_reference = ilo_screen_fence_reference;
-   is->base.fence_signalled = ilo_screen_fence_signalled;
    is->base.fence_finish = ilo_screen_fence_finish;
 
    is->base.get_driver_query_info = NULL;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index d3392e7d63d..466c5d0cef8 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -528,18 +528,6 @@ llvmpipe_fence_reference(struct pipe_screen *screen,
 }
 
 
-/**
- * Has the fence been executed/finished?
- */
-static boolean
-llvmpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence)
-{
-   struct lp_fence *f = (struct lp_fence *) fence;
-   return lp_fence_signalled(f);
-}
-
-
 /**
  * Wait for the fence to finish.
  */
@@ -604,7 +592,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
    screen->base.context_create = llvmpipe_create_context;
    screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
    screen->base.fence_reference = llvmpipe_fence_reference;
-   screen->base.fence_signalled = llvmpipe_fence_signalled;
    screen->base.fence_finish = llvmpipe_fence_finish;
 
    screen->base.get_timestamp = llvmpipe_get_timestamp;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 9f77db082ea..e5b3c159c84 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -67,13 +67,6 @@ nouveau_screen_fence_ref(struct pipe_screen *pscreen,
 	nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
 }
 
-static boolean
-nouveau_screen_fence_signalled(struct pipe_screen *screen,
-                               struct pipe_fence_handle *pfence)
-{
-        return nouveau_fence_signalled(nouveau_fence(pfence));
-}
-
 static boolean
 nouveau_screen_fence_finish(struct pipe_screen *screen,
 			    struct pipe_fence_handle *pfence,
@@ -206,7 +199,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 	pscreen->get_timestamp = nouveau_screen_get_timestamp;
 
 	pscreen->fence_reference = nouveau_screen_fence_ref;
-	pscreen->fence_signalled = nouveau_screen_fence_signalled;
 	pscreen->fence_finish = nouveau_screen_fence_finish;
 
 	util_format_s3tc_init();
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a7bca915f57..e8accefe1a8 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -660,14 +660,6 @@ static void r300_fence_reference(struct pipe_screen *screen,
     rws->fence_reference(ptr, fence);
 }
 
-static boolean r300_fence_signalled(struct pipe_screen *screen,
-                                    struct pipe_fence_handle *fence)
-{
-    struct radeon_winsys *rws = r300_screen(screen)->rws;
-
-    return rws->fence_wait(rws, fence, 0);
-}
-
 static boolean r300_fence_finish(struct pipe_screen *screen,
                                  struct pipe_fence_handle *fence,
                                  uint64_t timeout)
@@ -712,7 +704,6 @@ struct pipe_screen* r300_screen_create(struct radeon_winsys *rws)
     r300screen->screen.is_video_format_supported = vl_video_buffer_is_format_supported;
     r300screen->screen.context_create = r300_create_context;
     r300screen->screen.fence_reference = r300_fence_reference;
-    r300screen->screen.fence_signalled = r300_fence_signalled;
     r300screen->screen.fence_finish = r300_fence_finish;
 
     r300_init_screen_resource_functions(r300screen);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 5dd28df5d5f..bcbf0b95373 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -721,14 +721,6 @@ static void r600_fence_reference(struct pipe_screen *screen,
 	rws->fence_reference(ptr, fence);
 }
 
-static boolean r600_fence_signalled(struct pipe_screen *screen,
-				    struct pipe_fence_handle *fence)
-{
-	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
-
-	return rws->fence_wait(rws, fence, 0);
-}
-
 static boolean r600_fence_finish(struct pipe_screen *screen,
 				 struct pipe_fence_handle *fence,
 				 uint64_t timeout)
@@ -874,7 +866,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	rscreen->b.get_timestamp = r600_get_timestamp;
 	rscreen->b.fence_finish = r600_fence_finish;
 	rscreen->b.fence_reference = r600_fence_reference;
-	rscreen->b.fence_signalled = r600_fence_signalled;
 	rscreen->b.resource_destroy = u_resource_destroy_vtbl;
 	rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory;
 
diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c
index d5a3164e217..7da4e81560a 100644
--- a/src/gallium/drivers/rbug/rbug_screen.c
+++ b/src/gallium/drivers/rbug/rbug_screen.c
@@ -225,17 +225,6 @@ rbug_screen_fence_reference(struct pipe_screen *_screen,
                            fence);
 }
 
-static boolean
-rbug_screen_fence_signalled(struct pipe_screen *_screen,
-                            struct pipe_fence_handle *fence)
-{
-   struct rbug_screen *rb_screen = rbug_screen(_screen);
-   struct pipe_screen *screen = rb_screen->screen;
-
-   return screen->fence_signalled(screen,
-                                  fence);
-}
-
 static boolean
 rbug_screen_fence_finish(struct pipe_screen *_screen,
                          struct pipe_fence_handle *fence,
@@ -288,7 +277,6 @@ rbug_screen_create(struct pipe_screen *screen)
    rb_screen->base.resource_destroy = rbug_screen_resource_destroy;
    rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer;
    rb_screen->base.fence_reference = rbug_screen_fence_reference;
-   rb_screen->base.fence_signalled = rbug_screen_fence_signalled;
    rb_screen->base.fence_finish = rbug_screen_fence_finish;
 
    rb_screen->screen = screen;
diff --git a/src/gallium/drivers/softpipe/sp_fence.c b/src/gallium/drivers/softpipe/sp_fence.c
index c2897ed1ef8..6168236ec96 100644
--- a/src/gallium/drivers/softpipe/sp_fence.c
+++ b/src/gallium/drivers/softpipe/sp_fence.c
@@ -40,15 +40,6 @@ softpipe_fence_reference(struct pipe_screen *screen,
 }
 
 
-static boolean
-softpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence)
-{
-   assert(fence);
-   return TRUE;
-}
-
-
 static boolean
 softpipe_fence_finish(struct pipe_screen *screen,
                       struct pipe_fence_handle *fence,
@@ -64,5 +55,4 @@ softpipe_init_screen_fence_funcs(struct pipe_screen *screen)
 {
    screen->fence_reference = softpipe_fence_reference;
    screen->fence_finish = softpipe_fence_finish;
-   screen->fence_signalled = softpipe_fence_signalled;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 8eff3d5e337..e2305bec359 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -544,15 +544,6 @@ svga_fence_reference(struct pipe_screen *screen,
 }
 
 
-static boolean
-svga_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence)
-{
-   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
-   return sws->fence_signalled(sws, fence, 0) == 0;
-}
-
-
 static boolean
 svga_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
@@ -650,7 +641,6 @@ svga_screen_create(struct svga_winsys_screen *sws)
    screen->is_format_supported = svga_is_format_supported;
    screen->context_create = svga_context_create;
    screen->fence_reference = svga_fence_reference;
-   screen->fence_signalled = svga_fence_signalled;
    screen->fence_finish = svga_fence_finish;
    screen->get_driver_query_info = svga_get_driver_query_info;
    svgascreen->sws = sws;
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 266626defa8..1d86a378eea 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -369,29 +369,6 @@ trace_screen_fence_reference(struct pipe_screen *_screen,
 }
 
 
-static boolean
-trace_screen_fence_signalled(struct pipe_screen *_screen,
-                             struct pipe_fence_handle *fence)
-{
-   struct trace_screen *tr_scr = trace_screen(_screen);
-   struct pipe_screen *screen = tr_scr->screen;
-   int result;
-
-   trace_dump_call_begin("pipe_screen", "fence_signalled");
-
-   trace_dump_arg(ptr, screen);
-   trace_dump_arg(ptr, fence);
-
-   result = screen->fence_signalled(screen, fence);
-
-   trace_dump_ret(bool, result);
-
-   trace_dump_call_end();
-
-   return result;
-}
-
-
 static boolean
 trace_screen_fence_finish(struct pipe_screen *_screen,
                           struct pipe_fence_handle *fence,
@@ -503,7 +480,6 @@ trace_screen_create(struct pipe_screen *screen)
    tr_scr->base.resource_get_handle = trace_screen_resource_get_handle;
    tr_scr->base.resource_destroy = trace_screen_resource_destroy;
    tr_scr->base.fence_reference = trace_screen_fence_reference;
-   tr_scr->base.fence_signalled = trace_screen_fence_signalled;
    tr_scr->base.fence_finish = trace_screen_fence_finish;
    tr_scr->base.flush_frontbuffer = trace_screen_flush_frontbuffer;
    tr_scr->base.get_timestamp = trace_screen_get_timestamp;
diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c
index f2ee91de61a..f644bf9a04e 100644
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -59,16 +59,6 @@ vc4_fence_reference(struct pipe_screen *pscreen,
         *p = f;
 }
 
-static boolean
-vc4_fence_signalled(struct pipe_screen *pscreen,
-                    struct pipe_fence_handle *pf)
-{
-        struct vc4_screen *screen = vc4_screen(pscreen);
-        struct vc4_fence *f = (struct vc4_fence *)pf;
-
-        return vc4_wait_seqno(screen, f->seqno, 0);
-}
-
 static boolean
 vc4_fence_finish(struct pipe_screen *pscreen,
                  struct pipe_fence_handle *pf,
@@ -98,6 +88,5 @@ void
 vc4_fence_init(struct vc4_screen *screen)
 {
         screen->base.fence_reference = vc4_fence_reference;
-        screen->base.fence_signalled = vc4_fence_signalled;
         screen->base.fence_finish = vc4_fence_finish;
 }
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 98b2159defe..0d2658313e5 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -211,12 +211,6 @@ struct pipe_screen {
                             struct pipe_fence_handle **ptr,
                             struct pipe_fence_handle *fence );
 
-   /**
-    * Checks whether the fence has been signalled.
-    */
-   boolean (*fence_signalled)( struct pipe_screen *screen,
-                               struct pipe_fence_handle *fence );
-
    /**
     * Wait for the fence to finish.
     * \param timeout  in nanoseconds (may be PIPE_TIMEOUT_INFINITE).

From 872ede6fd136c7f9701cc60268ab195a48e75e67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 13:57:06 +0200
Subject: [PATCH 0200/1208] st/mesa: if a fence isn't returned, assume it's
 signalled

The reason might be that no commands have been submitted before the flush
and the GPU is idle.
---
 src/mesa/state_tracker/st_cb_syncobj.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_syncobj.c b/src/mesa/state_tracker/st_cb_syncobj.c
index d23c93d98d8..ec2687fba53 100644
--- a/src/mesa/state_tracker/st_cb_syncobj.c
+++ b/src/mesa/state_tracker/st_cb_syncobj.c
@@ -81,7 +81,13 @@ static void st_check_sync(struct gl_context *ctx, struct gl_sync_object *obj)
    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
    struct st_sync_object *so = (struct st_sync_object*)obj;
 
-   if (so->fence && screen->fence_finish(screen, so->fence, 0)) {
+   /* If the fence doesn't exist, assume it's signalled. */
+   if (!so->fence) {
+      so->b.StatusFlag = GL_TRUE;
+      return;
+   }
+
+   if (screen->fence_finish(screen, so->fence, 0)) {
       screen->fence_reference(screen, &so->fence, NULL);
       so->b.StatusFlag = GL_TRUE;
    }
@@ -94,6 +100,12 @@ static void st_client_wait_sync(struct gl_context *ctx,
    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
    struct st_sync_object *so = (struct st_sync_object*)obj;
 
+   /* If the fence doesn't exist, assume it's signalled. */
+   if (!so->fence) {
+      so->b.StatusFlag = GL_TRUE;
+      return;
+   }
+
    /* We don't care about GL_SYNC_FLUSH_COMMANDS_BIT, because flush is
     * already called when creating a fence. */
 

From 245b464d5caa21680373ae5929dccd294078cc50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 19:01:23 +0200
Subject: [PATCH 0201/1208] gallium/radeon: mark the gpu load thread stop
 trigger as volatile

---
 src/gallium/drivers/radeon/r600_pipe_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 2b27e58e0e2..a471426ea27 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -290,7 +290,7 @@ struct r600_common_screen {
 	pipe_thread			gpu_load_thread;
 	unsigned			gpu_load_counter_busy;
 	unsigned			gpu_load_counter_idle;
-	unsigned			gpu_load_stop_thread; /* bool */
+	volatile unsigned		gpu_load_stop_thread; /* bool */
 };
 
 /* This encapsulates a state or an operation which can emitted into the GPU

From 3836857a777a248dd212ce7a1d7307d2984fda7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 25 Jun 2015 20:39:34 +0200
Subject: [PATCH 0202/1208] gallium/os: add os_wait_until_zero (v2)

This will be used by radeon and amdgpu winsyses.
Copied from the amdgpu winsys.

v2: use volatile and p_atomic_read

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/auxiliary/os/os_time.c | 38 +++++++++++++++++++++++++++++-
 src/gallium/auxiliary/os/os_time.h | 11 +++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
index f7e4ca49c7c..bd09452df16 100644
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -33,11 +33,13 @@
  */
 
 
-#include "pipe/p_config.h"
+#include "pipe/p_defines.h"
+#include "util/u_atomic.h"
 
 #if defined(PIPE_OS_UNIX)
 #  include <time.h> /* timeval */
 #  include <sys/time.h> /* timeval */
+#  include <sched.h> /* sched_yield */
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <windows.h>
 #else
@@ -92,3 +94,37 @@ os_time_sleep(int64_t usecs)
 }
 
 #endif
+
+
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout)
+{
+   if (!p_atomic_read(var))
+      return true;
+
+   if (!timeout)
+      return false;
+
+   if (timeout == PIPE_TIMEOUT_INFINITE) {
+      while (p_atomic_read(var)) {
+#if defined(PIPE_OS_UNIX)
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+   else {
+      int64_t start_time = os_time_get_nano();
+      int64_t end_time = start_time + timeout;
+
+      while (p_atomic_read(var)) {
+         if (os_time_timeout(start_time, end_time, os_time_get_nano()))
+            return false;
+
+#if defined(PIPE_OS_UNIX)
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+}
diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
index 4fab03cc671..2989af10fe2 100644
--- a/src/gallium/auxiliary/os/os_time.h
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -94,6 +94,17 @@ os_time_timeout(int64_t start,
 }
 
 
+/**
+ * Wait until the variable at the given memory location is zero.
+ *
+ * \param var           variable
+ * \param timeout       timeout in ns, can be anything from 0 (no wait) to
+ *                      PIPE_TIME_INFINITE (wait forever)
+ * \return     true if the variable is zero
+ */
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout);
+
 #ifdef	__cplusplus
 }
 #endif

From 7316cc92f3810c9e53a22c35343190d8fb7980be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 00:05:26 +0200
Subject: [PATCH 0203/1208] gallium/os: add conversion and wait functions for
 absolute timeouts

Absolute timeouts are used with the amdgpu kernel driver.
It also makes waiting for several variables and fences at the same time
easier (the timeout doesn't have to be recalculated after every wait call).

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/auxiliary/os/os_time.c | 41 ++++++++++++++++++++++++++++++
 src/gallium/auxiliary/os/os_time.h | 26 +++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
index bd09452df16..3d2e4167222 100644
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -96,6 +96,26 @@ os_time_sleep(int64_t usecs)
 #endif
 
 
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout)
+{
+   int64_t time, abs_timeout;
+
+   /* Also check for the type upper bound. */
+   if (timeout == PIPE_TIMEOUT_INFINITE || timeout > INT64_MAX)
+      return PIPE_TIMEOUT_INFINITE;
+
+   time = os_time_get_nano();
+   abs_timeout = time + (int64_t)timeout;
+
+   /* Check for overflow. */
+   if (abs_timeout < time)
+      return PIPE_TIMEOUT_INFINITE;
+
+   return abs_timeout;
+}
+
+
 bool
 os_wait_until_zero(volatile int *var, uint64_t timeout)
 {
@@ -128,3 +148,24 @@ os_wait_until_zero(volatile int *var, uint64_t timeout)
       return true;
    }
 }
+
+
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout)
+{
+   if (!p_atomic_read(var))
+      return true;
+
+   if (timeout == PIPE_TIMEOUT_INFINITE)
+      return os_wait_until_zero(var, PIPE_TIMEOUT_INFINITE);
+
+   while (p_atomic_read(var)) {
+      if (os_time_get_nano() >= timeout)
+         return false;
+
+#if defined(PIPE_OS_UNIX)
+      sched_yield();
+#endif
+   }
+   return true;
+}
diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
index 2989af10fe2..21979a72ac5 100644
--- a/src/gallium/auxiliary/os/os_time.h
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -94,6 +94,17 @@ os_time_timeout(int64_t start,
 }
 
 
+/**
+ * Convert a relative timeout in nanoseconds into an absolute timeout,
+ * in other words, it returns current time + timeout.
+ * os_time_get_nano() must be monotonic.
+ * PIPE_TIMEOUT_INFINITE is passed through unchanged. If the calculation
+ * overflows, PIPE_TIMEOUT_INFINITE is returned.
+ */
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout);
+
+
 /**
  * Wait until the variable at the given memory location is zero.
  *
@@ -105,6 +116,21 @@ os_time_timeout(int64_t start,
 bool
 os_wait_until_zero(volatile int *var, uint64_t timeout);
 
+
+/**
+ * Wait until the variable at the given memory location is zero.
+ * The timeout is the absolute time when the waiting should stop. If it is
+ * less than or equal to the current time, it only returns the status and
+ * doesn't wait. PIPE_TIME_INFINITE waits forever. This requires that
+ * os_time_get_nano is monotonic.
+ *
+ * \param var       variable
+ * \param timeout   the time in ns when the waiting should stop
+ * \return     true if the variable is zero
+ */
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout);
+
 #ifdef	__cplusplus
 }
 #endif

From f1be3d8cdde17a9b9ae283e1bab2f46b992d3bf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 14:03:46 +0200
Subject: [PATCH 0204/1208] radeonsi: don't flush an empty IB if the only thing
 we need is a fence

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 16 +++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c       |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h       |  1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 313ced7f5d1..08cc08e64fe 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -82,9 +82,15 @@ void si_context_gfx_flush(void *context, unsigned flags,
 {
 	struct si_context *ctx = context;
 	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys *ws = ctx->b.ws;
 
-	if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence)
+	if (cs->cdw == ctx->b.initial_gfx_cs_size) {
+		if (fence)
+			ws->fence_reference(fence, ctx->last_gfx_fence);
+		if (!(flags & RADEON_FLUSH_ASYNC))
+			ws->cs_sync_flush(cs);
 		return;
+	}
 
 	ctx->b.rings.gfx.flushing = true;
 
@@ -101,9 +107,13 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
 
 	/* Flush the CS. */
-	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
+		     ctx->screen->b.cs_count++);
 	ctx->b.rings.gfx.flushing = false;
 
+	if (fence)
+		ws->fence_reference(fence, ctx->last_gfx_fence);
+
 #if SI_TRACE_CS
 	if (ctx->screen->b.trace_bo) {
 		struct si_screen *sscreen = ctx->screen;
@@ -111,7 +121,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 
 		for (i = 0; i < 10; i++) {
 			usleep(5);
-			if (!ctx->b.ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) {
+			if (!ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) {
 				break;
 			}
 		}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 77b8d7d3a6a..13b67d210fd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -44,6 +44,7 @@ static void si_destroy_context(struct pipe_context *context)
 	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
 	r600_resource_reference(&sctx->border_color_table, NULL);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
+	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 2d67342f160..67cb0357231 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -137,6 +137,7 @@ struct si_context {
 	void				*pstipple_sampler_state;
 	struct si_screen		*screen;
 	struct si_pm4_state		*init_config;
+	struct pipe_fence_handle	*last_gfx_fence;
 
 	union {
 		struct {

From fc2726e4afa6dfb691affed576a38d2b0573465b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Jun 2015 19:01:04 +0200
Subject: [PATCH 0205/1208] winsys/radeon: use os_wait_until_zero in
 radeon_bo_set_tiling

---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1f0caf60197..314d0ef8df8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -778,9 +778,7 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf,
         cs->flush_cs(cs->flush_data, 0, NULL);
     }
 
-    while (p_atomic_read(&bo->num_active_ioctls)) {
-        sched_yield();
-    }
+    os_wait_until_zero(&bo->num_active_ioctls, PIPE_TIMEOUT_INFINITE);
 
     if (microtiled == RADEON_LAYOUT_TILED)
         args.tiling_flags |= RADEON_TILING_MICRO;

From 493af150fb3b1c007d791b24dcd5ea8a92ad763c Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Fri, 3 Jul 2015 13:15:21 +0100
Subject: [PATCH 0206/1208] i965/skl: Set the pulls bary bit in
 3DSTATE_PS_EXTRA

On Gen9+ there is a new bit in 3DSTATE_PS_EXTRA that must be set if
the shader sends a message to the pixel interpolator. This fixes the
interpolateAt* tests on SKL, apart from interpolateatsample-nonconst
but that is not implemented anywhere so it's not a regression.

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "10.6 10.5" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/drivers/dri/i965/brw_context.h   | 1 +
 src/mesa/drivers/dri/i965/brw_defines.h   | 1 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp  | 4 ++++
 src/mesa/drivers/dri/i965/gen8_ps_state.c | 3 +++
 4 files changed, 9 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 3553f6ec48c..759613951d8 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -415,6 +415,7 @@ struct brw_wm_prog_data {
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
+   bool pulls_bary;
    uint32_t prog_offset_16;
 
    /**
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 66b9abc9991..19489aba5be 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2145,6 +2145,7 @@ enum brw_pixel_shader_computed_depth_mode {
 # define GEN8_PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE     (1 << 7)
 # define GEN8_PSX_SHADER_IS_PER_SAMPLE                  (1 << 6)
 # define GEN8_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
+# define GEN9_PSX_SHADER_PULLS_BARY                     (1 << 3)
 # define GEN8_PSX_SHADER_HAS_UAV                        (1 << 2)
 # define GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK       (1 << 1)
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index bd71404ef8d..3ebc3a2c5e4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1481,6 +1481,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_interp_var_at_centroid:
    case nir_intrinsic_interp_var_at_sample:
    case nir_intrinsic_interp_var_at_offset: {
+      assert(stage == MESA_SHADER_FRAGMENT);
+
+      ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
+
       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
       /* For most messages, we need one reg of ignored data; the hardware
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index a88f109c691..d5445093e67 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -58,6 +58,9 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_omask)
       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
 
+   if (brw->gen >= 9 && prog_data->pulls_bary)
+      dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
+
    if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 

From d9ab95b365f058a46bc43a8cb96b6fff10a13faf Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 4 Mar 2015 15:46:57 -0800
Subject: [PATCH 0207/1208] i965: Reserve more batch space to accomodate Gen6
 perfmonitors.

Ben noticed that I said each PIPE_CONTROL was 4 DWords, but it's
actually 5 DWords on Gen6-7.  We've been reserving insufficient space
for performance monitoring on Sandybridge, which means it would likely
break if you used that functionality.  (Thankfully, no one does...)

Also, the existing number of 146 was the result of me flubbing up the
arithmetic: it should have actually been 140.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index ef8a6ffcca8..fdd07e0a117 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -22,12 +22,12 @@ extern "C" {
  *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
  *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
  *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
- *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
+ *       which are 5 DWords each ==> 2 * 3 * 5 * 4 = 120 bytes
  *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
  *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
  *       Sandybridge PIPE_CONTROL madness.
  */
-#define BATCH_RESERVED 146
+#define BATCH_RESERVED 152
 
 struct intel_batchbuffer;
 

From 18039078e0254c7cb5e15b7186be05e2e4c10f38 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Sat, 4 Jul 2015 22:40:58 +0100
Subject: [PATCH 0208/1208] glsl: Add missing check for whether an expression
 is an add operation

There is a piece of code that is trying to match expressions of the
form (mul (floor (add (abs x) 0.5) (sign x))). However the check for
the add expression wasn't checking whether it had the expected
operation. It looks like this was just an oversight because it doesn't
match the pattern for the rest of the code snippet. The existing line
to check whether add_expr!=NULL was added as part of a coverity fix in
3384179f.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91226
Cc: Matt Turner <mattst88@gmail.com>
Cc: "10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/opt_algebraic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/opt_algebraic.cpp b/src/glsl/opt_algebraic.cpp
index fa5db70f2dd..9b8a426d79c 100644
--- a/src/glsl/opt_algebraic.cpp
+++ b/src/glsl/opt_algebraic.cpp
@@ -580,7 +580,7 @@ ir_algebraic_visitor::handle_expression(ir_expression *ir)
             continue;
 
          ir_expression *add_expr = floor_expr->operands[0]->as_expression();
-         if (!add_expr)
+         if (!add_expr || add_expr->operation != ir_binop_add)
             continue;
 
          for (int j = 0; j < 2; j++) {

From 86a3557d7c95ac945eedf42ab095639b255c1bed Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Sat, 4 Jul 2015 22:40:59 +0100
Subject: [PATCH 0209/1208] glsl: Make sure not to dereference NULL

In this bit of code point_five can be NULL if the expression is not a
constant. This fixes it to match the pattern of the rest of the chunk
of code so that it checks for NULLs.

Cc: Matt Turner <mattst88@gmail.com>
Cc: "10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/opt_algebraic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/opt_algebraic.cpp b/src/glsl/opt_algebraic.cpp
index 9b8a426d79c..c4b87151199 100644
--- a/src/glsl/opt_algebraic.cpp
+++ b/src/glsl/opt_algebraic.cpp
@@ -589,7 +589,7 @@ ir_algebraic_visitor::handle_expression(ir_expression *ir)
                continue;
 
             ir_constant *point_five = add_expr->operands[1 - j]->as_constant();
-            if (!point_five->is_value(0.5, 0))
+            if (!point_five || !point_five->is_value(0.5, 0))
                continue;
 
             if (abs_expr->operands[0]->equals(sign_expr->operands[0])) {

From 128de6f6d7cd0eb5386dcc622afc6e28a8512e7f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 6 Jul 2015 11:04:19 -0700
Subject: [PATCH 0210/1208] mesa: Add a MUST_CHECK macro for
 __attribute__((warn_unused_result)).

In the kernel, this is called __must_check; all our attribute macros in
Mesa appear to be uppercase, so I went with that.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 configure.ac      | 1 +
 src/util/macros.h | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/configure.ac b/configure.ac
index ea0f069cd31..d240c060573 100644
--- a/configure.ac
+++ b/configure.ac
@@ -210,6 +210,7 @@ AX_GCC_FUNC_ATTRIBUTE([format])
 AX_GCC_FUNC_ATTRIBUTE([malloc])
 AX_GCC_FUNC_ATTRIBUTE([packed])
 AX_GCC_FUNC_ATTRIBUTE([unused])
+AX_GCC_FUNC_ATTRIBUTE([warn_unused_result])
 
 AM_CONDITIONAL([GEN_ASM_OFFSETS], test "x$GEN_ASM_OFFSETS" = xyes)
 
diff --git a/src/util/macros.h b/src/util/macros.h
index 3b708ed6aa2..66698e72701 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -182,6 +182,12 @@ do {                       \
 #define UNUSED
 #endif
 
+#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
+#define MUST_CHECK __attribute__((warn_unused_result))
+#else
+#define MUST_CHECK
+#endif
+
 /** Compute ceiling of integer quotient of A divided by B. */
 #define DIV_ROUND_UP( A, B )  ( (A) % (B) == 0 ? (A)/(B) : (A)/(B)+1 )
 

From 7b06af9d3ca7310197d39d55fc52c265da4bc59e Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 4 Jul 2015 03:03:33 +0200
Subject: [PATCH 0211/1208] gallivm: fix lp_build_compare_ext

The expansion should always be to the same width as the input arguments
no matter what, since these functions should work with any bit width of
the arguments (the sext is a no-op on any sane simd architecture).
Thus, fix the caller expecting differently.

This fixes https://bugs.freedesktop.org/show_bug.cgi?id=91222

Tested-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_logic.c       | 2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index f724cfa46ea..80b53e5c3f8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
                      boolean ordered)
 {
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, lp_type_int_vec(32, 32 * type.length));
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
    LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
    LLVMValueRef cond;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 1f2af85c92b..0ad78b0ace2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1961,8 +1961,11 @@ dset_emit_cpu(
    struct lp_build_emit_data * emit_data,
    unsigned pipe_func)
 {
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
    LLVMValueRef cond = lp_build_cmp(&bld_base->dbl_bld, pipe_func,
                                     emit_data->args[0], emit_data->args[1]);
+   /* arguments were 64 bit but store as 32 bit */
+   cond = LLVMBuildTrunc(builder, cond, bld_base->int_bld.int_vec_type, "");
    emit_data->output[emit_data->chan] = cond;
 }
 

From b0334a9aeb9369fd20854ab2ef4b2ee0087492ab Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 25 Jun 2015 16:57:20 -0700
Subject: [PATCH 0212/1208] mesa: Convert some asserts into STATIC_ASSERT.

Reviewed-by: Chad Versace <chad.versace@intel.com>
---
 src/mesa/main/context.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index fdef41287f7..faa1de739d9 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -377,13 +377,12 @@ one_time_init( struct gl_context *ctx )
    if (!api_init_mask) {
       GLuint i;
 
-      /* do some implementation tests */
-      assert( sizeof(GLbyte) == 1 );
-      assert( sizeof(GLubyte) == 1 );
-      assert( sizeof(GLshort) == 2 );
-      assert( sizeof(GLushort) == 2 );
-      assert( sizeof(GLint) == 4 );
-      assert( sizeof(GLuint) == 4 );
+      STATIC_ASSERT(sizeof(GLbyte) == 1);
+      STATIC_ASSERT(sizeof(GLubyte) == 1);
+      STATIC_ASSERT(sizeof(GLshort) == 2);
+      STATIC_ASSERT(sizeof(GLushort) == 2);
+      STATIC_ASSERT(sizeof(GLint) == 4);
+      STATIC_ASSERT(sizeof(GLuint) == 4);
 
       _mesa_locale_init();
 

From 248b26429f52d0f19949a083aa3e0aeebcbe2138 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Mon, 6 Jul 2015 17:23:07 +0900
Subject: [PATCH 0213/1208] radeonsi: Use param export count from
 si_llvm_export_vs in si_shader_vs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This eliminates the error prone logic in si_shader_vs recalculating this
value.

It also fixes TGSI_SEMANTIC_CLIPDIST outputs incorrectly not being
counted for VS exports. They need to be counted because they are passed
to the pixel shader as parameters as well.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91193
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      |  2 ++
 src/gallium/drivers/radeonsi/si_shader.h      |  1 +
 .../drivers/radeonsi/si_state_shaders.c       | 25 +++----------------
 3 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4d97b58aec8..753b238e2c0 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1218,6 +1218,8 @@ handle_semantic:
 		}
 	}
 
+	shader->nr_param_exports = param_count;
+
 	/* We need to add the position output manually if it's missing. */
 	if (!pos_args[0][0]) {
 		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b4339ae2b36..8d309b4eb37 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -165,6 +165,7 @@ struct si_shader {
 
 	bool			uses_instanceid;
 	unsigned		nr_pos_exports;
+	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index eef3baad164..a842d9db798 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -148,10 +148,9 @@ static void si_shader_gs(struct si_shader *shader)
 
 static void si_shader_vs(struct si_shader *shader)
 {
-	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
-	unsigned nparams, i, vgpr_comp_cnt;
+	unsigned nparams, vgpr_comp_cnt;
 	uint64_t va;
 	unsigned window_space =
 	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
@@ -180,26 +179,8 @@ static void si_shader_vs(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
 
-	/* Certain attributes (position, psize, etc.) don't count as params.
-	 * VS is required to export at least one param and r600_shader_from_tgsi()
-	 * takes care of adding a dummy export.
-	 */
-	for (nparams = 0, i = 0 ; i < info->num_outputs; i++) {
-		switch (info->output_semantic_name[i]) {
-		case TGSI_SEMANTIC_CLIPVERTEX:
-		case TGSI_SEMANTIC_CLIPDIST:
-		case TGSI_SEMANTIC_CULLDIST:
-		case TGSI_SEMANTIC_POSITION:
-		case TGSI_SEMANTIC_PSIZE:
-		case TGSI_SEMANTIC_EDGEFLAG:
-			break;
-		default:
-			nparams++;
-		}
-	}
-	if (nparams < 1)
-		nparams = 1;
-
+	/* VS is required to export at least one param. */
+	nparams = MAX2(shader->nr_param_exports, 1);
 	si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
 		       S_0286C4_VS_EXPORT_COUNT(nparams - 1));
 

From 40e2102e528498dd4c03c4567d3522241f4d1f22 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 6 Jul 2015 18:23:57 +0300
Subject: [PATCH 0214/1208] i965/gen4-5: Set ENDIF dst and src0 fields to the
 null register.

The hardware docs don't mention explicitly what these fields should
be, but I've verified experimentally on ILK that using a GRF as
destination causes the register to be corrupted when the execution
size of an ENDIF instruction is higher than 8 -- and because the
destination we were using was g0, eventually a hang.

Fixes some 150 piglit tests on Gen4-5 when forced to run shaders with
if conditionals 16-wide, e.g. shaders/glsl-fs-sampler-numbering-3.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0f536046f6f..4d397622fc1 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1584,8 +1584,8 @@ brw_ENDIF(struct brw_codegen *p)
    }
 
    if (devinfo->gen < 6) {
-      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0));
    } else if (devinfo->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));

From 24842e18aabdaeff41668b0e71e52d32975d2ccd Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 6 Jul 2015 19:11:54 +0300
Subject: [PATCH 0215/1208] i965/gen4-5: Program the execution size correctly
 for DO/WHILE instructions.

From the hardware docs for the DO instruction:

 "Execution size is ignored for this instruction."

My observation on ILK hardware contradicts the spec though, channels
over the execution size of a DO instruction won't enter the loop, and
channels over the execution size of a WHILE instruction will exit the
loop after the first iteration -- The latter is consistent with the
spec though, there's no claim about the execution size being ignored
for the WHILE instruction so it's not completely unexpected that it
has an influence on the evaluation of EMask.

The execute_size argument of brw_DO() shouldn't have any effect on
Gen6 and newer hardware.  On Gen4-5 WHILE instructions inherit the
execution size from the matching DO, so this patch should fix them
too.  The execution size of BREAK and CONT instructions was already
being set correctly.

Fixes some 50 piglit tests on Gen4-5 when forced to run shaders with
conditional and loop instructions 16-wide,
e.g. shaders/glsl-fs-continue-inside-do-while.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 0a70bdc3c76..c986d91d988 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1869,7 +1869,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 	 break;
 
       case BRW_OPCODE_DO:
-	 brw_DO(p, BRW_EXECUTE_8);
+	 brw_DO(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
 	 break;
 
       case BRW_OPCODE_BREAK:

From 7009e2683ebb917393d87639f549588f22c03a32 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 6 Jul 2015 18:55:26 +0300
Subject: [PATCH 0216/1208] i965/gen4-5: Enable 16-wide dispatch on shaders
 with control flow.

This was probably disabled due to a combination of several bugs in the
generator code (fixed earlier in this series) and a misunderstanding
of the hardware spec.  The documentation for most control flow
instructions mentions among other restrictions:

 "Instruction compression is not allowed."

This however doesn't have any implications on 16 wide not being
supported, because none of the control flow instructions have
multi-register operands (control flow instructions are not compressed
on more recent hardware either, except maybe SNB's IF with inline
compare).  In fact Gen4-5 had 16-wide control flow masks and stacks,
and the spec mentions in several places that control flow instructions
push and pop 16 channels worth of data -- Otherwise there doesn't seem
to be any indication that it shouldn't work.

Causes no piglit regressions, and gives the following shader-db
results on ILK:

 total instructions in shared programs: 4711384 -> 4711384 (0.00%)
 instructions in affected programs:     0 -> 0
 helped:                                0
 HURT:                                  0
 GAINED:                                1215
 LOST:                                  0

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3ebc3a2c5e4..e01dfa88d52 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -417,18 +417,12 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
 
    bld.emit(BRW_OPCODE_ENDIF);
 
-   if (!try_replace_with_sel() && devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
+   try_replace_with_sel();
 }
 
 void
 fs_visitor::nir_emit_loop(nir_loop *loop)
 {
-   if (devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-
    bld.emit(BRW_OPCODE_DO);
 
    nir_emit_cf_list(&loop->body);

From f025aec906fce0f2918b6f4acb15548dc957ba67 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 6 Jul 2015 15:28:59 -0600
Subject: [PATCH 0217/1208] gallium/os: minor whitespace fixes in os_time.h

Trivial.
---
 src/gallium/auxiliary/os/os_time.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
index 21979a72ac5..e786ee1885b 100644
--- a/src/gallium/auxiliary/os/os_time.h
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -45,7 +45,7 @@
 #include "pipe/p_compiler.h"
 
 
-#ifdef	__cplusplus
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -61,8 +61,9 @@ os_time_get_nano(void);
  * Get the current time in microseconds from an unknown base.
  */
 static INLINE int64_t
-os_time_get(void) {
-    return os_time_get_nano() / 1000;
+os_time_get(void)
+{
+   return os_time_get_nano() / 1000;
 }
 
 
@@ -87,7 +88,7 @@ os_time_timeout(int64_t start,
                 int64_t end,
                 int64_t curr)
 {
-   if(start <= end)
+   if (start <= end)
       return !(start <= curr && curr < end);
    else
       return !((start <= curr) || (curr < end));
@@ -131,7 +132,7 @@ os_wait_until_zero(volatile int *var, uint64_t timeout);
 bool
 os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout);
 
-#ifdef	__cplusplus
+#ifdef __cplusplus
 }
 #endif
 

From 86ebd31c672f389f354e11b7aef4513dc8b76f13 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 7 Jul 2015 09:13:02 -0600
Subject: [PATCH 0218/1208] gallium/hud: replace byte units flag with
 pipe_driver_query_type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of using a boolean 'is bytes' value, use the pipe_driver_query_type
enum type.  This will let is add support for time values in the next patch.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c      | 20 ++++++++++++--------
 src/gallium/auxiliary/hud/hud_driver_query.c |  9 +++------
 src/gallium/auxiliary/hud/hud_private.h      |  5 +++--
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 6a124f7d716..9f42da9c219 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -231,14 +231,16 @@ hud_draw_string(struct hud_context *hud, unsigned x, unsigned y,
 }
 
 static void
-number_to_human_readable(uint64_t num, boolean is_in_bytes, char *out)
+number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
+                         char *out)
 {
    static const char *byte_units[] =
       {"", " KB", " MB", " GB", " TB", " PB", " EB"};
    static const char *metric_units[] =
       {"", " k", " M", " G", " T", " P", " E"};
-   const char **units = is_in_bytes ? byte_units : metric_units;
-   double divisor = is_in_bytes ? 1024 : 1000;
+   const char **units =
+      (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? byte_units : metric_units;
+   double divisor = (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? 1024 : 1000;
    int unit = 0;
    double d = num;
 
@@ -301,7 +303,7 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
                    hud->font.glyph_height / 2;
 
       number_to_human_readable(pane->max_value * i / 5,
-                               pane->uses_byte_units, str);
+                               pane->type, str);
       hud_draw_string(hud, x, y, str);
    }
 
@@ -312,7 +314,7 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
       unsigned y = pane->y2 + 2 + i*hud->font.glyph_height;
 
       number_to_human_readable(gr->current_value,
-                               pane->uses_byte_units, str);
+                               pane->type, str);
       hud_draw_string(hud, x, y, "  %s: %s", gr->name, str);
       i++;
    }
@@ -869,12 +871,14 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
       else if (strcmp(name, "samples-passed") == 0 &&
                has_occlusion_query(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "samples-passed",
-                                PIPE_QUERY_OCCLUSION_COUNTER, 0, 0, FALSE);
+                                PIPE_QUERY_OCCLUSION_COUNTER, 0, 0,
+                                PIPE_DRIVER_QUERY_TYPE_UINT64);
       }
       else if (strcmp(name, "primitives-generated") == 0 &&
                has_streamout(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "primitives-generated",
-                                PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0, FALSE);
+                                PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0,
+                                PIPE_DRIVER_QUERY_TYPE_UINT64);
       }
       else {
          boolean processed = FALSE;
@@ -901,7 +905,7 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
             if (i < Elements(pipeline_statistics_names)) {
                hud_pipe_query_install(pane, hud->pipe, name,
                                       PIPE_QUERY_PIPELINE_STATISTICS, i,
-                                      0, FALSE);
+                                      0, PIPE_DRIVER_QUERY_TYPE_UINT64);
                processed = TRUE;
             }
          }
diff --git a/src/gallium/auxiliary/hud/hud_driver_query.c b/src/gallium/auxiliary/hud/hud_driver_query.c
index ee71678e894..c47d232842f 100644
--- a/src/gallium/auxiliary/hud/hud_driver_query.c
+++ b/src/gallium/auxiliary/hud/hud_driver_query.c
@@ -150,7 +150,7 @@ void
 hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                        const char *name, unsigned query_type,
                        unsigned result_index,
-                       uint64_t max_value, boolean uses_byte_units)
+                       uint64_t max_value, enum pipe_driver_query_type type)
 {
    struct hud_graph *gr;
    struct query_info *info;
@@ -178,8 +178,7 @@ hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    hud_pane_add_graph(pane, gr);
    if (pane->max_value < max_value)
       hud_pane_set_max_value(pane, max_value);
-   if (uses_byte_units)
-      pane->uses_byte_units = TRUE;
+   pane->type = type;
 }
 
 boolean
@@ -189,7 +188,6 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    struct pipe_screen *screen = pipe->screen;
    struct pipe_driver_query_info query;
    unsigned num_queries, i;
-   boolean uses_byte_units;
    boolean found = FALSE;
 
    if (!screen->get_driver_query_info)
@@ -208,9 +206,8 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    if (!found)
       return FALSE;
 
-   uses_byte_units = query.type == PIPE_DRIVER_QUERY_TYPE_BYTES;
    hud_pipe_query_install(pane, pipe, query.name, query.query_type, 0,
-                          query.max_value.u64, uses_byte_units);
+                          query.max_value.u64, query.type);
 
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index 632926b87f5..033be534a75 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -66,7 +66,7 @@ struct hud_pane {
    uint64_t ceiling;
    unsigned dyn_ceil_last_ran;
    boolean dyn_ceiling;
-   boolean uses_byte_units;
+   enum pipe_driver_query_type type;
    uint64_t period; /* in microseconds */
 
    struct list_head graph_list;
@@ -89,7 +89,8 @@ void hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index);
 void hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                             const char *name, unsigned query_type,
                             unsigned result_index,
-                            uint64_t max_value, boolean uses_byte_units);
+                            uint64_t max_value,
+                            enum pipe_driver_query_type type);
 boolean hud_driver_query_install(struct hud_pane *pane,
                                  struct pipe_context *pipe, const char *name);
 

From a804f5824352e4f714779bd9445c09b66d54bc4a Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 7 Jul 2015 09:15:59 -0600
Subject: [PATCH 0219/1208] gallium/hud: add
 PIPE_DRIVER_QUERY_TYPE_MICROSECONDS for HUD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows drivers to report queries in units of microseconds and
have the HUD display "us" (microseconds), "ms" (milliseconds) or "s"
(seconds) on the graph.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 25 ++++++++++++++++++++-----
 src/gallium/include/pipe/p_defines.h    | 11 ++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 9f42da9c219..cb552208d18 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -238,8 +238,9 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       {"", " KB", " MB", " GB", " TB", " PB", " EB"};
    static const char *metric_units[] =
       {"", " k", " M", " G", " T", " P", " E"};
-   const char **units =
-      (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? byte_units : metric_units;
+   static const char *time_units[] =
+      {" us", " ms", " s"};  /* based on microseconds */
+   const char *suffix;
    double divisor = (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? 1024 : 1000;
    int unit = 0;
    double d = num;
@@ -249,12 +250,26 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       unit++;
    }
 
+   switch (type) {
+   case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
+      assert(unit < ARRAY_SIZE(time_units));
+      suffix = time_units[unit];
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_BYTES:
+      assert(unit < ARRAY_SIZE(byte_units));
+      suffix = byte_units[unit];
+      break;
+   default:
+      assert(unit < ARRAY_SIZE(metric_units));
+      suffix = metric_units[unit];
+   }
+
    if (d >= 100 || d == (int)d)
-      sprintf(out, "%.0f%s", d, units[unit]);
+      sprintf(out, "%.0f%s", d, suffix);
    else if (d >= 10 || d*10 == (int)(d*10))
-      sprintf(out, "%.1f%s", d, units[unit]);
+      sprintf(out, "%.1f%s", d, suffix);
    else
-      sprintf(out, "%.2f%s", d, units[unit]);
+      sprintf(out, "%.2f%s", d, suffix);
 }
 
 static void
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 153897af754..b0cd23dbeff 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -788,11 +788,12 @@ union pipe_color_union
 
 enum pipe_driver_query_type
 {
-   PIPE_DRIVER_QUERY_TYPE_UINT64     = 0,
-   PIPE_DRIVER_QUERY_TYPE_UINT       = 1,
-   PIPE_DRIVER_QUERY_TYPE_FLOAT      = 2,
-   PIPE_DRIVER_QUERY_TYPE_PERCENTAGE = 3,
-   PIPE_DRIVER_QUERY_TYPE_BYTES      = 4,
+   PIPE_DRIVER_QUERY_TYPE_UINT64       = 0,
+   PIPE_DRIVER_QUERY_TYPE_UINT         = 1,
+   PIPE_DRIVER_QUERY_TYPE_FLOAT        = 2,
+   PIPE_DRIVER_QUERY_TYPE_PERCENTAGE   = 3,
+   PIPE_DRIVER_QUERY_TYPE_BYTES        = 4,
+   PIPE_DRIVER_QUERY_TYPE_MICROSECONDS = 5,
 };
 
 enum pipe_driver_query_group_type

From 10cff5e1ae55406799f4b0ad6b327d4c45dbca11 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 7 Jul 2015 13:17:01 -0600
Subject: [PATCH 0220/1208] gallium/hud: display percentages with % suffix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index cb552208d18..bd571907f2f 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -255,6 +255,9 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       assert(unit < ARRAY_SIZE(time_units));
       suffix = time_units[unit];
       break;
+   case PIPE_DRIVER_QUERY_TYPE_PERCENTAGE:
+      suffix = "%";
+      break;
    case PIPE_DRIVER_QUERY_TYPE_BYTES:
       assert(unit < ARRAY_SIZE(byte_units));
       suffix = byte_units[unit];

From 6611f65047575054a38ce83ebfe0331e39e1774f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 7 Jul 2015 18:28:31 +0200
Subject: [PATCH 0221/1208] st/dri: don't set PIPE_BIND_SCANOUT for MSAA
 surfaces

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91231

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/state_trackers/dri/dri2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 8d93f786433..1eda036750e 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -554,7 +554,7 @@ dri2_allocate_textures(struct dri_context *ctx,
 
          if (drawable->textures[statt]) {
             templ.format = drawable->textures[statt]->format;
-            templ.bind = drawable->textures[statt]->bind;
+            templ.bind = drawable->textures[statt]->bind & ~PIPE_BIND_SCANOUT;
             templ.nr_samples = drawable->stvis.samples;
 
             /* Try to reuse the resource.

From 73d0e7f3451eaeb62ac039d2dcee1e1c6787e3db Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 1 Jul 2015 20:13:00 -0700
Subject: [PATCH 0222/1208] i965/vs: Fix matNxM vertex attributes where M != 4.

Matrix vertex attributes have their columns padded out to vec4s, which
I was failing to account for.  Scalar NIR expects them to be packed,
however.

Fixes 1256 dEQP tests on Broadwell.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Tested-by: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index e01dfa88d52..5d1ea21884a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -91,12 +91,19 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
           * So, we need to copy from fs_reg(ATTR, var->location) to
           * offset(nir_inputs, var->data.driver_location).
           */
-         unsigned components = var->type->without_array()->components();
+         const glsl_type *const t = var->type->without_array();
+         const unsigned components = t->components();
+         const unsigned cols = t->matrix_columns;
+         const unsigned elts = t->vector_elements;
          unsigned array_length = var->type->is_array() ? var->type->length : 1;
          for (unsigned i = 0; i < array_length; i++) {
-            for (unsigned j = 0; j < components; j++) {
-               bld.MOV(retype(offset(input, bld, components * i + j), type),
-                       offset(fs_reg(ATTR, var->data.location + i, type), bld, j));
+            for (unsigned j = 0; j < cols; j++) {
+               for (unsigned k = 0; k < elts; k++) {
+                  bld.MOV(offset(retype(input, type), bld,
+                                 components * i + elts * j + k),
+                          offset(fs_reg(ATTR, var->data.location + i, type),
+                                 bld, 4 * j + k));
+               }
             }
          }
          break;

From 87d2e15b1aa6f438983405aa25bf067034c898b0 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 8 Jul 2015 09:20:40 +1000
Subject: [PATCH 0223/1208] mesa: use implementation specified
 MAX_VERTEX_ATTRIBS rather than hardcoded value

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/linker.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 8d4b40e4f0c..b7a783c0984 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3070,12 +3070,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   /* FINISHME: The value of the max_attribute_index parameter is
-    * FINISHME: implementation dependent based on the value of
-    * FINISHME: GL_MAX_VERTEX_ATTRIBS.  GL_MAX_VERTEX_ATTRIBS must be
-    * FINISHME: at least 16, so hardcode 16 for now.
-    */
-   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_VERTEX, 16)) {
+   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_VERTEX,
+                                            ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs)) {
       goto done;
    }
 

From 38c2ec5ff0bf626578db7b84387279342aa48844 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 7 Jul 2015 23:05:45 -0400
Subject: [PATCH 0224/1208] nvc0: turn sample counts off during blit

Fixes the following piglits:
  occlusion_query_meta_fragments
  occlusion_query_meta_no_fragments

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index a820de7259a..ac4dd25bcfa 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1376,6 +1376,7 @@ static void
 nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    boolean eng3d = FALSE;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
@@ -1439,11 +1440,17 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
         info->src.box.height != -info->dst.box.height))
       eng3d = TRUE;
 
+   if (nvc0->screen->num_occlusion_queries_active)
+      IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
+
    if (!eng3d)
       nvc0_blit_eng2d(nvc0, info);
    else
       nvc0_blit_3d(nvc0, info);
 
+   if (nvc0->screen->num_occlusion_queries_active)
+      IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+
    NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_blit_count, 1);
 }
 

From c8d3ebaffc0d7d915c1c19d54dba61fd1e57b338 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 29 Apr 2015 13:32:38 +0100
Subject: [PATCH 0225/1208] i965: Query whether we have kernel support for the
 TIMESTAMP register once

Move the query for the TIMESTAMP register from context init to the
screen, so that it is only queried once for all contexts.

On 32bit systems, some old kernels trigger a hw bug resulting in the
TIMESTAMP register being shifted and the low 32bits always zero. Detect
this by repeating the read a few times and check the register is
incrementing every 80ns as expected and not stuck on zero (as would be
the case with the buggy kernel/hw.).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Martin Peres <martin.peres@linux.intel.com>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_extensions.c |  6 +-----
 src/mesa/drivers/dri/i965/intel_screen.c     | 22 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/intel_screen.h     |  2 ++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 3423190c485..740ac8128bb 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -282,8 +282,6 @@ intelInitExtensions(struct gl_context *ctx)
    }
 
    if (brw->gen >= 6) {
-      uint64_t dummy;
-
       ctx->Extensions.ARB_blend_func_extended =
          !driQueryOptionb(&brw->optionCache, "disable_blend_func_extended");
       ctx->Extensions.ARB_conditional_render_inverted = true;
@@ -307,9 +305,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.EXT_transform_feedback = true;
       ctx->Extensions.OES_depth_texture_cube_map = true;
 
-      /* Test if the kernel has the ioctl. */
-      if (drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &dummy) == 0)
-         ctx->Extensions.ARB_timer_query = true;
+      ctx->Extensions.ARB_timer_query = brw->intelScreen->hw_has_timestamp;
 
       /* Only enable this in core profile because other parts of Mesa behave
        * slightly differently when the extension is enabled.
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index f9398d7859e..c0f5c927ed8 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1123,6 +1123,27 @@ intel_detect_swizzling(struct intel_screen *screen)
       return true;
 }
 
+static bool
+intel_detect_timestamp(struct intel_screen *screen)
+{
+   uint64_t dummy = 0;
+   int loop = 10;
+
+   /*
+    * On 32bit systems, some old kernels trigger a hw bug resulting in the
+    * TIMESTAMP register being shifted and the low 32bits always zero. Detect
+    * this by repeating the read a few times and check the register is
+    * incrementing every 80ns as expected and not stuck on zero (as would be
+    * the case with the buggy kernel/hw.).
+    */
+   do {
+      if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &dummy))
+	 return false;
+   } while ((dummy & 0xffffffff) == 0 && --loop);
+
+   return loop > 0;
+}
+
 /**
  * Return array of MSAA modes supported by the hardware. The array is
  * zero-terminated and sorted in decreasing order.
@@ -1378,6 +1399,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
+   intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
 
    const char *force_msaa = getenv("INTEL_FORCE_MSAA");
    if (force_msaa) {
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 742b3d30eee..941e0fcc752 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -52,6 +52,8 @@ struct intel_screen
 
    bool hw_has_swizzling;
 
+   bool hw_has_timestamp;
+
    /**
     * Does the kernel support context reset notifications?
     */

From f2413457937f8f4a92e11379569be69e508d7477 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 10 Jun 2015 08:28:13 +0100
Subject: [PATCH 0226/1208] loader: Look for any version of currently linked
 libudev.so

Since there was an ABI break and linking twice against libudev.so.0 and
libudev.so.1 causes the application to quickly crash, we first check if
the application is currently linked against libudev before dlopening a
local handle. However for backwards/forwards compatability, we need to
inspect the application for current linkage against all known versions
first. Not doing so causes a crash when both libraries are present and
so mesa chooses libudev.so.1 but the application was linked against
libudev.so.0.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Emil Velikov:

I'm ever so slightly conserned that RTLD_NOLOAD is not part of the POSIX
standard, thus it's missing on some platforms (*BSD seems ok, while
Solaris, MacOS are not).

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/loader/loader.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/loader/loader.c b/src/loader/loader.c
index 8452cd3560e..8780587a725 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -128,26 +128,36 @@ static void *udev_handle = NULL;
 static void *
 udev_dlopen_handle(void)
 {
-   if (!udev_handle) {
-      udev_handle = dlopen("libudev.so.1", RTLD_LOCAL | RTLD_LAZY);
+   char name[80];
+   unsigned flags = RTLD_NOLOAD | RTLD_LOCAL | RTLD_LAZY;
+   int version;
 
-      if (!udev_handle) {
-         /* libudev.so.1 changed the return types of the two unref functions
-          * from voids to pointers.  We don't use those return values, and the
-          * only ABI I've heard that cares about this kind of change (calling
-          * a function with a void * return that actually only returns void)
-          * might be ia64.
-          */
-         udev_handle = dlopen("libudev.so.0", RTLD_LOCAL | RTLD_LAZY);
+   /* libudev.so.1 changed the return types of the two unref functions
+    * from voids to pointers.  We don't use those return values, and the
+    * only ABI I've heard that cares about this kind of change (calling
+    * a function with a void * return that actually only returns void)
+    * might be ia64.
+    */
 
-         if (!udev_handle) {
-            log_(_LOADER_WARNING, "Couldn't dlopen libudev.so.1 or "
-                 "libudev.so.0, driver detection may be broken.\n");
-         }
+   /* First try opening an already linked libudev, then try loading one */
+   do {
+      for (version = 1; version >= 0; version--) {
+         snprintf(name, sizeof(name), "libudev.so.%d", version);
+         udev_handle = dlopen(name, flags);
+         if (udev_handle)
+            return udev_handle;
       }
-   }
 
-   return udev_handle;
+      if ((flags & RTLD_NOLOAD) == 0)
+         break;
+
+      flags &= ~RTLD_NOLOAD;
+   } while (1);
+
+   log_(_LOADER_WARNING,
+        "Couldn't dlopen libudev.so.1 or "
+        "libudev.so.0, driver detection may be broken.\n");
+   return NULL;
 }
 
 static int dlsym_failed = 0;

From f1d08c4f75794add30d1714a4cd9ce2bf335148d Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 1 May 2015 11:25:20 +0100
Subject: [PATCH 0227/1208] i965: Move pipecontrol workaround bo to
 brw_pipe_control

With the exception of gen8, the sole user of the workaround bo are for
emitting pipe controls. Move it out of the purview of the batchbuffer
and into the pipecontrol.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.c       |  7 ++++
 src/mesa/drivers/dri/i965/brw_context.h       | 12 ++++--
 src/mesa/drivers/dri/i965/brw_pipe_control.c  | 40 ++++++++++++++++---
 src/mesa/drivers/dri/i965/gen8_depth_state.c  |  2 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 12 ------
 src/mesa/drivers/dri/i965/intel_extensions.c  | 28 ++++++-------
 6 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 4b51fe5da56..8150b943bc8 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -819,6 +819,12 @@ brwCreateContext(gl_api api,
       }
    }
 
+   if (brw_init_pipe_control(brw, devinfo)) {
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      intelDestroyContext(driContextPriv);
+      return false;
+   }
+
    brw_init_state(brw);
 
    intelInitExtensions(ctx);
@@ -942,6 +948,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    if (ctx->swrast_context)
       _swrast_DestroyContext(&brw->ctx);
 
+   brw_fini_pipe_control(brw);
    intel_batchbuffer_free(brw);
 
    drm_intel_bo_unreference(brw->throttle_batch[1]);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 759613951d8..65f34c368de 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -869,8 +869,6 @@ struct intel_batchbuffer {
    drm_intel_bo *bo;
    /** Last BO submitted to the hardware.  Used for glFinish(). */
    drm_intel_bo *last_bo;
-   /** BO for post-sync nonzero writes for gen6 workaround. */
-   drm_intel_bo *workaround_bo;
 
    uint16_t emit, total;
    uint16_t used, reserved_space;
@@ -882,8 +880,6 @@ struct intel_batchbuffer {
    enum brw_gpu_ring ring;
    bool needs_sol_reset;
 
-   uint8_t pipe_controls_since_last_cs_stall;
-
    struct {
       uint16_t used;
       int reloc_count;
@@ -1035,6 +1031,10 @@ struct brw_context
 
    drm_intel_context *hw_ctx;
 
+   /** BO for post-sync nonzero writes for gen6 workaround. */
+   drm_intel_bo *workaround_bo;
+   uint8_t pipe_controls_since_last_cs_stall;
+
    /**
     * Set of drm_intel_bo * that have been rendered to within this batchbuffer
     * and would need flushing before being used from another cache domain that
@@ -2001,6 +2001,10 @@ gen9_use_linear_1d_layout(const struct brw_context *brw,
                           const struct intel_mipmap_tree *mt);
 
 /* brw_pipe_control.c */
+int brw_init_pipe_control(struct brw_context *brw,
+			  const struct brw_device_info *info);
+void brw_fini_pipe_control(struct brw_context *brw);
+
 void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
 void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                                  drm_intel_bo *bo, uint32_t offset,
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index b4c86b9dff9..7ee3cb680f7 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -72,13 +72,13 @@ gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
    if (brw->gen == 7 && !brw->is_haswell) {
       if (flags & PIPE_CONTROL_CS_STALL) {
          /* If we're doing a CS stall, reset the counter and carry on. */
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         brw->pipe_controls_since_last_cs_stall = 0;
          return 0;
       }
 
       /* If this is the fourth pipe control without a CS stall, do one now. */
-      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
+      if (++brw->pipe_controls_since_last_cs_stall == 4) {
+         brw->pipe_controls_since_last_cs_stall = 0;
          return PIPE_CONTROL_CS_STALL;
       }
    }
@@ -213,7 +213,7 @@ gen7_emit_vs_workaround_flush(struct brw_context *brw)
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_WRITE_IMMEDIATE
                                | PIPE_CONTROL_DEPTH_STALL,
-                               brw->batch.workaround_bo, 0,
+                               brw->workaround_bo, 0,
                                0, 0);
 }
 
@@ -227,7 +227,7 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_CS_STALL
                                | PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0,
+                               brw->workaround_bo, 0,
                                0, 0);
 }
 
@@ -277,7 +277,7 @@ brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
 
    brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0, 0, 0);
+                               brw->workaround_bo, 0, 0, 0);
 }
 
 /* Emit a pipelined flush to either flush render and texture cache for
@@ -329,3 +329,31 @@ brw_emit_mi_flush(struct brw_context *brw)
 
    brw_render_cache_set_clear(brw);
 }
+
+int
+brw_init_pipe_control(struct brw_context *brw,
+                      const struct brw_device_info *devinfo)
+{
+   if (devinfo->gen < 6)
+      return 0;
+
+   /* We can't just use brw_state_batch to get a chunk of space for
+    * the gen6 workaround because it involves actually writing to
+    * the buffer, and the kernel doesn't let us write to the batch.
+    */
+   brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
+                                           "pipe_control workaround",
+                                           4096, 4096);
+   if (brw->workaround_bo == NULL)
+      return -ENOMEM;
+
+   brw->pipe_controls_since_last_cs_stall = 0;
+
+   return 0;
+}
+
+void
+brw_fini_pipe_control(struct brw_context *brw)
+{
+   drm_intel_bo_unreference(brw->workaround_bo);
+}
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index bc05a310544..8f23702d66b 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -496,7 +496,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
     */
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0, 0, 0);
+                               brw->workaround_bo, 0, 0, 0);
 
    /* Emit 3DSTATE_WM_HZ_OP again to disable the state overrides. */
    BEGIN_BATCH(5);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 54081a1412f..969d92c2c5f 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -44,16 +44,6 @@ intel_batchbuffer_init(struct brw_context *brw)
 {
    intel_batchbuffer_reset(brw);
 
-   if (brw->gen >= 6) {
-      /* We can't just use brw_state_batch to get a chunk of space for
-       * the gen6 workaround because it involves actually writing to
-       * the buffer, and the kernel doesn't let us write to the batch.
-       */
-      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
-						      "pipe_control workaround",
-						      4096, 4096);
-   }
-
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
@@ -82,7 +72,6 @@ intel_batchbuffer_reset(struct brw_context *brw)
    brw->batch.state_batch_offset = brw->batch.bo->size;
    brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
-   brw->batch.pipe_controls_since_last_cs_stall = 0;
 
    /* We don't know what ring the new batch will be sent to until we see the
     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
@@ -114,7 +103,6 @@ intel_batchbuffer_free(struct brw_context *brw)
    free(brw->batch.cpu_map);
    drm_intel_bo_unreference(brw->batch.last_bo);
    drm_intel_bo_unreference(brw->batch.bo);
-   drm_intel_bo_unreference(brw->batch.workaround_bo);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 740ac8128bb..6b3bd12ff54 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -64,10 +64,10 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->batch.workaround_bo, true);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, true);
+   data = brw->workaround_bo->virtual;
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    /* Write the register. */
    BEGIN_BATCH(3);
@@ -82,7 +82,7 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    BEGIN_BATCH(3);
    OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
    OUT_BATCH(reg);
-   OUT_RELOC(brw->batch.workaround_bo,
+   OUT_RELOC(brw->workaround_bo,
              I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
@@ -90,10 +90,10 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    intel_batchbuffer_flush(brw);
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->batch.workaround_bo, false);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, false);
+   data = brw->workaround_bo->virtual;
    bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    result = success;
 
@@ -120,10 +120,10 @@ can_write_oacontrol(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->batch.workaround_bo, true);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, true);
+   data = brw->workaround_bo->virtual;
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    /* Write OACONTROL. */
    BEGIN_BATCH(3);
@@ -138,7 +138,7 @@ can_write_oacontrol(struct brw_context *brw)
    BEGIN_BATCH(3);
    OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
    OUT_BATCH(OACONTROL);
-   OUT_RELOC(brw->batch.workaround_bo,
+   OUT_RELOC(brw->workaround_bo,
              I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
@@ -155,10 +155,10 @@ can_write_oacontrol(struct brw_context *brw)
    intel_batchbuffer_flush(brw);
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->batch.workaround_bo, false);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, false);
+   data = brw->workaround_bo->virtual;
    bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    result = success;
 

From 64cb014037551c4b7bbed1cf2ca8f1126c970146 Mon Sep 17 00:00:00 2001
From: Varad Gautam <varadgautam@gmail.com>
Date: Sat, 27 Jun 2015 11:32:26 +0530
Subject: [PATCH 0228/1208] android: freedreno: add missing components to the
 build

Freedreno requires {a4xx,ir3}_SOURCES and NIR to build.

Signed-off-by: Varad Gautam <varadgautam@gmail.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/drivers/freedreno/Android.mk | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index a6712b2c115..ed51835e1fb 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -28,7 +28,9 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(a2xx_SOURCES) \
-	$(a3xx_SOURCES)
+	$(a3xx_SOURCES)	\
+	$(a4xx_SOURCES) \
+	$(ir3_SOURCES)
 
 LOCAL_CFLAGS := \
 	-Wno-packed-bitfield-compat
@@ -37,6 +39,7 @@ LOCAL_C_INCLUDES := \
 	$(LOCAL_PATH)/ir3
 
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_freedreno
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
 LOCAL_MODULE := libmesa_pipe_freedreno
 
 include $(GALLIUM_COMMON_MK)

From c7f3657450683827446072ad6b1e8fce04078162 Mon Sep 17 00:00:00 2001
From: Julien Isorce <julien.isorce@gmail.com>
Date: Wed, 1 Jul 2015 00:33:14 +0100
Subject: [PATCH 0229/1208] darwin: Suppress type conversion warnings for
 GLhandleARB

This patch and its description are inspired from Jose Fonseca
explanations and suggestions.

With this patch the following logic applies and only if __APPLE__:

When building mesa, GLhandleARB is defined as unsigned long and
at some point casted to GLuint in gl fuction implementations.
These exact points are where these errors and warnings appear.

When building an application GLhandleARB is defined as void*.
Later when calling a gl function, for example glBindAttribLocationARB,
it will be dispatched to _mesa_BindAttribLocation. So internally
void* will be treated as unsigned long which has the same size.
So the same truncation happens when casting it to GLuint.

Same when GLhandleARB appears as return value.
For mesa it will be GLuint -> unsigned long.
For an application it will be GLuint -> unsigned long -> void*.
Note that the value will be preserved when casting back to GLuint.

When GLhandleARB appears as a pointer there are also separate
entry-points, i.e. _mesa_FuncNameARB. So the same logic can
be applied.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=66346
Signed-off-by: Julien Isorce <julien.isorce@gmail.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac       | 2 +-
 include/GL/glext.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index d240c060573..d819beffa50 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1354,7 +1354,7 @@ if test "x$enable_dri" = xyes; then
         fi
         ;;
     darwin*)
-        DEFINES="$DEFINES -DGLX_ALIAS_UNSUPPORTED"
+        DEFINES="$DEFINES -DGLX_ALIAS_UNSUPPORTED -DBUILDING_MESA"
         if test "x$with_dri_drivers" = "xyes"; then
             with_dri_drivers="swrast"
         fi
diff --git a/include/GL/glext.h b/include/GL/glext.h
index a3873a613f9..e5f1d891ec5 100644
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -3879,7 +3879,12 @@ GLAPI void APIENTRY glMinSampleShadingARB (GLfloat value);
 #ifndef GL_ARB_shader_objects
 #define GL_ARB_shader_objects 1
 #ifdef __APPLE__
+#ifdef BUILDING_MESA
+/* Avoid uint <-> void* warnings */
+typedef unsigned long GLhandleARB;
+#else
 typedef void *GLhandleARB;
+#endif
 #else
 typedef unsigned int GLhandleARB;
 #endif

From 7d642442d9339e5b65c30802c44091816cdf18be Mon Sep 17 00:00:00 2001
From: Julien Isorce <j.isorce@samsung.com>
Date: Thu, 2 Jul 2015 23:10:38 +0100
Subject: [PATCH 0230/1208] egl: use unix defines on osx with clang

I also created an bug in Khronos 's bugzilla as you suggested:
https://www.khronos.org/bugzilla/show_bug.cgi?id=1356
I'll let you know if I get feedback from this bug or else where.

Patch with updated error messages:

[PATCH] eglplatform: treat __APPLE__ the same way as __unix__ to handle X11 types

  CC       eglapi.lo
./egldisplay.h:258:19: error: unknown type name 'Display'
_eglGetX11Display(Display *native_display, const EGLint *attrib_list);
eglapi.c:290:4: error: array size is negative
   STATIC_ASSERT(sizeof(void*) == sizeof(nativeDisplay));
eglapi.c:291:25: warning: cast to 'void *' from smaller integer type
   'EGLNativeDisplayType' (aka 'int') [-Wint-to-void-pointer-cast]
   native_display_ptr = (void*) nativeDisplay;
eglapi.c:307:32: error: use of undeclared identifier 'Display'
      dpy = _eglGetX11Display((Display*) native_display, attrib_list);
eglapi.c:776:35: error: use of undeclared identifier 'Window'
      native_window = (void*) (* (Window*) native_window);
eglapi.c:847:35: error: use of undeclared identifier 'Pixmap'
      native_pixmap = (void*) (* (Pixmap*) native_pixmap);

Bugzilla Mesa: https://bugs.freedesktop.org/show_bug.cgi?id=90249
Bugzilla Khronos: https://www.khronos.org/bugzilla/show_bug.cgi?id=1356
Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 include/EGL/eglplatform.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/EGL/eglplatform.h b/include/EGL/eglplatform.h
index 7802542ad0f..b376e642822 100644
--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -77,7 +77,7 @@ typedef HDC     EGLNativeDisplayType;
 typedef HBITMAP EGLNativePixmapType;
 typedef HWND    EGLNativeWindowType;
 
-#elif defined(__APPLE__) || defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Symbian */
+#elif defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Symbian */
 
 typedef int   EGLNativeDisplayType;
 typedef void *EGLNativeWindowType;
@@ -105,7 +105,7 @@ typedef struct ANativeWindow*           EGLNativeWindowType;
 typedef struct egl_native_pixmap_t*     EGLNativePixmapType;
 typedef void*                           EGLNativeDisplayType;
 
-#elif defined(__unix__)
+#elif defined(__unix__) || defined(__APPLE__)
 
 #if defined(MESA_EGL_NO_X11_HEADERS)
 

From e27ea996444743b8cbdca096a4aab47dd405ebf9 Mon Sep 17 00:00:00 2001
From: Julien Isorce <julien.isorce@gmail.com>
Date: Thu, 18 Jun 2015 06:53:52 +0100
Subject: [PATCH 0231/1208] egl/dri2: load libglapi.0.dylib on osx

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90903
Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/egl_dri2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 400ee635e77..65194cb990c 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -2368,6 +2368,8 @@ dri2_load(_EGLDriver *drv)
 #ifdef HAVE_SHARED_GLAPI
 #ifdef HAVE_ANDROID_PLATFORM
    const char *libname = "libglapi.so";
+#elif defined(__APPLE__)
+   const char *libname = "libglapi.0.dylib";
 #else
    const char *libname = "libglapi.so.0";
 #endif

From efb36271a92b44ee0e35c4f833610dbea776badd Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 8 Jul 2015 01:57:00 -0700
Subject: [PATCH 0232/1208] nir: Fix comment above nir_convert_from_ssa()
 prototype.

Connor renamed the parameter, inverting the sense.
Update the comment accordingly.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 4cb7d2f1eac..9e2a2819199 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1659,9 +1659,9 @@ bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
 void nir_convert_to_ssa(nir_shader *shader);
 
-/* If convert_everything is true, convert all values (even those not involved
- * in a phi node) to registers. If false, only convert SSA values involved in
- * phi nodes to registers.
+/* If phi_webs_only is true, only convert SSA values involved in phi nodes to
+ * registers.  If false, convert all values (even those not involved in a phi
+ * node) to registers.
  */
 void nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only);
 

From ec151e2f72bd4a239573770aea563d47d0268708 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 6 Jul 2015 22:06:08 +0200
Subject: [PATCH 0233/1208] nvc0: fix wrong use of BLIT_SRC_Y_INT for 2D
 texture copy

According to nv50, this should be src->ms_y instead of src->ms_x. This
code is here since 2012, so it's probably a typo error which has never
been detected since a long time. I didn't do a full piglit run to check
if it fixes some other weird issues.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index ac4dd25bcfa..3c17f1663ed 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -189,7 +189,7 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push,
    PUSH_DATA (push, 0);
    PUSH_DATA (push, sx << src->ms_x);
    PUSH_DATA (push, 0);
-   PUSH_DATA (push, sy << src->ms_x);
+   PUSH_DATA (push, sy << src->ms_y);
 
    return 0;
 }

From adc816a1e41812e6489a5bc388f80de65504be5b Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 6 Jul 2015 23:34:23 +0200
Subject: [PATCH 0234/1208] nv50: avoid segfault with enabled but unbound
 vertex attrib

Before validating vertex arrays we need to check if a VBO is present.
Checking if vb->buffer is not NULL fixes the issue.

Fixes the following piglit test:
  gl-3.1-vao-broken-attrib

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 1fd33b8aa59..3d200bd65e8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -382,6 +382,11 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
       if (nv50->vbo_user & (1 << b)) {
          address = addrs[b] + ve->pipe.src_offset;
          limit = addrs[b] + limits[b];
+      } else
+      if (!vb->buffer) {
+         BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
+         PUSH_DATA (push, 0);
+         continue;
       } else {
          struct nv04_resource *buf = nv04_resource(vb->buffer);
          if (!(refd & (1 << b))) {

From bbfdf5c17b695c31915e293e1ec858cbcb340894 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Fri, 29 May 2015 15:10:31 +0200
Subject: [PATCH 0235/1208] vl: cleanup video buffer private when the decoder
 is destroyed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=90728

Signed-off-by: Christian König <christian.koenig@amd.com>
CC: mesa-stable@lists.freedesktop.org
---
 src/gallium/auxiliary/vl/vl_mpeg12_decoder.c | 24 ++++++++++++++++++++
 src/gallium/auxiliary/vl/vl_mpeg12_decoder.h |  4 ++++
 2 files changed, 28 insertions(+)

diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index 8579460e070..a3ad6e6623a 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -84,6 +84,9 @@ static const unsigned const_empty_block_mask_420[3][2][2] = {
 
 struct video_buffer_private
 {
+   struct list_head list;
+   struct pipe_video_buffer *video_buffer;
+
    struct pipe_sampler_view *sampler_view_planes[VL_NUM_COMPONENTS];
    struct pipe_surface      *surfaces[VL_MAX_SURFACES];
 
@@ -99,6 +102,8 @@ destroy_video_buffer_private(void *private)
    struct video_buffer_private *priv = private;
    unsigned i;
 
+   list_del(&priv->list);
+
    for (i = 0; i < VL_NUM_COMPONENTS; ++i)
       pipe_sampler_view_reference(&priv->sampler_view_planes[i], NULL);
 
@@ -126,6 +131,9 @@ get_video_buffer_private(struct vl_mpeg12_decoder *dec, struct pipe_video_buffer
 
    priv = CALLOC_STRUCT(video_buffer_private);
 
+   list_add(&priv->list, &dec->buffer_privates);
+   priv->video_buffer = buf;
+
    sv = buf->get_sampler_view_planes(buf);
    for (i = 0; i < VL_NUM_COMPONENTS; ++i)
       if (sv[i])
@@ -141,6 +149,18 @@ get_video_buffer_private(struct vl_mpeg12_decoder *dec, struct pipe_video_buffer
    return priv;
 }
 
+static void
+free_video_buffer_privates(struct vl_mpeg12_decoder *dec)
+{
+   struct video_buffer_private *priv, *next;
+
+   LIST_FOR_EACH_ENTRY_SAFE(priv, next, &dec->buffer_privates, list) {
+      struct pipe_video_buffer *buf = priv->video_buffer;
+
+      vl_video_buffer_set_associated_data(buf, &dec->base, NULL, NULL);
+   }
+}
+
 static bool
 init_zscan_buffer(struct vl_mpeg12_decoder *dec, struct vl_mpeg12_buffer *buffer)
 {
@@ -464,6 +484,8 @@ vl_mpeg12_destroy(struct pipe_video_codec *decoder)
 
    assert(decoder);
 
+   free_video_buffer_privates(dec);
+
    /* Asserted in softpipe_delete_fs_state() for some reason */
    dec->context->bind_vs_state(dec->context, NULL);
    dec->context->bind_fs_state(dec->context, NULL);
@@ -1187,6 +1209,8 @@ vl_create_mpeg12_decoder(struct pipe_context *context,
    if (!init_pipe_state(dec))
       goto error_pipe_state;
 
+   list_inithead(&dec->buffer_privates);
+
    return &dec->base;
 
 error_pipe_state:
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
index 2a604054387..505dd675f66 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
@@ -30,6 +30,8 @@
 
 #include "pipe/p_video_codec.h"
 
+#include "util/list.h"
+
 #include "vl_mpeg12_bitstream.h"
 #include "vl_zscan.h"
 #include "vl_idct.h"
@@ -77,6 +79,8 @@ struct vl_mpeg12_decoder
 
    unsigned current_buffer;
    struct vl_mpeg12_buffer *dec_buffers[4];
+
+   struct list_head buffer_privates;
 };
 
 struct vl_mpeg12_buffer

From 2cfa64e159a68998b76bdbcd20f8c7810379fce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Mon, 29 Jun 2015 10:19:36 +0200
Subject: [PATCH 0236/1208] st/vdpau: fix mixer size checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to check what the 3D pipe is able to handle for the mixer, not what
the decoder is able to decode. This fixes output of resolutions like 720x1280.

Signed-off-by: Christian König <christian.koenig@amd.com>
CC: mesa-stable@lists.freedesktop.org
---
 src/gallium/state_trackers/vdpau/mixer.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/gallium/state_trackers/vdpau/mixer.c b/src/gallium/state_trackers/vdpau/mixer.c
index 4118eb86997..c0b1ecc55fa 100644
--- a/src/gallium/state_trackers/vdpau/mixer.c
+++ b/src/gallium/state_trackers/vdpau/mixer.c
@@ -49,7 +49,8 @@ vlVdpVideoMixerCreate(VdpDevice device,
    vlVdpVideoMixer *vmixer = NULL;
    VdpStatus ret;
    struct pipe_screen *screen;
-   unsigned max_width, max_height, i;
+   uint32_t max_2d_texture_level;
+   unsigned max_size, i;
 
    vlVdpDevice *dev = vlGetDataHTAB(device);
    if (!dev)
@@ -134,18 +135,17 @@ vlVdpVideoMixerCreate(VdpDevice device,
       VDPAU_MSG(VDPAU_WARN, "[VDPAU] Max layers > 4 not supported\n", vmixer->max_layers);
       goto no_params;
    }
-   max_width = screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN,
-                                       PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_MAX_WIDTH);
-   max_height = screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN,
-                                        PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_MAX_HEIGHT);
-   if (vmixer->video_width < 48 ||
-       vmixer->video_width > max_width) {
-      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u not valid for width\n", vmixer->video_width, max_width);
+
+   max_2d_texture_level = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   max_size = pow(2, max_2d_texture_level-1);
+   if (vmixer->video_width < 48 || vmixer->video_width > max_size) {
+      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u not valid for width\n",
+                vmixer->video_width, max_size);
       goto no_params;
    }
-   if (vmixer->video_height < 48 ||
-       vmixer->video_height > max_height) {
-      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u  not valid for height\n", vmixer->video_height, max_height);
+   if (vmixer->video_height < 48 || vmixer->video_height > max_size) {
+      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u  not valid for height\n",
+                vmixer->video_height, max_size);
       goto no_params;
    }
    vmixer->luma_key_min = 0.f;

From 0166b4c165271bd7525a91049e58e390cb596c60 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Thu, 9 Jul 2015 10:35:19 -0700
Subject: [PATCH 0237/1208] i965/hsw: Implement end of batch workaround

This patch can cause an infinite recursion if the previous patch titled, "i965:
Track finished batch state" isn't present (backporters take notice).

v2: Sent out the wrong patch originally. This patches switches the order of
flushes, doing the generic flush before the CC_STATE, and the required
workaround flush afterwards

v3: Only perform workaround for render ring
Add text to the BATCH_RESERVE comments

v4 (By Ken): Rebase; update citation to mention PRM and Wa name; combine two
blocks.

http://otc-mesa-ci.jf.intel.com/job/bwidawsk/171/

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 27 +++++++++++++++++--
 src/mesa/drivers/dri/i965/intel_batchbuffer.h |  4 +++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 969d92c2c5f..d93ee6eed8a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -32,6 +32,7 @@
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "brw_context.h"
+#include "brw_defines.h"
 
 #include <xf86drm.h>
 #include <i915_drm.h>
@@ -206,10 +207,32 @@ brw_finish_batch(struct brw_context *brw)
     */
    brw_emit_query_end(brw);
 
-   /* We may also need to snapshot and disable OA counters. */
-   if (brw->batch.ring == RENDER_RING)
+   if (brw->batch.ring == RENDER_RING) {
+      /* We may also need to snapshot and disable OA counters. */
       brw_perf_monitor_finish_batch(brw);
 
+      if (brw->is_haswell) {
+         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
+          * 3DSTATE_CC_STATE_POINTERS > "Note":
+          *
+          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
+          *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
+          *
+          * From the example in the docs, it seems to expect a regular pipe control
+          * flush here as well. We may have done it already, but meh.
+          *
+          * See also WaAvoidRCZCounterRollover.
+          */
+         brw_emit_mi_flush(brw);
+         BEGIN_BATCH(2);
+         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
+         OUT_BATCH(brw->cc.state_offset | 1);
+         ADVANCE_BATCH();
+         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                          PIPE_CONTROL_CS_STALL);
+      }
+   }
+
    /* Mark that the current program cache BO has been used by the GPU.
     * It will be reallocated if we need to put new programs in for the
     * next batch.
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index fdd07e0a117..8eaedd1896b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -26,6 +26,10 @@ extern "C" {
  *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
  *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
  *       Sandybridge PIPE_CONTROL madness.
+ *   - CC_STATE workaround on HSW (12 * 4 = 48 bytes)
+ *     - 5 dwords for initial mi_flush
+ *     - 2 dwords for CC state setup
+ *     - 5 dwords for the required pipe control at the end
  */
 #define BATCH_RESERVED 152
 

From c04339486a26b7bee3575bf30dde4f7152a70211 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 7 Jul 2015 18:51:30 -0700
Subject: [PATCH 0238/1208] i965: Set brw->batch.emit only #ifdef DEBUG.

It's only used inside #ifdef DEBUG. Cuts ~1.7k of .text, and more
importantly prevents a larger code size regression in the next commit
when the .used field is replaced and calculated on demand.

   text     data      bss      dec      hex  filename
4945468   195152    26192  5166812   4ed6dc  i965_dri.so before
4943740   195152    26192  5165084   4ed01c  i965_dri.so after

And surround the emit and total fields with #ifdef DEBUG to prevent
such mistakes from happening again.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_context.h       | 2 ++
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 65f34c368de..44d1aeaf54a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -870,7 +870,9 @@ struct intel_batchbuffer {
    /** Last BO submitted to the hardware.  Used for glFinish(). */
    drm_intel_bo *last_bo;
 
+#ifdef DEBUG
    uint16_t emit, total;
+#endif
    uint16_t used, reserved_space;
    uint32_t *map;
    uint32_t *cpu_map;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 8eaedd1896b..e58eae4115d 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -138,8 +138,8 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
 {
    intel_batchbuffer_require_space(brw, n * 4, ring);
 
-   brw->batch.emit = brw->batch.used;
 #ifdef DEBUG
+   brw->batch.emit = brw->batch.used;
    brw->batch.total = n;
 #endif
 }

From a2dde3a8dabbbd45fb3155771bc1802866ff5f61 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 7 Jul 2015 23:33:57 -0700
Subject: [PATCH 0239/1208] util: Don't link to SHA1 library if shader-cache is
 disabled.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/util/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/Makefile.am b/src/util/Makefile.am
index 2e7542e4245..1e087b40d38 100644
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@@ -46,9 +46,9 @@ libmesautil_la_SOURCES = \
 
 if ENABLE_SHADER_CACHE
 libmesautil_la_SOURCES += $(MESA_UTIL_SHADER_CACHE_FILES)
-endif
 
 libmesautil_la_LIBADD = $(SHA1_LIBS)
+endif
 
 roundeven_test_LDADD = -lm
 

From 27d8a690c41748b39c65d1ff51bb63e9f860bae1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 8 Jul 2015 15:56:15 -0600
Subject: [PATCH 0240/1208] gallium/docs: s/treaded/treated/ typo in tgsi.rst

Trivial.
---
 src/gallium/docs/source/tgsi.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 4e869e72b24..314c9ca8fa2 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2591,7 +2591,7 @@ Array Declaration
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 Declarations can optional have an ArrayID attribute which can be referred by
-indirect addressing operands. An ArrayID of zero is reserved and treaded as
+indirect addressing operands. An ArrayID of zero is reserved and treated as
 if no ArrayID is specified.
 
 If an indirect addressing operand refers to a specific declaration by using

From 1f02a82c8bcac67ced81243631bad6ee1bb810ee Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 8 Jul 2015 18:05:27 -0600
Subject: [PATCH 0241/1208] gallium: fix comment typo in p_shader_tokens.h

---
 src/gallium/include/pipe/p_shader_tokens.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index bb57e805c29..6e07b2c5c7c 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -685,7 +685,7 @@ struct tgsi_src_register
  *
  * File, Index and Swizzle are handled the same as in tgsi_src_register.
  *
- * If ArrayID is zero the whole register file might be is indirectly addressed,
+ * If ArrayID is zero the whole register file might be indirectly addressed,
  * if not only the Declaration with this ArrayID is accessed by this operand.
  *
  */

From 04a57a7ee92403a1d9e01eada69f1ab133fc0b47 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 9 Jul 2015 16:58:04 -0600
Subject: [PATCH 0242/1208] tgsi: whitespace fixes in tgsi_parse.c

Trivial.
---
 src/gallium/auxiliary/tgsi/tgsi_parse.c | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 1162b265522..b8873b24241 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -36,7 +36,7 @@ tgsi_parse_init(
    const struct tgsi_token *tokens )
 {
    ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[0];
-   if( ctx->FullHeader.Header.HeaderSize >= 2 ) {
+   if (ctx->FullHeader.Header.HeaderSize >= 2) {
       ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[1];
    }
    else {
@@ -113,11 +113,11 @@ tgsi_parse_token(
          next_token(ctx, &decl->Dim);
       }
 
-      if( decl->Declaration.Interpolate ) {
+      if (decl->Declaration.Interpolate) {
          next_token( ctx, &decl->Interp );
       }
 
-      if( decl->Declaration.Semantic ) {
+      if (decl->Declaration.Semantic) {
          next_token( ctx, &decl->Semantic );
       }
 
@@ -129,7 +129,7 @@ tgsi_parse_token(
          next_token(ctx, &decl->SamplerView);
       }
 
-      if( decl->Declaration.Array ) {
+      if (decl->Declaration.Array) {
          next_token(ctx, &decl->Array);
       }
 
@@ -190,21 +190,21 @@ tgsi_parse_token(
 
       if (inst->Instruction.Texture) {
          next_token( ctx, &inst->Texture);
-         for( i = 0; i < inst->Texture.NumOffsets; i++ ) {
+         for (i = 0; i < inst->Texture.NumOffsets; i++) {
             next_token( ctx, &inst->TexOffsets[i] );
          }
       }
 
       assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
 
-      for(  i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+      for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
 
          next_token( ctx, &inst->Dst[i].Register );
 
-         if( inst->Dst[i].Register.Indirect )
+         if (inst->Dst[i].Register.Indirect)
             next_token( ctx, &inst->Dst[i].Indirect );
 
-         if( inst->Dst[i].Register.Dimension ) {
+         if (inst->Dst[i].Register.Dimension) {
             next_token( ctx, &inst->Dst[i].Dimension );
 
             /*
@@ -212,21 +212,21 @@ tgsi_parse_token(
              */
             assert( !inst->Dst[i].Dimension.Dimension );
 
-            if( inst->Dst[i].Dimension.Indirect )
+            if (inst->Dst[i].Dimension.Indirect)
                next_token( ctx, &inst->Dst[i].DimIndirect );
          }
       }
 
       assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
 
-      for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+      for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 
          next_token( ctx, &inst->Src[i].Register );
 
-         if( inst->Src[i].Register.Indirect )
+         if (inst->Src[i].Register.Indirect)
             next_token( ctx, &inst->Src[i].Indirect );
 
-         if( inst->Src[i].Register.Dimension ) {
+         if (inst->Src[i].Register.Dimension) {
             next_token( ctx, &inst->Src[i].Dimension );
 
             /*
@@ -234,7 +234,7 @@ tgsi_parse_token(
              */
             assert( !inst->Src[i].Dimension.Dimension );
 
-            if( inst->Src[i].Dimension.Indirect )
+            if (inst->Src[i].Dimension.Indirect)
                next_token( ctx, &inst->Src[i].DimIndirect );
          }
       }

From 308c0bf74307af0f3385cdcbb00aa0534ec3e5da Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 12 Mar 2015 10:43:23 -0700
Subject: [PATCH 0243/1208] i965: Switch on shader stage in
 nir_setup_outputs().

Adding new shader stages to a switch statement is less confusing than an
if-else-if ladder where all but the first case are fragment shader
specific (but don't claim to be).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 57 +++++++++++++-----------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 5d1ea21884a..10903a11c31 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -141,38 +141,45 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
          var->type->is_array() ? var->type->fields.array->vector_elements
                                : var->type->vector_elements;
 
-      if (stage == MESA_SHADER_VERTEX) {
+      switch (stage) {
+      case MESA_SHADER_VERTEX:
          for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
             int output = var->data.location + i;
             this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
          }
-      } else if (var->data.index > 0) {
-         assert(var->data.location == FRAG_RESULT_DATA0);
-         assert(var->data.index == 1);
-         this->dual_src_output = reg;
-         this->do_dual_src = true;
-      } else if (var->data.location == FRAG_RESULT_COLOR) {
-         /* Writing gl_FragColor outputs to all color regions. */
-         for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
-            this->outputs[i] = reg;
-            this->output_components[i] = 4;
-         }
-      } else if (var->data.location == FRAG_RESULT_DEPTH) {
-         this->frag_depth = reg;
-      } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
-         this->sample_mask = reg;
-      } else {
-         /* gl_FragData or a user-defined FS output */
-         assert(var->data.location >= FRAG_RESULT_DATA0 &&
-                var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
+         break;
+      case MESA_SHADER_FRAGMENT:
+         if (var->data.index > 0) {
+            assert(var->data.location == FRAG_RESULT_DATA0);
+            assert(var->data.index == 1);
+            this->dual_src_output = reg;
+            this->do_dual_src = true;
+         } else if (var->data.location == FRAG_RESULT_COLOR) {
+            /* Writing gl_FragColor outputs to all color regions. */
+            for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
+               this->outputs[i] = reg;
+               this->output_components[i] = 4;
+            }
+         } else if (var->data.location == FRAG_RESULT_DEPTH) {
+            this->frag_depth = reg;
+         } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
+            this->sample_mask = reg;
+         } else {
+            /* gl_FragData or a user-defined FS output */
+            assert(var->data.location >= FRAG_RESULT_DATA0 &&
+                   var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS);
 
-         /* General color output. */
-         for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
-            int output = var->data.location - FRAG_RESULT_DATA0 + i;
-            this->outputs[output] = offset(reg, bld, vector_elements * i);
-            this->output_components[output] = vector_elements;
+            /* General color output. */
+            for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
+               int output = var->data.location - FRAG_RESULT_DATA0 + i;
+               this->outputs[output] = offset(reg, bld, vector_elements * i);
+               this->output_components[output] = vector_elements;
+            }
          }
+         break;
+      default:
+         unreachable("unhandled shader stage");
       }
    }
 }

From 0163c99e8f6959b5d6c7a937a322127cfdf9315f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 29 Jun 2015 21:58:47 -0700
Subject: [PATCH 0244/1208] i965/vec4: Plumb log_data through so the
 backend_shader field gets set.

Jason plumbed this through a while back in the FS backend, but
apparently we were just passing NULL in the vec4 backend.

This patch passes brw in as intended.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp            |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4.h              |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 10 ++++++----
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h   |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp    |  3 ++-
 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp |  4 +++-
 src/mesa/drivers/dri/i965/brw_vs.h                |  1 +
 src/mesa/drivers/dri/i965/gen6_gs_visitor.h       |  4 +++-
 8 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index a5c686ceaaf..2a565645ce2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1940,7 +1940,7 @@ brw_vs_emit(struct brw_context *brw,
    if (!assembly) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-      vec4_vs_visitor v(brw->intelScreen->compiler,
+      vec4_vs_visitor v(brw->intelScreen->compiler, brw,
                         c, prog_data, prog, mem_ctx, st_index,
                         !_mesa_is_gles3(&brw->ctx));
       if (!v.run(brw_select_clip_planes(&brw->ctx))) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 2ac16932189..043557b0c9d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -77,6 +77,7 @@ class vec4_visitor : public backend_shader, public ir_visitor
 {
 public:
    vec4_visitor(const struct brw_compiler *compiler,
+                void *log_data,
                 struct brw_vec4_compile *c,
                 struct gl_program *prog,
                 const struct brw_vue_prog_key *key,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 69bcf5afc51..80c59aff6ce 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -35,12 +35,14 @@ const unsigned MAX_GS_INPUT_VERTICES = 6;
 namespace brw {
 
 vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
                                  struct brw_gs_compile *c,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
                                  bool no_spills,
                                  int shader_time_index)
-   : vec4_visitor(compiler, &c->base, &c->gp->program.Base, &c->key.base,
+   : vec4_visitor(compiler, log_data,
+                  &c->base, &c->gp->program.Base, &c->key.base,
                   &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
                   no_spills, shader_time_index),
      c(c)
@@ -662,7 +664,7 @@ brw_gs_emit(struct brw_context *brw,
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
          c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(brw->intelScreen->compiler,
+         vec4_gs_visitor v(brw->intelScreen->compiler, brw,
                            c, prog, mem_ctx, true /* no_spills */, st_index);
          if (v.run(NULL /* clip planes */)) {
             return generate_assembly(brw, prog, &c->gp->program.Base,
@@ -704,11 +706,11 @@ brw_gs_emit(struct brw_context *brw,
    const unsigned *ret = NULL;
 
    if (brw->gen >= 7)
-      gs = new vec4_gs_visitor(brw->intelScreen->compiler,
+      gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw,
                                c, prog, mem_ctx, false /* no_spills */,
                                st_index);
    else
-      gs = new gen6_gs_visitor(brw->intelScreen->compiler,
+      gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw,
                                c, prog, mem_ctx, false /* no_spills */,
                                st_index);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index e693c56b58f..e48d8610574 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -69,6 +69,7 @@ class vec4_gs_visitor : public vec4_visitor
 {
 public:
    vec4_gs_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 8d7a80b19eb..21d9b010b44 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -3678,6 +3678,7 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
 }
 
 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
+                           void *log_data,
                            struct brw_vec4_compile *c,
                            struct gl_program *prog,
                            const struct brw_vue_prog_key *key,
@@ -3687,7 +3688,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
 			   void *mem_ctx,
                            bool no_spills,
                            int shader_time_index)
-   : backend_shader(compiler, NULL, mem_ctx,
+   : backend_shader(compiler, log_data, mem_ctx,
                     shader_prog, prog, &prog_data->base, stage),
      c(c),
      key(key),
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index f93062b46d0..eed4d6d5fbc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -212,13 +212,15 @@ vec4_vs_visitor::emit_thread_end()
 
 
 vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
                                  struct brw_vs_compile *vs_compile,
                                  struct brw_vs_prog_data *vs_prog_data,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
                                  int shader_time_index,
                                  bool use_legacy_snorm_formula)
-   : vec4_visitor(compiler, &vs_compile->base, &vs_compile->vp->program.Base,
+   : vec4_visitor(compiler, log_data,
+                  &vs_compile->base, &vs_compile->vp->program.Base,
                   &vs_compile->key.base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 61f9b006a58..a15cba0b42c 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -91,6 +91,7 @@ class vec4_vs_visitor : public vec4_visitor
 {
 public:
    vec4_vs_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
                    struct brw_vs_compile *vs_compile,
                    struct brw_vs_prog_data *vs_prog_data,
                    struct gl_shader_program *prog,
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index 27254ebb727..862d0eaacf0 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -36,12 +36,14 @@ class gen6_gs_visitor : public vec4_gs_visitor
 {
 public:
    gen6_gs_visitor(const struct brw_compiler *comp,
+                   void *log_data,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index) :
-      vec4_gs_visitor(comp, c, prog, mem_ctx, no_spills, shader_time_index) {}
+      vec4_gs_visitor(comp, log_data, c, prog, mem_ctx, no_spills,
+                      shader_time_index) {}
 
 protected:
    virtual void assign_binding_table_offsets();

From dc776ffb900b21421158ef8efbd675bdd47593bc Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 20:45:47 -0700
Subject: [PATCH 0245/1208] i965/vec4: Move perf_debug about register spilling
 into the visitor.

This patch makes us only issue the performance warning about register
spilling if we actually spilled registers.  We also use scratch space
for indirect addressing and the like.

This is basically commit c51163b0cf7aff0375b1a5ea4cb3da9d9e164044 for
the vec4 backend.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_gs.c     |  4 ----
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 16 +++++++++++++---
 src/mesa/drivers/dri/i965/brw_vs.c     |  4 ----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 52c73031a3c..7f947e0eb25 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -268,10 +268,6 @@ brw_codegen_gs_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    if (c.base.last_scratch) {
-      perf_debug("Geometry shader triggered register spilling.  "
-                 "Try reducing the number of live vec4 values to "
-                 "improve performance.\n");
-
       c.prog_data.base.base.total_scratch
          = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 2a565645ce2..60f73e2c82c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1827,9 +1827,19 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
       }
    }
 
-   while (!reg_allocate()) {
-      if (failed)
-         return false;
+   bool allocated_without_spills = reg_allocate();
+
+   if (!allocated_without_spills) {
+      compiler->shader_perf_log(log_data,
+                                "%s shader triggered register spilling.  "
+                                "Try reducing the number of live vec4 values "
+                                "to improve performance.\n",
+                                stage_name);
+
+      while (!reg_allocate()) {
+         if (failed)
+            return false;
+      }
    }
 
    opt_schedule_instructions();
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 6e9848fb1e9..edbcbcff818 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -196,10 +196,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    if (c.base.last_scratch) {
-      perf_debug("Vertex shader triggered register spilling.  "
-                 "Try reducing the number of live vec4 values to "
-                 "improve performance.\n");
-
       prog_data.base.base.total_scratch
          = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
 

From 8524deb8c8fc37abc2cb2717be64a533746a92f9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 20:55:25 -0700
Subject: [PATCH 0246/1208] i965/vec4: Move total_scratch calculation into the
 visitor.

This is more consistent with how we do it in the FS backend, and reduces
a tiny bit of duplication.  It'll also allow for a bit more tidying.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_gs.c     | 5 +----
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 7 +++++--
 src/mesa/drivers/dri/i965/brw_vs.c     | 5 +----
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 7f947e0eb25..9c59c8a0dfc 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -267,10 +267,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (c.base.last_scratch) {
-      c.prog_data.base.base.total_scratch
-         = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
-
+   if (c.prog_data.base.base.total_scratch) {
       brw_get_scratch_bo(brw, &stage_state->scratch_bo,
 			 c.prog_data.base.base.total_scratch *
                          brw->max_gs_threads);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 60f73e2c82c..7b367ec036c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1846,6 +1846,11 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
 
    opt_set_dependency_control();
 
+   if (c->last_scratch > 0) {
+      prog_data->base.total_scratch =
+         brw_get_scratch_size(c->last_scratch * REG_SIZE);
+   }
+
    /* If any state parameters were appended, then ParameterValues could have
     * been realloced, in which case the driver uniform storage set up by
     * _mesa_associate_uniform_storage() would point to freed memory.  Make
@@ -1943,8 +1948,6 @@ brw_vs_emit(struct brw_context *brw,
       }
       g.generate_code(v.cfg, 8);
       assembly = g.get_assembly(final_assembly_size);
-
-      c->base.last_scratch = v.last_scratch;
    }
 
    if (!assembly) {
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index edbcbcff818..ee3f6640938 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -195,10 +195,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (c.base.last_scratch) {
-      prog_data.base.base.total_scratch
-         = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
-
+   if (prog_data.base.base.total_scratch) {
       brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
 			 prog_data.base.base.total_scratch *
                          brw->max_vs_threads);

From 13372a0ce746cde6fa6e0aa3c5130e4227f123e0 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 21:02:15 -0700
Subject: [PATCH 0247/1208] i965/vec4: Move c->last_scratch into vec4_visitor.

Nothing outside of vec4_visitor uses it, so we may as well keep it
internal.

Commit db9c915abcc5ad78d2d11d0e732f04cc94631350 for the vec4 backend.

(The empty class will be going away soon.)

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp          |  4 ++--
 src/mesa/drivers/dri/i965/brw_vec4.h            |  8 ++------
 .../drivers/dri/i965/brw_vec4_gs_visitor.cpp    |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h |  1 -
 .../drivers/dri/i965/brw_vec4_reg_allocate.cpp  |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp  | 17 ++++++++---------
 .../drivers/dri/i965/brw_vec4_vs_visitor.cpp    |  2 +-
 src/mesa/drivers/dri/i965/brw_vs.h              |  1 -
 8 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 7b367ec036c..e5db2687aa0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1846,9 +1846,9 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
 
    opt_set_dependency_control();
 
-   if (c->last_scratch > 0) {
+   if (last_scratch > 0) {
       prog_data->base.total_scratch =
-         brw_get_scratch_size(c->last_scratch * REG_SIZE);
+         brw_get_scratch_size(last_scratch * REG_SIZE);
    }
 
    /* If any state parameters were appended, then ParameterValues could have
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 043557b0c9d..36436511df0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -47,10 +47,6 @@ extern "C" {
 #include "glsl/ir.h"
 
 
-struct brw_vec4_compile {
-   GLuint last_scratch; /**< measured in 32-byte (register size) units */
-};
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,7 +74,6 @@ class vec4_visitor : public backend_shader, public ir_visitor
 public:
    vec4_visitor(const struct brw_compiler *compiler,
                 void *log_data,
-                struct brw_vec4_compile *c,
                 struct gl_program *prog,
                 const struct brw_vue_prog_key *key,
                 struct brw_vue_prog_data *prog_data,
@@ -104,7 +99,6 @@ public:
       return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
    }
 
-   struct brw_vec4_compile * const c;
    const struct brw_vue_prog_key * const key;
    struct brw_vue_prog_data * const prog_data;
    unsigned int sanity_param_count;
@@ -412,6 +406,8 @@ private:
    const bool no_spills;
 
    int shader_time_index;
+
+   unsigned last_scratch; /**< measured in 32-byte (register size) units */
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 80c59aff6ce..2f948ee73c0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -42,7 +42,7 @@ vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  bool no_spills,
                                  int shader_time_index)
    : vec4_visitor(compiler, log_data,
-                  &c->base, &c->gp->program.Base, &c->key.base,
+                  &c->gp->program.Base, &c->key.base,
                   &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
                   no_spills, shader_time_index),
      c(c)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index e48d8610574..e51399d620d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -37,7 +37,6 @@
  */
 struct brw_gs_compile
 {
-   struct brw_vec4_compile base;
    struct brw_gs_prog_key key;
    struct brw_gs_prog_data prog_data;
    struct brw_vue_map input_vue_map;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 555c42e2f24..cd89edd64de 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -339,7 +339,7 @@ void
 vec4_visitor::spill_reg(int spill_reg_nr)
 {
    assert(alloc.sizes[spill_reg_nr] == 1);
-   unsigned int spill_offset = c->last_scratch++;
+   unsigned int spill_offset = last_scratch++;
 
    /* Generate spill/unspill instructions for the objects being spilled. */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 21d9b010b44..c9c26611d95 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -3484,16 +3484,16 @@ vec4_visitor::move_grf_array_access_to_scratch()
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       if (inst->dst.file == GRF && inst->dst.reladdr) {
          if (scratch_loc[inst->dst.reg] == -1) {
-            scratch_loc[inst->dst.reg] = c->last_scratch;
-            c->last_scratch += this->alloc.sizes[inst->dst.reg];
+            scratch_loc[inst->dst.reg] = last_scratch;
+            last_scratch += this->alloc.sizes[inst->dst.reg];
          }
 
          for (src_reg *iter = inst->dst.reladdr;
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = c->last_scratch;
-               c->last_scratch += this->alloc.sizes[iter->reg];
+               scratch_loc[iter->reg] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->reg];
             }
          }
       }
@@ -3503,8 +3503,8 @@ vec4_visitor::move_grf_array_access_to_scratch()
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = c->last_scratch;
-               c->last_scratch += this->alloc.sizes[iter->reg];
+               scratch_loc[iter->reg] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->reg];
             }
          }
       }
@@ -3679,7 +3679,6 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
 
 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            void *log_data,
-                           struct brw_vec4_compile *c,
                            struct gl_program *prog,
                            const struct brw_vue_prog_key *key,
                            struct brw_vue_prog_data *prog_data,
@@ -3690,7 +3689,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx,
                     shader_prog, prog, &prog_data->base, stage),
-     c(c),
      key(key),
      prog_data(prog_data),
      sanity_param_count(0),
@@ -3698,7 +3696,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
      first_non_payload_grf(0),
      need_all_constants_in_pull_buffer(false),
      no_spills(no_spills),
-     shader_time_index(shader_time_index)
+     shader_time_index(shader_time_index),
+     last_scratch(0)
 {
    this->failed = false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index eed4d6d5fbc..35b601a81aa 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -220,7 +220,7 @@ vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
                                  int shader_time_index,
                                  bool use_legacy_snorm_formula)
    : vec4_visitor(compiler, log_data,
-                  &vs_compile->base, &vs_compile->vp->program.Base,
+                  &vs_compile->vp->program.Base,
                   &vs_compile->key.base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index a15cba0b42c..3a131b0767d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -51,7 +51,6 @@
 #define BRW_ATTRIB_WA_SCALE         64  /* interpret as scaled in shader */
 
 struct brw_vs_compile {
-   struct brw_vec4_compile base;
    struct brw_vs_prog_key key;
 
    struct brw_vertex_program *vp;

From 64390967c1abc326875e495f233afec6e685db72 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 29 Jun 2015 22:07:37 -0700
Subject: [PATCH 0248/1208] i965/vs: Remove 'c'/vs_compile from
 vec4_vs_visitor.

At this point, the brw_vs_compile structure only contains the key and
gl_vertex_program pointer.  We may as well pass and store them directly;
it's simpler and more convenient (key-> instead of vs_compile->key...).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp            |  4 ++--
 src/mesa/drivers/dri/i965/brw_vec4_vp.cpp         |  9 +++------
 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp | 11 ++++++-----
 src/mesa/drivers/dri/i965/brw_vs.h                |  6 ++++--
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index e5db2687aa0..42d014cb1d8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1953,8 +1953,8 @@ brw_vs_emit(struct brw_context *brw,
    if (!assembly) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-      vec4_vs_visitor v(brw->intelScreen->compiler, brw,
-                        c, prog_data, prog, mem_ctx, st_index,
+      vec4_vs_visitor v(brw->intelScreen->compiler, brw, &c->key, prog_data,
+                        &c->vp->program, prog, mem_ctx, st_index,
                         !_mesa_is_gles3(&brw->ctx));
       if (!v.run(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
index dcbd2405078..d1a72d787e7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
@@ -394,8 +394,7 @@ vec4_vs_visitor::emit_program_code()
     * pull constants.  Do that now.
     */
    if (this->need_all_constants_in_pull_buffer) {
-      const struct gl_program_parameter_list *params =
-         vs_compile->vp->program.Base.Parameters;
+      const struct gl_program_parameter_list *params = vp->Base.Parameters;
       unsigned i;
       for (i = 0; i < params->NumParameters * 4; i++) {
          stage_prog_data->pull_param[i] =
@@ -415,8 +414,7 @@ vec4_vs_visitor::setup_vp_regs()
       vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type);
 
    /* PROGRAM_STATE_VAR etc. */
-   struct gl_program_parameter_list *plist =
-      vs_compile->vp->program.Base.Parameters;
+   struct gl_program_parameter_list *plist = vp->Base.Parameters;
    for (unsigned p = 0; p < plist->NumParameters; p++) {
       unsigned components = plist->Parameters[p].Size;
 
@@ -486,8 +484,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
 src_reg
 vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
 {
-   struct gl_program_parameter_list *plist =
-      vs_compile->vp->program.Base.Parameters;
+   struct gl_program_parameter_list *plist = vp->Base.Parameters;
 
    src_reg result;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 35b601a81aa..b7ec8b9c169 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -36,7 +36,7 @@ vec4_vs_visitor::emit_prolog()
 
    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
-         uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
+         uint8_t wa_flags = key->gl_attrib_wa_flags[i];
          dst_reg reg(ATTR, i);
          dst_reg reg_d = reg;
          reg_d.type = BRW_REGISTER_TYPE_D;
@@ -213,20 +213,21 @@ vec4_vs_visitor::emit_thread_end()
 
 vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
-                                 struct brw_vs_compile *vs_compile,
+                                 const struct brw_vs_prog_key *key,
                                  struct brw_vs_prog_data *vs_prog_data,
+                                 struct gl_vertex_program *vp,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
                                  int shader_time_index,
                                  bool use_legacy_snorm_formula)
    : vec4_visitor(compiler, log_data,
-                  &vs_compile->vp->program.Base,
-                  &vs_compile->key.base, &vs_prog_data->base, prog,
+                  &vp->Base, &key->base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
                   shader_time_index),
-     vs_compile(vs_compile),
+     key(key),
      vs_prog_data(vs_prog_data),
+     vp(vp),
      use_legacy_snorm_formula(use_legacy_snorm_formula)
 {
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 3a131b0767d..76bd5f4aae4 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -91,8 +91,9 @@ class vec4_vs_visitor : public vec4_visitor
 public:
    vec4_vs_visitor(const struct brw_compiler *compiler,
                    void *log_data,
-                   struct brw_vs_compile *vs_compile,
+                   const struct brw_vs_prog_key *key,
                    struct brw_vs_prog_data *vs_prog_data,
+                   struct gl_vertex_program *vp,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
                    int shader_time_index,
@@ -113,8 +114,9 @@ private:
    dst_reg get_vp_dst_reg(const prog_dst_register &dst);
    src_reg get_vp_src_reg(const prog_src_register &src);
 
-   struct brw_vs_compile * const vs_compile;
+   const struct brw_vs_prog_key *const key;
    struct brw_vs_prog_data * const vs_prog_data;
+   struct gl_vertex_program *const vp;
    src_reg *vp_temp_regs;
    src_reg vp_addr_reg;
 

From f12302b89836a24255674a251f7a6902b4e9af7c Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 28 Jun 2015 21:16:29 -0700
Subject: [PATCH 0249/1208] i965/vs: Get rid of brw_vs_compile completely.

After tearing it out another level or two, and just passing the key and
vp directly, we can finally remove this struct.  It also eliminates a
pointless memcpy() of the key.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 38 +++++++++++++-------------
 src/mesa/drivers/dri/i965/brw_vs.c     | 20 ++++++--------
 src/mesa/drivers/dri/i965/brw_vs.h     | 13 +++------
 3 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 42d014cb1d8..53270fb6eba 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1872,10 +1872,11 @@ extern "C" {
  */
 const unsigned *
 brw_vs_emit(struct brw_context *brw,
-            struct gl_shader_program *prog,
-            struct brw_vs_compile *c,
-            struct brw_vs_prog_data *prog_data,
             void *mem_ctx,
+            const struct brw_vs_prog_key *key,
+            struct brw_vs_prog_data *prog_data,
+            struct gl_vertex_program *vp,
+            struct gl_shader_program *prog,
             unsigned *final_assembly_size)
 {
    bool start_busy = false;
@@ -1894,29 +1895,28 @@ brw_vs_emit(struct brw_context *brw,
 
    int st_index = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
-      st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base,
-                                           ST_VS);
+      st_index = brw_get_shader_time_index(brw, prog, &vp->Base, ST_VS);
 
    if (unlikely(INTEL_DEBUG & DEBUG_VS))
-      brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
+      brw_dump_ir("vertex", prog, &shader->base, &vp->Base);
 
    if (brw->intelScreen->compiler->scalar_vs) {
-      if (!c->vp->program.Base.nir) {
+      if (!vp->Base.nir) {
          /* Normally we generate NIR in LinkShader() or
           * ProgramStringNotify(), but Mesa's fixed-function vertex program
           * handling doesn't notify the driver at all.  Just do it here, at
           * the last minute, even though it's lame.
           */
-         assert(c->vp->program.Base.Id == 0 && prog == NULL);
-         c->vp->program.Base.nir =
-            brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
+         assert(vp->Base.Id == 0 && prog == NULL);
+         vp->Base.nir =
+            brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX);
       }
 
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_visitor v(brw->intelScreen->compiler, brw,
-                   mem_ctx, MESA_SHADER_VERTEX, &c->key,
-                   &prog_data->base.base, prog, &c->vp->program.Base,
+                   mem_ctx, MESA_SHADER_VERTEX, key,
+                   &prog_data->base.base, prog, &vp->Base,
                    8, st_index);
       if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
@@ -1931,8 +1931,8 @@ brw_vs_emit(struct brw_context *brw,
       }
 
       fs_generator g(brw->intelScreen->compiler, brw,
-                     mem_ctx, (void *) &c->key, &prog_data->base.base,
-                     &c->vp->program.Base, v.promoted_constants,
+                     mem_ctx, (void *) key, &prog_data->base.base,
+                     &vp->Base, v.promoted_constants,
                      v.runtime_check_aads_emit, "VS");
       if (INTEL_DEBUG & DEBUG_VS) {
          char *name;
@@ -1942,7 +1942,7 @@ brw_vs_emit(struct brw_context *brw,
                                    prog->Name);
          } else {
             name = ralloc_asprintf(mem_ctx, "vertex program %d",
-                                   c->vp->program.Base.Id);
+                                   vp->Base.Id);
          }
          g.enable_debug(name);
       }
@@ -1953,8 +1953,8 @@ brw_vs_emit(struct brw_context *brw,
    if (!assembly) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-      vec4_vs_visitor v(brw->intelScreen->compiler, brw, &c->key, prog_data,
-                        &c->vp->program, prog, mem_ctx, st_index,
+      vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data,
+                        vp, prog, mem_ctx, st_index,
                         !_mesa_is_gles3(&brw->ctx));
       if (!v.run(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
@@ -1969,14 +1969,14 @@ brw_vs_emit(struct brw_context *brw,
       }
 
       vec4_generator g(brw->intelScreen->compiler, brw,
-                       prog, &c->vp->program.Base, &prog_data->base,
+                       prog, &vp->Base, &prog_data->base,
                        mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
       assembly = g.generate_assembly(v.cfg, final_assembly_size);
    }
 
    if (unlikely(brw->perf_debug) && shader) {
       if (shader->compiled_once) {
-         brw_vs_debug_recompile(brw, prog, &c->key);
+         brw_vs_debug_recompile(brw, prog, key);
       }
       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
          perf_debug("VS compile took %.03f ms and stalled the GPU\n",
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index ee3f6640938..2b9b005c782 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -94,7 +94,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
 {
    GLuint program_size;
    const GLuint *program;
-   struct brw_vs_compile c;
    struct brw_vs_prog_data prog_data;
    struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
    void *mem_ctx;
@@ -104,8 +103,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
    if (prog)
       vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
-   memset(&c, 0, sizeof(c));
-   memcpy(&c.key, key, sizeof(*key));
    memset(&prog_data, 0, sizeof(prog_data));
 
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
@@ -114,8 +111,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    mem_ctx = ralloc_context(NULL);
 
-   c.vp = vp;
-
    /* Allocate the references to the uniforms that will end up in the
     * prog_data associated with the compiled program, and which will be freed
     * by the state cache.
@@ -134,7 +129,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
     * planes as uniforms.
     */
-   param_count += c.key.base.nr_userclip_plane_consts * 4;
+   param_count += key->base.nr_userclip_plane_consts * 4;
 
    stage_prog_data->param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
@@ -145,7 +140,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
    prog_data.inputs_read = vp->program.Base.InputsRead;
 
-   if (c.key.copy_edgeflag) {
+   if (key->copy_edgeflag) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
       prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
    }
@@ -158,7 +153,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
        * coords, which would be a pain to handle.
        */
       for (i = 0; i < 8; i++) {
-         if (c.key.point_coord_replace & (1 << i))
+         if (key->point_coord_replace & (1 << i))
             outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
       }
 
@@ -173,7 +168,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
     * distance varying slots whenever clipping is enabled, even if the vertex
     * shader doesn't write to gl_ClipDistance.
     */
-   if (c.key.base.userclip_active) {
+   if (key->base.userclip_active) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
    }
@@ -182,13 +177,14 @@ brw_codegen_vs_prog(struct brw_context *brw,
                        &prog_data.base.vue_map, outputs_written);
 
    if (0) {
-      _mesa_fprint_program_opt(stderr, &c.vp->program.Base, PROG_PRINT_DEBUG,
+      _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
 			       true);
    }
 
    /* Emit GEN4 code.
     */
-   program = brw_vs_emit(brw, prog, &c, &prog_data, mem_ctx, &program_size);
+   program = brw_vs_emit(brw, mem_ctx, key, &prog_data,
+                         &vp->program, prog, &program_size);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -202,7 +198,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
-		    &c.key, sizeof(c.key),
+		    key, sizeof(struct brw_vs_prog_key),
 		    program, program_size,
 		    &prog_data, sizeof(prog_data),
 		    &brw->vs.base.prog_offset, &brw->vs.prog_data);
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 76bd5f4aae4..88927eeb3eb 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -50,21 +50,16 @@
 #define BRW_ATTRIB_WA_SIGN          32  /* interpret as signed in shader */
 #define BRW_ATTRIB_WA_SCALE         64  /* interpret as scaled in shader */
 
-struct brw_vs_compile {
-   struct brw_vs_prog_key key;
-
-   struct brw_vertex_program *vp;
-};
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 const unsigned *brw_vs_emit(struct brw_context *brw,
-                            struct gl_shader_program *prog,
-                            struct brw_vs_compile *c,
-                            struct brw_vs_prog_data *prog_data,
                             void *mem_ctx,
+                            const struct brw_vs_prog_key *key,
+                            struct brw_vs_prog_data *prog_data,
+                            struct gl_vertex_program *vp,
+                            struct gl_shader_program *shader_prog,
                             unsigned *program_size);
 void brw_vs_debug_recompile(struct brw_context *brw,
                             struct gl_shader_program *prog,

From 0a8af6361eecaba0f34a668328746924b61caa6a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 8 Jul 2015 13:30:22 -0400
Subject: [PATCH 0250/1208] xa: don't leak fences

XA was never unref'ing last_fence in the various call paths to
pipe->flush().  Add this to xa_context_flush() and update the other
open-coded calls to pipe->flush() to use xa_context_flush() instead.

This fixes a memory leak reported with xf86-video-freedreno.

Reported-by: Nicolas Dechesne <nicolas.dechesne@linaro.org>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/state_trackers/xa/xa_context.c | 6 +++++-
 src/gallium/state_trackers/xa/xa_tracker.c | 2 +-
 src/gallium/state_trackers/xa/xa_yuv.c     | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gallium/state_trackers/xa/xa_context.c b/src/gallium/state_trackers/xa/xa_context.c
index fd49c82a559..ebfb290af13 100644
--- a/src/gallium/state_trackers/xa/xa_context.c
+++ b/src/gallium/state_trackers/xa/xa_context.c
@@ -37,7 +37,11 @@
 XA_EXPORT void
 xa_context_flush(struct xa_context *ctx)
 {
-	ctx->pipe->flush(ctx->pipe, &ctx->last_fence, 0);
+    if (ctx->last_fence) {
+        struct pipe_screen *screen = ctx->xa->screen;
+        screen->fence_reference(screen, &ctx->last_fence, NULL);
+    }
+    ctx->pipe->flush(ctx->pipe, &ctx->last_fence, 0);
 }
 
 XA_EXPORT struct xa_context *
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index f69ac8edf27..59e8108b1b5 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -461,7 +461,7 @@ xa_surface_redefine(struct xa_surface *srf,
 			xa_min(save_height, template->height0), &src_box);
 	pipe->resource_copy_region(pipe, texture,
 				   0, 0, 0, 0, srf->tex, 0, &src_box);
-	pipe->flush(pipe, &xa->default_ctx->last_fence, 0);
+	xa_context_flush(xa->default_ctx);
     }
 
     pipe_resource_reference(&srf->tex, texture);
diff --git a/src/gallium/state_trackers/xa/xa_yuv.c b/src/gallium/state_trackers/xa/xa_yuv.c
index 15196392ac7..97a1833ff15 100644
--- a/src/gallium/state_trackers/xa/xa_yuv.c
+++ b/src/gallium/state_trackers/xa/xa_yuv.c
@@ -154,7 +154,7 @@ xa_yuv_planar_blit(struct xa_context *r,
 	box++;
     }
 
-    r->pipe->flush(r->pipe, &r->last_fence, 0);
+    xa_context_flush(r);
 
     xa_ctx_sampler_views_destroy(r);
     xa_ctx_srf_destroy(r);

From f60354ee72fdee988fd604994e8b8c8d75fe78be Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 8 Jul 2015 15:00:51 -0400
Subject: [PATCH 0251/1208] gallium: clarify reference counting for fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Nowhere was it spelled out that the state tracker may expect the pipe
driver to unref the old fence.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_context.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index c2eedf8e7c7..d2c2e4c8d14 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -361,8 +361,14 @@ struct pipe_context {
                         const void *clear_value,
                         int clear_value_size);
 
-   /** Flush draw commands
+   /**
+    * Flush draw commands
     *
+    * NOTE: use screen->fence_reference() (or equivalent) to transfer
+    * new fence ref to **fence, to ensure that previous fence is unref'd
+    *
+    * \param fence  if not NULL, an old fence to unref and transfer a
+    *    new fence reference to
     * \param flags  bitfield of enum pipe_flush_flags values.
     */
    void (*flush)(struct pipe_context *pipe,

From 7e0a26defe65dad7ffc8e7a95b5577be51feb2bc Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 8 Jul 2015 14:48:01 -0400
Subject: [PATCH 0252/1208] freedreno: unref old fence

Some, but not all, state trackers will explicitly unref (and set to
NULL) the previous *fence before calling pipe->flush().  So driver
should use fence_ref() which will unref the old fence if not NULL.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/freedreno_context.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 668ef3629bf..127fb5f5aa0 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -144,8 +144,10 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
 {
 	fd_context_render(pctx);
 
-	if (fence)
+	if (fence) {
+		fd_screen_fence_ref(pctx->screen, fence, NULL);
 		*fence = fd_fence_create(pctx);
+	}
 }
 
 void

From 749dced4b363963b2230a18b0776fa92653116b8 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 8 Jul 2015 14:51:10 -0400
Subject: [PATCH 0253/1208] ilo: unref old fence

Some, but not all, state trackers will explicitly unref (and set to
NULL) the previous *fence before calling pipe->flush().  So driver
should use fence_ref() which will unref the old fence if not NULL.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Acked-by: Chia-I Wu <olvaffe@gmail.com>
---
 src/gallium/drivers/ilo/ilo_context.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 3d5c7b636a8..b9a16aab81d 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -62,6 +62,8 @@ ilo_flush(struct pipe_context *pipe,
          (flags & PIPE_FLUSH_END_OF_FRAME) ? "frame end" : "user request");
 
    if (f) {
+      struct pipe_screen *screen = pipe->screen;
+      screen->fence_reference(screen, f, NULL);
       *f = ilo_screen_fence_create(pipe->screen, ilo->cp->last_submitted_bo);
    }
 }

From ab3ba21f979605b90b2fb44482138732b42514b0 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 8 Jul 2015 14:51:46 -0400
Subject: [PATCH 0254/1208] vc4: unref old fence

Some, but not all, state trackers will explicitly unref (and set to
NULL) the previous *fence before calling pipe->flush().  So driver
should use fence_ref() which will unref the old fence if not NULL.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Acked-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/vc4_context.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 630f8e68896..316598f0a9d 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -103,8 +103,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
         vc4_flush(pctx);
 
         if (fence) {
+                struct pipe_screen *screen = pctx->screen;
                 struct vc4_fence *f = vc4_fence_create(vc4->screen,
                                                        vc4->last_emit_seqno);
+                screen->fence_reference(screen, fence, NULL);
                 *fence = (struct pipe_fence_handle *)f;
         }
 }

From 422296e38d04789cc4ca336b46979b44abd19b5d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 2 Jul 2015 18:15:43 -0400
Subject: [PATCH 0255/1208] freedreno: fix crash in fd_invalidate_resource()

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/freedreno_resource.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 95f79df565e..20495779d50 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -72,11 +72,11 @@ fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 
 	/* Textures */
 	for (i = 0; i < ctx->verttex.num_textures && !(ctx->dirty & FD_DIRTY_VERTTEX); i++) {
-		if (ctx->verttex.textures[i]->texture == prsc)
+		if (ctx->verttex.textures[i] && (ctx->verttex.textures[i]->texture == prsc))
 			ctx->dirty |= FD_DIRTY_VERTTEX;
 	}
 	for (i = 0; i < ctx->fragtex.num_textures && !(ctx->dirty & FD_DIRTY_FRAGTEX); i++) {
-		if (ctx->fragtex.textures[i]->texture == prsc)
+		if (ctx->fragtex.textures[i] && (ctx->fragtex.textures[i]->texture == prsc))
 			ctx->dirty |= FD_DIRTY_FRAGTEX;
 	}
 }

From 65b2ae510bb07b75f583ecedfd59766621e1cb43 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 5 Jul 2015 18:23:25 -0400
Subject: [PATCH 0256/1208] freedreno/ir3: shader-db traces

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_screen.c      |  1 +
 .../drivers/freedreno/freedreno_util.h        |  1 +
 .../drivers/freedreno/ir3/ir3_cmdline.c       | 10 +++--
 .../drivers/freedreno/ir3/ir3_compiler.h      |  1 +
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |  4 +-
 .../drivers/freedreno/ir3/ir3_shader.c        | 38 ++++++++++++++++++-
 .../drivers/freedreno/ir3/ir3_shader.h        | 20 ++++++++++
 7 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 473823f1390..7b1b4e357c4 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -69,6 +69,7 @@ static const struct debug_named_value debug_options[] = {
 		{"nobin",     FD_DBG_NOBIN,  "Disable hw binning"},
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
 		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"},
+		{"shaderdb",  FD_DBG_SHADERDB, "Enable shaderdb output"},
 		DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index deb0e602ce2..1b78763c58e 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -64,6 +64,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_NOBIN    0x0100
 #define FD_DBG_OPTMSGS  0x0200
 #define FD_DBG_GLSL120  0x0400
+#define FD_DBG_SHADERDB 0x0800
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index 2b89fb442b4..d2aabe1140d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -46,7 +46,7 @@
 static void dump_info(struct ir3_shader_variant *so, const char *str)
 {
 	uint32_t *bin;
-	const char *type = (so->type == SHADER_VERTEX) ? "VERT" : "FRAG";
+	const char *type = ir3_shader_stage(so->shader);
 	// TODO make gpu_id configurable on cmdline
 	bin = ir3_shader_assemble(so, 320);
 	debug_printf("; %s: %s\n", type, str);
@@ -106,6 +106,7 @@ int main(int argc, char **argv)
 	struct tgsi_parse_context parse;
 	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
+	struct ir3_shader s;
 	struct ir3_shader_key key = {};
 	const char *info;
 	void *ptr;
@@ -182,6 +183,7 @@ int main(int argc, char **argv)
 
 	memset(&v, 0, sizeof(v));
 	v.key = key;
+	v.shader = &s;
 
 	ret = read_file(filename, &ptr, &size);
 	if (ret) {
@@ -198,13 +200,13 @@ int main(int argc, char **argv)
 	tgsi_parse_init(&parse, toks);
 	switch (parse.FullHeader.Processor.Processor) {
 	case TGSI_PROCESSOR_FRAGMENT:
-		v.type = SHADER_FRAGMENT;
+		s.type = v.type = SHADER_FRAGMENT;
 		break;
 	case TGSI_PROCESSOR_VERTEX:
-		v.type = SHADER_VERTEX;
+		s.type = v.type = SHADER_VERTEX;
 		break;
 	case TGSI_PROCESSOR_COMPUTE:
-		v.type = SHADER_COMPUTE;
+		s.type = v.type = SHADER_COMPUTE;
 		break;
 	}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 86b1161d9cb..378f737924c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -36,6 +36,7 @@ struct ir3_ra_reg_set;
 struct ir3_compiler {
 	uint32_t gpu_id;
 	struct ir3_ra_reg_set *set;
+	uint32_t shader_count;
 };
 
 struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8d8d4434d9e..22885ff85f3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1983,7 +1983,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 	unsigned semantic_index = in->data.index;
 	unsigned n = in->data.driver_location;
 
-	DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
+	DBG("; in: %u:%u, len=%ux%u, loc=%u",
 			semantic_name, semantic_index, array_len,
 			ncomp, n);
 
@@ -2078,7 +2078,7 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 	unsigned n = out->data.driver_location;
 	unsigned comp = 0;
 
-	DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
+	DBG("; out: %u:%u, len=%ux%u, loc=%u",
 			semantic_name, semantic_index, array_len,
 			ncomp, n);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index bfcc2ca8a53..210dc298f4e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -147,6 +147,25 @@ assemble_variant(struct ir3_shader_variant *v)
 		ir3_shader_disasm(v, bin);
 	}
 
+	if (fd_mesa_debug & FD_DBG_SHADERDB) {
+		/* print generic shader info: */
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.instrs_count,
+				v->info.sizedwords);
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u half, %u full\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.max_half_reg + 1,
+				v->info.max_reg + 1);
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.max_const + 1,
+				v->constlen);
+	}
+
 	free(bin);
 
 	/* no need to keep the ir around beyond this point: */
@@ -164,6 +183,7 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	if (!v)
 		return NULL;
 
+	v->id = ++shader->variant_count;
 	v->shader = shader;
 	v->key = key;
 	v->type = shader->type;
@@ -258,9 +278,18 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
 {
 	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
 	shader->compiler = fd_context(pctx)->screen->compiler;
+	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
 	shader->tokens = tgsi_dup_tokens(tokens);
+	if (fd_mesa_debug & FD_DBG_SHADERDB) {
+		/* if shader-db run, create a standard variant immediately
+		 * (as otherwise nothing will trigger the shader to be
+		 * actually compiled)
+		 */
+		static struct ir3_shader_key key = {};
+		ir3_shader_variant(shader, key);
+	}
 	return shader;
 }
 
@@ -283,7 +312,7 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
 {
 	struct ir3 *ir = so->ir;
 	struct ir3_register *reg;
-	const char *type = (so->type == SHADER_VERTEX) ? "VERT" : "FRAG";
+	const char *type = ir3_shader_stage(so->shader);
 	uint8_t regid;
 	unsigned i;
 
@@ -348,11 +377,16 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
 	debug_printf("\n");
 
 	/* print generic shader info: */
-	debug_printf("; %s: %u instructions, %d half, %d full\n", type,
+	debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n",
+			type, so->shader->id, so->id,
 			so->info.instrs_count,
 			so->info.max_half_reg + 1,
 			so->info.max_reg + 1);
 
+	debug_printf("; %d const, %u constlen\n",
+			so->info.max_const + 1,
+			so->constlen);
+
 	/* print shader type specific info: */
 	switch (so->type) {
 	case SHADER_VERTEX:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 3cf3167e1cc..79b9f8a3eeb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -100,6 +100,9 @@ ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
 struct ir3_shader_variant {
 	struct fd_bo *bo;
 
+	/* variant id (for debug) */
+	uint32_t id;
+
 	struct ir3_shader_key key;
 
 	struct ir3_info info;
@@ -192,6 +195,10 @@ struct ir3_shader_variant {
 struct ir3_shader {
 	enum shader_t type;
 
+	/* shader id (for debug): */
+	uint32_t id;
+	uint32_t variant_count;
+
 	struct ir3_compiler *compiler;
 
 	struct pipe_context *pctx;
@@ -214,6 +221,19 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);
 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
 
+static inline const char *
+ir3_shader_stage(struct ir3_shader *shader)
+{
+	switch (shader->type) {
+	case SHADER_VERTEX:     return "VERT";
+	case SHADER_FRAGMENT:   return "FRAG";
+	case SHADER_COMPUTE:    return "CL";
+	default:
+		unreachable("invalid type");
+		return NULL;
+	}
+}
+
 /*
  * Helper/util:
  */

From a1a6f007823f203755fb54a1f3b7f53ae6cbfef0 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 5 Jul 2015 19:53:10 -0400
Subject: [PATCH 0257/1208] freedreno/ir3/ra: fix failed assert for a0/p0

The address and predicate register are special, they don't get assigned
in RA.  So do a better job of ignoring them rather than hitting later
asserts.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index de48ecfe280..eaf3b3c35e8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -469,6 +469,11 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 		if (instr->regs_count == 0)
 			continue;
+		/* couple special cases: */
+		if (writes_addr(instr) || writes_pred(instr)) {
+			id->cls = -1;
+			continue;
+		}
 		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
 		id->cls = size_to_class(id->sz, is_half(id->defn));
 	}

From e44845472a4e04e7b6a82ab6c768f9648729d7e9 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 5 Jul 2015 20:17:56 -0400
Subject: [PATCH 0258/1208] freedreno/ir3/sched: fixup new instr's block

If we split addr/pred, the original instruction could have originated
from a different block.  If we don't fixup the block ptr we hit asserts
later (in debug builds).

NOTE: perhaps we don't want to try to preserve addr/pred reg's across
block boundaries.. this at least needs some thought in case addr/pred
writes end up inside a conditional block..

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 0cb1589eb4d..2ee325518f7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -477,6 +477,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 			if (new_instr) {
 				list_del(&new_instr->node);
 				list_addtail(&new_instr->node, &unscheduled_list);
+				/* the original instr that wrote addr/pred may have
+				 * originated from a different block:
+				 */
+				new_instr->block = block;
 			}
 
 		} else {

From 2b7a54452fbb7e6436aa4ecc700cb2fe2f96ad86 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 9 Jul 2015 18:14:36 -0400
Subject: [PATCH 0259/1208] freedreno: update generated headers

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a2xx/a2xx.xml.h | 16 ++--
 .../drivers/freedreno/a2xx/fd2_context.c      |  4 +-
 src/gallium/drivers/freedreno/a3xx/a3xx.xml.h | 14 +--
 .../drivers/freedreno/a3xx/fd3_context.c      |  2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c |  4 +-
 .../drivers/freedreno/a3xx/fd3_query.c        |  2 +-
 src/gallium/drivers/freedreno/a4xx/a4xx.xml.h | 85 ++++++++++++++++---
 .../drivers/freedreno/a4xx/fd4_context.c      |  2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c |  6 +-
 .../drivers/freedreno/adreno_common.xml.h     | 16 ++--
 .../drivers/freedreno/adreno_pm4.xml.h        | 18 ++--
 11 files changed, 114 insertions(+), 55 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index f4f6b94c1ea..a4306314bfe 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
 
-Copyright (C) 2013-2014 by the following authors:
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
index a0bf01ffd1f..6089ebc1516 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
@@ -67,7 +67,7 @@ create_solid_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A2XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_PSIZE,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
@@ -77,7 +77,7 @@ static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
 };
 
 static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A2XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_PSIZE,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_TRIANGLES]      = DI_PT_TRILIST,
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index a3bc74eda85..e99e317b1cc 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 7e5a99ea571..8441898382b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -88,7 +88,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A3XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 29f4bae93fa..070ed43a279 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -82,8 +82,8 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			info->restart_index : 0xffffffff);
 
 	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
-		info->mode == PIPE_PRIM_POINTS)
-		primtype = DI_PT_POINTLIST_A2XX;
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
 
 	fd_draw_emit(ctx, ring,
 			primtype,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index 7abab543427..8fc0a0d4229 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -64,7 +64,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 
 	OUT_PKT3(ring, CP_DRAW_INDX, 3);
 	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, DRAW(DI_PT_POINTLIST_A2XX, DI_SRC_SEL_AUTO_INDEX,
+	OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
 						INDEX_SIZE_IGN, USE_VISIBILITY, 0));
 	OUT_RING(ring, 0);             /* NumIndices */
 
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 0e7d3cf6db1..b53badc0bdf 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -570,6 +570,15 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
 	return ((val) << A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT) & A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK;
 }
 
+#define REG_A4XX_RB_SAMPLE_COUNT_CONTROL			0x000020fa
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_COPY			0x00000002
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK			0xfffffffc
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT		2
+static inline uint32_t A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR(uint32_t val)
+{
+	return ((val >> 2) << A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT) & A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK;
+}
+
 #define REG_A4XX_RB_RENDER_COMPONENTS				0x000020fb
 #define A4XX_RB_RENDER_COMPONENTS_RT0__MASK			0x0000000f
 #define A4XX_RB_RENDER_COMPONENTS_RT0__SHIFT			0
@@ -1167,6 +1176,8 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_VS_STATUS					0x00000ec0
 
+#define REG_A4XX_SP_MODE_CONTROL				0x00000ec3
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11				0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG					0x000022c0
@@ -1432,6 +1443,20 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
 
+#define REG_A4XX_SP_CS_CTRL_REG0				0x00002300
+
+#define REG_A4XX_SP_CS_OBJ_OFFSET_REG				0x00002301
+
+#define REG_A4XX_SP_CS_OBJ_START				0x00002302
+
+#define REG_A4XX_SP_CS_PVT_MEM_PARAM				0x00002303
+
+#define REG_A4XX_SP_CS_PVT_MEM_ADDR				0x00002304
+
+#define REG_A4XX_SP_CS_PVT_MEM_SIZE				0x00002305
+
+#define REG_A4XX_SP_CS_LENGTH_REG				0x00002306
+
 #define REG_A4XX_SP_HS_OBJ_OFFSET_REG				0x0000230d
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1758,6 +1783,8 @@ static inline uint32_t A4XX_VFD_DECODE_INSTR_SHIFTCNT(uint32_t val)
 
 #define REG_A4XX_TPL1_DEBUG_ECO_CONTROL				0x00000f00
 
+#define REG_A4XX_TPL1_TP_MODE_CONTROL				0x00000f03
+
 #define REG_A4XX_TPL1_PERFCTR_TP_SEL_7				0x00000f0b
 
 #define REG_A4XX_TPL1_TP_TEX_OFFSET				0x00002380
@@ -1800,6 +1827,10 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
 
 #define REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR		0x000023a1
 
+#define REG_A4XX_TPL1_TP_CS_BORDER_COLOR_BASE_ADDR		0x000023a4
+
+#define REG_A4XX_TPL1_TP_CS_SAMPLER_BASE_ADDR			0x000023a5
+
 #define REG_A4XX_TPL1_TP_CS_TEXMEMOBJ_BASE_ADDR			0x000023a6
 
 #define REG_A4XX_GRAS_TSE_STATUS				0x00000c80
@@ -2078,6 +2109,8 @@ static inline uint32_t A4XX_GRAS_SC_EXTENT_WINDOW_TL_Y(uint32_t val)
 
 #define REG_A4XX_HLSQ_DEBUG_ECO_CONTROL				0x00000e04
 
+#define REG_A4XX_HLSQ_MODE_CONTROL				0x00000e05
+
 #define REG_A4XX_HLSQ_PERF_PIPE_MASK				0x00000e0e
 
 #define REG_A4XX_HLSQ_CONTROL_0_REG				0x000023c0
@@ -2158,6 +2191,8 @@ static inline uint32_t A4XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 	return ((val) << A4XX_HLSQ_CONTROL_3_REG_REGID__SHIFT) & A4XX_HLSQ_CONTROL_3_REG_REGID__MASK;
 }
 
+#define REG_A4XX_HLSQ_CONTROL_4_REG				0x000023c4
+
 #define REG_A4XX_HLSQ_VS_CONTROL_REG				0x000023c5
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK		0x000000ff
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT		0
@@ -2293,6 +2328,36 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 	return ((val) << A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__SHIFT) & A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__MASK;
 }
 
+#define REG_A4XX_HLSQ_CS_CONTROL				0x000023ca
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_0				0x000023cd
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_1				0x000023ce
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_2				0x000023cf
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_3				0x000023d0
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_4				0x000023d1
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_5				0x000023d2
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_6				0x000023d3
+
+#define REG_A4XX_HLSQ_CL_CONTROL_0				0x000023d4
+
+#define REG_A4XX_HLSQ_CL_CONTROL_1				0x000023d5
+
+#define REG_A4XX_HLSQ_CL_KERNEL_CONST				0x000023d6
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_X				0x000023d7
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Y				0x000023d8
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Z				0x000023d9
+
+#define REG_A4XX_HLSQ_CL_WG_OFFSET				0x000023da
+
 #define REG_A4XX_HLSQ_UPDATE_CONTROL				0x000023db
 
 #define REG_A4XX_PC_BINNING_COMMAND				0x00000d00
@@ -2389,16 +2454,10 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_0D01					0x00000d01
 
-#define REG_A4XX_UNKNOWN_0E05					0x00000e05
-
 #define REG_A4XX_UNKNOWN_0E42					0x00000e42
 
 #define REG_A4XX_UNKNOWN_0EC2					0x00000ec2
 
-#define REG_A4XX_UNKNOWN_0EC3					0x00000ec3
-
-#define REG_A4XX_UNKNOWN_0F03					0x00000f03
-
 #define REG_A4XX_UNKNOWN_2001					0x00002001
 
 #define REG_A4XX_UNKNOWN_209B					0x0000209b
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index 2321876dd48..6e109b6205a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -86,7 +86,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A3XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 6cd2cd730f5..f3e1ccebccc 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -611,10 +611,10 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC3, 1);
+	OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x00000006);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0F03, 1);
+	OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x0000003a);
 
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1);
@@ -633,7 +633,7 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000012);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E05, 1);
+	OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x00000000);
 
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1);
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index b23aa830770..c45b70399b2 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
 
-Copyright (C) 2013-2014 by the following authors:
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 2b24c5b4e78..7c2451b2e71 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -67,7 +67,7 @@ enum vgt_event_type {
 
 enum pc_di_primtype {
 	DI_PT_NONE = 0,
-	DI_PT_POINTLIST_A2XX = 1,
+	DI_PT_POINTLIST_PSIZE = 1,
 	DI_PT_LINELIST = 2,
 	DI_PT_LINESTRIP = 3,
 	DI_PT_TRILIST = 4,
@@ -75,7 +75,7 @@ enum pc_di_primtype {
 	DI_PT_TRISTRIP = 6,
 	DI_PT_LINELOOP = 7,
 	DI_PT_RECTLIST = 8,
-	DI_PT_POINTLIST_A3XX = 9,
+	DI_PT_POINTLIST = 9,
 	DI_PT_LINE_ADJ = 10,
 	DI_PT_LINESTRIP_ADJ = 11,
 	DI_PT_TRI_ADJ = 12,

From 15d3524ad24a698095cc542cf9a527c8a8615f78 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 2 Jul 2015 18:07:27 -0400
Subject: [PATCH 0260/1208] freedreno/a4xx: occlusion query support

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a4xx/fd4_query.c        | 86 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index 6db1c11b94b..4f69e0c1694 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -31,9 +31,93 @@
 #include "freedreno_util.h"
 
 #include "fd4_query.h"
+#include "fd4_draw.h"
 #include "fd4_format.h"
 
+
+struct fd_rb_samp_ctrs {
+	uint64_t ctr[16];
+};
+
+/*
+ * Occlusion Query:
+ *
+ * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
+ * interpret results
+ */
+
+static struct fd_hw_sample *
+occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	struct fd_hw_sample *samp =
+			fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
+
+	/* low bits of sample addr should be zero (since they are control
+	 * flags in RB_SAMPLE_COUNT_CONTROL):
+	 */
+	debug_assert((samp->offset & 0x3) == 0);
+
+	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
+	 * HW_QUERY_BASE_REG register:
+	 */
+	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
+	OUT_RING(ring, HW_QUERY_BASE_REG);
+	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
+			samp->offset);
+
+	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
+	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
+						INDEX4_SIZE_32_BIT, USE_VISIBILITY));
+	OUT_RING(ring, 1);             /* NumInstances */
+	OUT_RING(ring, 0);             /* NumIndices */
+
+	fd_event_write(ctx, ring, ZPASS_DONE);
+
+	return samp;
+}
+
+static uint64_t
+count_samples(const struct fd_rb_samp_ctrs *start,
+		const struct fd_rb_samp_ctrs *end)
+{
+	return end->ctr[0] - start->ctr[0];
+}
+
+static void
+occlusion_counter_accumulate_result(struct fd_context *ctx,
+		const void *start, const void *end,
+		union pipe_query_result *result)
+{
+	uint64_t n = count_samples(start, end);
+	result->u64 += n;
+}
+
+static void
+occlusion_predicate_accumulate_result(struct fd_context *ctx,
+		const void *start, const void *end,
+		union pipe_query_result *result)
+{
+	uint64_t n = count_samples(start, end);
+	result->b |= (n > 0);
+}
+
+static const struct fd_hw_sample_provider occlusion_counter = {
+		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
+		.active = FD_STAGE_DRAW,
+		.get_sample = occlusion_get_sample,
+		.accumulate_result = occlusion_counter_accumulate_result,
+};
+
+static const struct fd_hw_sample_provider occlusion_predicate = {
+		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
+		.active = FD_STAGE_DRAW,
+		.get_sample = occlusion_get_sample,
+		.accumulate_result = occlusion_predicate_accumulate_result,
+};
+
 void fd4_query_context_init(struct pipe_context *pctx)
 {
-	/* TODO */
+	fd_hw_query_register_provider(pctx, &occlusion_counter);
+	fd_hw_query_register_provider(pctx, &occlusion_predicate);
 }

From 75784243df1f5bb0652fb243b37d69f36d493a86 Mon Sep 17 00:00:00 2001
From: Chad Versace <chad.versace@intel.com>
Date: Thu, 9 Jul 2015 18:46:21 -0700
Subject: [PATCH 0261/1208] mesa: Fix generation of git_sha1.h.tmp for gitlinks

Don't assume that $(top_srcdir)/.git is a directory. It may be a
gitlink file [1] if $(top_srcdir) is a submodule checkout or a linked
worktree [2].

[1] A "gitlink" is a text file that specifies the real location of
    the gitdir.
[2] Linked worktrees are a new feature in Git 2.5.

Cc: "10.6, 10.5" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/Makefile.am | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index c86ded979b9..eb4a3da3c84 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -38,8 +38,11 @@ gl_HEADERS = $(top_srcdir)/include/GL/*.h
 
 .PHONY: main/git_sha1.h.tmp
 main/git_sha1.h.tmp:
+	@# Don't assume that $(top_srcdir)/.git is a directory. It may be
+	@# a gitlink file if $(top_srcdir) is a submodule checkout or a linked
+	@# worktree.
 	@touch main/git_sha1.h.tmp
-	@if test -d $(top_srcdir)/.git; then \
+	@if test -e $(top_srcdir)/.git; then \
 		if which git > /dev/null; then \
 		    git --git-dir=$(top_srcdir)/.git log -n 1 --oneline | \
 			sed 's/^\([^ ]*\) .*/#define MESA_GIT_SHA1 "git-\1"/' \

From 0fae4e451bc60de1138729d20e03100e93cc6f38 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 8 Jul 2015 21:26:02 +1000
Subject: [PATCH 0262/1208] glsl: use set rather than old hash table for
 ir_validate

When the new hash table implementation was added to Mesa it claimed to be much
faster, see commits 35fd61bd99c1 and 72e55bb6888ff.

The set implementation follows the same implementation strategy so this should
be faster and there was no need to store a data field.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/ir_validate.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index cfe0df3dca6..684bef28035 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -35,7 +35,8 @@
 
 #include "ir.h"
 #include "ir_hierarchical_visitor.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
+#include "util/set.h"
 #include "glsl_types.h"
 
 namespace {
@@ -44,18 +45,18 @@ class ir_validate : public ir_hierarchical_visitor {
 public:
    ir_validate()
    {
-      this->ht = hash_table_ctor(0, hash_table_pointer_hash,
-				 hash_table_pointer_compare);
+      this->ir_set = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                      _mesa_key_pointer_equal);
 
       this->current_function = NULL;
 
       this->callback_enter = ir_validate::validate_ir;
-      this->data_enter = ht;
+      this->data_enter = ir_set;
    }
 
    ~ir_validate()
    {
-      hash_table_dtor(this->ht);
+      _mesa_set_destroy(this->ir_set, NULL);
    }
 
    virtual ir_visitor_status visit(ir_variable *v);
@@ -80,7 +81,7 @@ public:
 
    ir_function *current_function;
 
-   struct hash_table *ht;
+   struct set *ir_set;
 };
 
 } /* anonymous namespace */
@@ -94,7 +95,7 @@ ir_validate::visit(ir_dereference_variable *ir)
       abort();
    }
 
-   if (hash_table_find(ht, ir->var) == NULL) {
+   if (_mesa_set_search(ir_set, ir->var) == NULL) {
       printf("ir_dereference_variable @ %p specifies undeclared variable "
 	     "`%s' @ %p\n",
 	     (void *) ir, ir->var->name, (void *) ir->var);
@@ -730,8 +731,7 @@ ir_validate::visit(ir_variable *ir)
    if (ir->name && ir->is_name_ralloced())
       assert(ralloc_parent(ir->name) == ir);
 
-   hash_table_insert(ht, ir, ir);
-
+   _mesa_set_add(ir_set, ir);
 
    /* If a variable is an array, verify that the maximum array index is in
     * bounds.  There was once an error in AST-to-HIR conversion that set this
@@ -885,15 +885,15 @@ dump_ir:
 void
 ir_validate::validate_ir(ir_instruction *ir, void *data)
 {
-   struct hash_table *ht = (struct hash_table *) data;
+   struct set *ir_set = (struct set *) data;
 
-   if (hash_table_find(ht, ir)) {
+   if (_mesa_set_search(ir_set, ir)) {
       printf("Instruction node present twice in ir tree:\n");
       ir->print();
       printf("\n");
       abort();
    }
-   hash_table_insert(ht, ir, ir);
+   _mesa_set_add(ir_set, ir);
 }
 
 void

From 0edb084f9d0444c451a08fd2ed7daee2eb8a6f4a Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 1 Jul 2015 17:01:24 -0700
Subject: [PATCH 0263/1208] i965/gs: Move vertex_count != 0 check up a level;
 skip one caller.

Paul's original code had emit_control_data_bits() skip the URB write if
vertex_count was 0.  This meant wrapping every control data write in a
conditional write.

We accumulate control data bits in a single UD (32-bit) register.  For
simple shaders that don't emit many vertices, the control data header
will be <= 32-bits long, so we only need to write it once at the end of
the shader.

For shaders with larger headers, we write out batches of control data
bits at EmitVertex(), when (vertex_count * bits_per_vertex) % 32 == 0.
On the first EmitVertex() call, the above expression will evaluate to
true simply because vertex_count == 0.  But we want to avoid emitting
the control data bits, because we haven't accumulated 32-bits worth yet.

In other words, the vertex_count != 0 check is really only necessary in
the EmitVertex() batching case, not the end-of-thread case.

This saves a CMP/IF/ENDIF in every shader that uses EndPrimitive() or
multiple streams.  The only downside is that a shader which emits no
vertices at all will execute an additional URB write---but such shaders
are pointless and not worth optimizing.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 2f948ee73c0..55408eb0b0c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -348,11 +348,6 @@ vec4_gs_visitor::emit_control_data_bits()
    if (c->control_data_header_size_bits > 128)
       urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
 
-   /* If vertex_count is 0, then no control data bits have been accumulated
-    * yet, so we should do nothing.
-    */
-   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
-   emit(IF(BRW_PREDICATE_NORMAL));
    {
       /* If we are using either channel masks or a per-slot offset, then we
        * need to figure out which DWORD we are trying to write to, using the
@@ -431,7 +426,6 @@ vec4_gs_visitor::emit_control_data_bits()
       inst->base_mrf = base_mrf;
       inst->mlen = 2;
    }
-   emit(BRW_OPCODE_ENDIF);
 }
 
 void
@@ -531,9 +525,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
             emit(AND(dst_null_d(), this->vertex_count,
                      (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
          inst->conditional_mod = BRW_CONDITIONAL_Z;
+
          emit(IF(BRW_PREDICATE_NORMAL));
          {
+            /* If vertex_count is 0, then no control data bits have been
+             * accumulated yet, so we skip emitting them.
+             */
+            emit(CMP(dst_null_d(), this->vertex_count, 0u,
+                     BRW_CONDITIONAL_NEQ));
+            emit(IF(BRW_PREDICATE_NORMAL));
             emit_control_data_bits();
+            emit(BRW_OPCODE_ENDIF);
 
             /* Reset control_data_bits to 0 so we can start accumulating a new
              * batch.

From a078e13a7cfba9275bea2a1c7f80ac54bcf40036 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 1 Jul 2015 17:01:54 -0700
Subject: [PATCH 0264/1208] i965: Fix indentation in emit_control_data_bits().

The last patch left the code indented too far.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../drivers/dri/i965/brw_vec4_gs_visitor.cpp  | 152 +++++++++---------
 1 file changed, 75 insertions(+), 77 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 55408eb0b0c..d6b350bea3f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -348,84 +348,82 @@ vec4_gs_visitor::emit_control_data_bits()
    if (c->control_data_header_size_bits > 128)
       urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
 
-   {
-      /* If we are using either channel masks or a per-slot offset, then we
-       * need to figure out which DWORD we are trying to write to, using the
-       * formula:
-       *
-       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
-       *
-       * Since bits_per_vertex is a power of two, and is known at compile
-       * time, this can be optimized to:
-       *
-       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
-       */
-      src_reg dword_index(this, glsl_type::uint_type);
-      if (urb_write_flags) {
-         src_reg prev_count(this, glsl_type::uint_type);
-         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
-         unsigned log2_bits_per_vertex =
-            _mesa_fls(c->control_data_bits_per_vertex);
-         emit(SHR(dst_reg(dword_index), prev_count,
-                  (uint32_t) (6 - log2_bits_per_vertex)));
-      }
-
-      /* Start building the URB write message.  The first MRF gets a copy of
-       * R0.
-       */
-      int base_mrf = 1;
-      dst_reg mrf_reg(MRF, base_mrf);
-      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
-      inst->force_writemask_all = true;
-
-      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
-         /* Set the per-slot offset to dword_index / 4, to that we'll write to
-          * the appropriate OWORD within the control data header.
-          */
-         src_reg per_slot_offset(this, glsl_type::uint_type);
-         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
-         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
-      }
-
-      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
-         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
-          * write to the appropriate DWORD within the OWORD.  We need to do
-          * this computation with force_writemask_all, otherwise garbage data
-          * from invocation 0 might clobber the mask for invocation 1 when
-          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
-          * together.
-          */
-         src_reg channel(this, glsl_type::uint_type);
-         inst = emit(AND(dst_reg(channel), dword_index, 3u));
-         inst->force_writemask_all = true;
-         src_reg one(this, glsl_type::uint_type);
-         inst = emit(MOV(dst_reg(one), 1u));
-         inst->force_writemask_all = true;
-         src_reg channel_mask(this, glsl_type::uint_type);
-         inst = emit(SHL(dst_reg(channel_mask), one, channel));
-         inst->force_writemask_all = true;
-         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
-                                               channel_mask);
-         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
-      }
-
-      /* Store the control data bits in the message payload and send it. */
-      dst_reg mrf_reg2(MRF, base_mrf + 1);
-      inst = emit(MOV(mrf_reg2, this->control_data_bits));
-      inst->force_writemask_all = true;
-      inst = emit(GS_OPCODE_URB_WRITE);
-      inst->urb_write_flags = urb_write_flags;
-      /* We need to increment Global Offset by 256-bits to make room for
-       * Broadwell's extra "Vertex Count" payload at the beginning of the
-       * URB entry.  Since this is an OWord message, Global Offset is counted
-       * in 128-bit units, so we must set it to 2.
-       */
-      if (devinfo->gen >= 8)
-         inst->offset = 2;
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2;
+   /* If we are using either channel masks or a per-slot offset, then we
+    * need to figure out which DWORD we are trying to write to, using the
+    * formula:
+    *
+    *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two, and is known at compile
+    * time, this can be optimized to:
+    *
+    *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
+    */
+   src_reg dword_index(this, glsl_type::uint_type);
+   if (urb_write_flags) {
+      src_reg prev_count(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
+      unsigned log2_bits_per_vertex =
+         _mesa_fls(c->control_data_bits_per_vertex);
+      emit(SHR(dst_reg(dword_index), prev_count,
+               (uint32_t) (6 - log2_bits_per_vertex)));
    }
+
+   /* Start building the URB write message.  The first MRF gets a copy of
+    * R0.
+    */
+   int base_mrf = 1;
+   dst_reg mrf_reg(MRF, base_mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+
+   if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
+      /* Set the per-slot offset to dword_index / 4, to that we'll write to
+       * the appropriate OWORD within the control data header.
+       */
+      src_reg per_slot_offset(this, glsl_type::uint_type);
+      emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
+      emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
+   }
+
+   if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD.  We need to do
+       * this computation with force_writemask_all, otherwise garbage data
+       * from invocation 0 might clobber the mask for invocation 1 when
+       * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
+       * together.
+       */
+      src_reg channel(this, glsl_type::uint_type);
+      inst = emit(AND(dst_reg(channel), dword_index, 3u));
+      inst->force_writemask_all = true;
+      src_reg one(this, glsl_type::uint_type);
+      inst = emit(MOV(dst_reg(one), 1u));
+      inst->force_writemask_all = true;
+      src_reg channel_mask(this, glsl_type::uint_type);
+      inst = emit(SHL(dst_reg(channel_mask), one, channel));
+      inst->force_writemask_all = true;
+      emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
+                                            channel_mask);
+      emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
+   }
+
+   /* Store the control data bits in the message payload and send it. */
+   dst_reg mrf_reg2(MRF, base_mrf + 1);
+   inst = emit(MOV(mrf_reg2, this->control_data_bits));
+   inst->force_writemask_all = true;
+   inst = emit(GS_OPCODE_URB_WRITE);
+   inst->urb_write_flags = urb_write_flags;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry.  Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (devinfo->gen >= 8)
+      inst->offset = 2;
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
 }
 
 void

From f3a620e2a63956a37367b9e393d4c1ecd41e5d43 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 22 Apr 2015 17:46:08 -0700
Subject: [PATCH 0265/1208] i965: Label the repclear shader "meta repclear"
 rather than "meta clear".

Color clears can be performed via two separate shaders - one is the
generic "meta clear" shader (in meta.c); the other is the i965 specific
"repclear" shader (in brw_meta_fast_clear.c).

Giving them separate names makes them distinguishable when reading
INTEL_DEBUG=shader_time output.

v2: Call it "meta repclear", as suggested by Jason.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_meta_fast_clear.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 5b8191c093b..c5e556ee9eb 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -128,7 +128,7 @@ brw_bind_rep_write_shader(struct brw_context *brw, float *color)
    _mesa_AttachShader(clear->shader_prog, vs);
    _mesa_DeleteShader(vs);
    _mesa_BindAttribLocation(clear->shader_prog, 0, "position");
-   _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta clear");
+   _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta repclear");
    _mesa_LinkProgram(clear->shader_prog);
 
    clear->color_location =

From 6be024f44dea7df6608e5a3111deffc61dbf6d6d Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 25 Jun 2015 09:17:38 -0700
Subject: [PATCH 0266/1208] i965/gen6: Set up layer constraints properly for
 depth buffers.

This ports over Chris Forbes' equivalent fixes in gen7_misc_state.c
from commit 77d55ef4819436ebbf9786a1e720ec00707bbb19.

No Piglit changes on Sandybridge.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/gen6_depth_state.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
index 8f0d7dc5431..febd4781100 100644
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
@@ -73,7 +73,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
    rb = (struct gl_renderbuffer*) irb;
 
    if (rb) {
-      depth = MAX2(rb->Depth, 1);
+      depth = MAX2(irb->layer_count, 1);
       if (rb->TexImage)
          gl_target = rb->TexImage->TexObject->Target;
    }
@@ -89,6 +89,10 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
       surftype = BRW_SURFACE_2D;
       depth *= 6;
       break;
+   case GL_TEXTURE_3D:
+      assert(mt);
+      depth = MAX2(mt->logical_depth0, 1);
+      /* fallthrough */
    default:
       surftype = translate_tex_target(gl_target);
       break;

From 4fe15717ce2fc0b1c239d3d7bf9a7bb04fb50dd5 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 25 Jun 2015 10:08:06 -0700
Subject: [PATCH 0267/1208] i965: Remove special case for layered drawbuffer
 attachments.

When binding a layered texture, the layer is already 0.  There's no need
to special case this.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/drivers/dri/i965/gen6_surface_state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_surface_state.c b/src/mesa/drivers/dri/i965/gen6_surface_state.c
index 03e913a0a76..39de62f2304 100644
--- a/src/mesa/drivers/dri/i965/gen6_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_surface_state.c
@@ -88,7 +88,8 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
       break;
    }
 
-   const int min_array_element = layered ? 0 : irb->mt_layer;
+   const int min_array_element = irb->mt_layer;
+   assert(!layered || irb->mt_layer == 0);
 
    surf[0] = SET_FIELD(surftype, BRW_SURFACE_TYPE) |
              SET_FIELD(format, BRW_SURFACE_FORMAT);

From 1bfa25e88d21f95b9e176232bb091af77c294578 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 10 Jul 2015 16:42:18 -0400
Subject: [PATCH 0268/1208] nv50, nvc0: enable at least one color RT if
 alphatest is enabled

Fixes the following piglits:
  fbo-alphatest-nocolor
  fbo-alphatest-nocolor-ff

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 .../drivers/nouveau/nv50/nv50_state_validate.c | 18 ++++++++++++++++++
 .../drivers/nouveau/nvc0/nvc0_state_validate.c | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 116bf4bba7c..293f9802df8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -296,6 +296,23 @@ nv50_check_program_ucps(struct nv50_context *nv50,
    nv50_fp_linkage_validate(nv50);
 }
 
+/* alpha test is disabled if there are no color RTs, so make sure we have at
+ * least one if alpha test is enabled. Note that this must run after
+ * nv50_validate_fb, otherwise that will override the RT count setting.
+ */
+static void
+nv50_validate_derived_2(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+   if (nv50->zsa && nv50->zsa->pipe.alpha.enabled &&
+       nv50->framebuffer.nr_cbufs == 0) {
+      nv50_fb_set_null_rt(push, 0);
+      BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
+      PUSH_DATA (push, (076543210 << 4) | 1);
+   }
+}
+
 static void
 nv50_validate_clip(struct nv50_context *nv50)
 {
@@ -456,6 +473,7 @@ static struct state_validate {
     { nv50_gp_linkage_validate,    NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
     { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
+    { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
     { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index c52399ab312..785e52e9561 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -535,6 +535,23 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0)
    }
 }
 
+/* alpha test is disabled if there are no color RTs, so make sure we have at
+ * least one if alpha test is enabled. Note that this must run after
+ * nvc0_validate_fb, otherwise that will override the RT count setting.
+ */
+static void
+nvc0_validate_derived_2(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled &&
+       nvc0->framebuffer.nr_cbufs == 0) {
+      nvc0_fb_set_null_rt(push, 0);
+      BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
+      PUSH_DATA (push, (076543210 << 4) | 1);
+   }
+}
+
 static void
 nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
 {
@@ -597,6 +614,7 @@ static struct state_validate {
     { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
                                    NVC0_NEW_RASTERIZER },
+    { nvc0_validate_derived_2,     NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
     { nvc0_validate_clip,          NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
                                    NVC0_NEW_VERTPROG |
                                    NVC0_NEW_TEVLPROG |

From c397bd14077b760125604426a99aba00d6193788 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 9 Jul 2015 15:22:09 +1000
Subject: [PATCH 0269/1208] r600g: fix sampler/ubo indexing on cayman

Cayman needs a different method to upload the CF IDX0/1

This fixes 31 piglits when ARB_gpu_shader5 is forced on
with cayman.

Reviewed-by: Glenn Kennard <glenn.kennard@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/eg_asm.c | 17 +++++++++++------
 src/gallium/drivers/r600/eg_sq.h  |  7 +++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 295cb4d80b7..42e8b0b1761 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -160,6 +160,9 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c
 	alu.op = ALU_OP1_MOVA_INT;
 	alu.src[0].sel = bc->index_reg[id];
 	alu.src[0].chan = 0;
+	if (bc->chip_class == CAYMAN)
+		alu.dst.sel = id == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
+
 	alu.last = 1;
 	r = r600_bytecode_add_alu(bc, &alu);
 	if (r)
@@ -167,12 +170,14 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c
 
 	bc->ar_loaded = 0; /* clobbered */
 
-	memset(&alu, 0, sizeof(alu));
-	alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1;
-	alu.last = 1;
-	r = r600_bytecode_add_alu(bc, &alu);
-	if (r)
-		return r;
+	if (bc->chip_class == EVERGREEN) {
+		memset(&alu, 0, sizeof(alu));
+		alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
 
 	/* Must split ALU group as index only applies to following group */
 	if (inside_alu_clause) {
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index b534872f062..97e230f56c7 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -521,4 +521,11 @@
 
 #define V_SQ_REL_ABSOLUTE 0
 #define V_SQ_REL_RELATIVE 1
+
+/* CAYMAN has special encoding for MOVA_INT destination */
+#define CM_V_SQ_MOVA_DST_AR_X 0
+#define CM_V_SQ_MOVA_DST_CF_PC 1
+#define CM_V_SQ_MOVA_DST_CF_IDX0 2
+#define CM_V_SQ_MOVA_DST_CF_IDX1 3
+
 #endif

From e70d0515603df081916f6f31bb9e0455298b10cc Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 9 Jul 2015 15:49:56 +1000
Subject: [PATCH 0270/1208] r600g: move sampler/ubo index registers before temp
 reg

temp_reg needs to be last, as we increment things
away from it, otherwise on cayman some tests were overwriting
the index regs.

Fixes 2 piglit with ARB_gpu_shader5 forced on cayman.

Reviewed-by: Glenn Kennard <glenn.kennard@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_shader.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index af7622e9b34..1a72bf6e77e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1931,15 +1931,14 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
+	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
+
 	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-		ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
-		ctx.temp_reg = ctx.bc->ar_reg + 2;
-		ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3;
-		ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4;
+		ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 3;
+		ctx.temp_reg = ctx.bc->ar_reg + 4;
 	} else {
-		ctx.temp_reg = ctx.bc->ar_reg + 1;
-		ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2;
-		ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3;
+		ctx.temp_reg = ctx.bc->ar_reg + 3;
 	}
 
 	shader->max_arrays = 0;

From ad2c3905d3460a6ddfc6756fc58a78332d82e72f Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 1 Jul 2015 06:31:13 +0100
Subject: [PATCH 0271/1208] tgsi: add DFMA to the opcode infer functions.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_info.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 4b16ef387f3..8da42b98468 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -381,6 +381,7 @@ tgsi_opcode_infer_type( uint opcode )
       return TGSI_TYPE_SIGNED;
    case TGSI_OPCODE_DADD:
    case TGSI_OPCODE_DABS:
+   case TGSI_OPCODE_DFMA:
    case TGSI_OPCODE_DNEG:
    case TGSI_OPCODE_DMUL:
    case TGSI_OPCODE_DMAX:

From 66d354384505fd5ef67b8683db94e8967aba338b Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 11 Jul 2015 19:46:49 +0100
Subject: [PATCH 0272/1208] Add release notes for the 10.6.2 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 9643cce94c8a1938e3342fb83d025a1e5c2aa79b)
---
 docs/relnotes/10.6.2.html | 164 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 docs/relnotes/10.6.2.html

diff --git a/docs/relnotes/10.6.2.html b/docs/relnotes/10.6.2.html
new file mode 100644
index 00000000000..8ba6cdb607e
--- /dev/null
+++ b/docs/relnotes/10.6.2.html
@@ -0,0 +1,164 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.2 Release Notes / July 11, 2015</h1>
+
+<p>
+Mesa 10.6.2 is a bug fix release which fixes bugs found since the 10.6.1 release.
+</p>
+<p>
+Mesa 10.6.2 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=73528">Bug 73528</a> - Deferred lighting in Second Life causes system hiccups and screen flickering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80500">Bug 80500</a> - Flickering shadows in unreleased title trace</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82186">Bug 82186</a> - [r600g] BARTS GPU lockup with minecraft shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84225">Bug 84225</a> - Allow constant-index-expression sampler array indexing with GLSL-ES &lt; 300</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90537">Bug 90537</a> - radeonsi bo/va conflict on RADEON_GEM_VA (rscreen-&gt;ws-&gt;buffer_from_handle returns NULL)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90873">Bug 90873</a> - Kernel hang, TearFree On, Mate desktop environment</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91022">Bug 91022</a> - [g45 g965 bisected] assertions generated from textureGrad cube samplers fix</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91047">Bug 91047</a> - [SNB Bisected] Messed up Fog in Super Smash Bros. Melee in Dolphin</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91056">Bug 91056</a> - The Bard's Tale (2005, native)  has rendering issues</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91117">Bug 91117</a> - Nimbus (running in wine) has rendering issues, objects are semi-transparent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91124">Bug 91124</a> - Civilization V (in Wine) has rendering issues: text missing, menu bar corrupted</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91173">Bug 91173</a> - Oddworld: Stranger's Wrath HD: disfigured models in wrong colors</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91226">Bug 91226</a> - Crash in glLinkProgram (NEW)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91231">Bug 91231</a> - [NV92] Psychonauts (native) segfaults on start when DRI3 enabled</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Chris Wilson (1):</p>
+<ul>
+  <li>loader: Look for any version of currently linked libudev.so</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: Add sha256 checksums for the 10.6.1 release</li>
+  <li>Update version to 10.6.2</li>
+</ul>
+
+<p>Ilia Mirkin (8):</p>
+<ul>
+  <li>nv50/ir: propagate modifier to right arg when const-folding mad</li>
+  <li>nv50/ir: fix emission of address reg in 3rd source</li>
+  <li>nv50/ir: copy joinAt when splitting both before and after</li>
+  <li>mesa: reset the source packing when creating temp transfer image</li>
+  <li>nv50/ir: don't emit src2 in immediate form</li>
+  <li>mesa/prog: relative offsets into constbufs are not constant</li>
+  <li>nv50/ir: UCMP arguments are float, so make sure modifiers are applied</li>
+  <li>nvc0: turn sample counts off during blit</li>
+</ul>
+
+<p>Kenneth Graunke (5):</p>
+<ul>
+  <li>i965/fs: Fix ir_txs in emit_texture_gen4_simd16().</li>
+  <li>i965: Reserve more batch space to accomodate Gen6 perfmonitors.</li>
+  <li>i965/vs: Fix matNxM vertex attributes where M != 4.</li>
+  <li>Revert "glsl: clone inputs and outputs during linking"</li>
+  <li>Revert "i965: Delete linked GLSL IR when using NIR."</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>r600g: disable single-sample fast color clear due to hangs</li>
+  <li>radeonsi: fix a hang with DrawTransformFeedback on 4 SE chips</li>
+  <li>st/dri: don't set PIPE_BIND_SCANOUT for MSAA surfaces</li>
+</ul>
+
+<p>Mario Kleiner (2):</p>
+<ul>
+  <li>nouveau: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+  <li>winsys/radeon: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+</ul>
+
+<p>Matt Turner (2):</p>
+<ul>
+  <li>i965/fs: Don't mess up stride for uniform integer multiplication.</li>
+  <li>Revert SHA1 additions.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>winsys/radeon: Unmap GPU VM address range when destroying BO</li>
+</ul>
+
+<p>Mike Stroyan (2):</p>
+<ul>
+  <li>meta: Only change and restore viewport 0 in mesa meta mode</li>
+  <li>i965: allocate at least 1 BLEND_STATE element</li>
+</ul>
+
+<p>Neil Roberts (4):</p>
+<ul>
+  <li>i965/skl: Set the pulls bary bit in 3DSTATE_PS_EXTRA</li>
+  <li>glsl: Add missing check for whether an expression is an add operation</li>
+  <li>glsl: Make sure not to dereference NULL</li>
+  <li>i965: Don't try to print the GLSL IR if it has been freed</li>
+</ul>
+
+<p>Tapani Pälli (8):</p>
+<ul>
+  <li>glsl: clone inputs and outputs during linking</li>
+  <li>i965: Delete linked GLSL IR when using NIR.</li>
+  <li>glsl: Allow dynamic sampler array indexing with GLSL ES &lt; 3.00</li>
+  <li>mesa/glsl: new compiler option EmitNoIndirectSampler</li>
+  <li>i965: use EmitNoIndirectSampler for gen &lt; 7</li>
+  <li>i915: use EmitNoIndirectSampler</li>
+  <li>mesa/st: use EmitNoIndirectSampler if !ARB_gpu_shader5</li>
+  <li>glsl: validate sampler array indexing for 'constant-index-expression'</li>
+</ul>
+
+
+</div>
+</body>
+</html>

From 6dfce109c27b4e15373adcbbde981140912001ae Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 11 Jul 2015 20:33:16 +0100
Subject: [PATCH 0273/1208] docs: Add sha256 checksums for the 10.6.2 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 89cbd91b17989ec7eb1cb93ac427a84dca56cd79)
---
 docs/relnotes/10.6.2.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/relnotes/10.6.2.html b/docs/relnotes/10.6.2.html
index 8ba6cdb607e..d95417a8521 100644
--- a/docs/relnotes/10.6.2.html
+++ b/docs/relnotes/10.6.2.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+9c7ab9300dda6c912faaaff97995ec1820ba21d114d9cf555f145cbad90995f4  mesa-10.6.2.tar.gz
+05753d3db4212900927b9894221a1669a10f56786e86a7e818b6e18a0817dca9  mesa-10.6.2.tar.xz
 </pre>
 
 

From 846c60fc7df587a24d5bf0835497aa25034538b3 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 11 Jul 2015 20:36:44 +0100
Subject: [PATCH 0274/1208] docs: add news item and link release notes for mesa
 10.6.2

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 docs/index.html    | 6 ++++++
 docs/relnotes.html | 1 +
 2 files changed, 7 insertions(+)

diff --git a/docs/index.html b/docs/index.html
index 4aa511dad1b..36b993eeb98 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>July 11 2015</h2>
+<p>
+<a href="relnotes/10.6.2.html">Mesa 10.6.2</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>July 04, 2015</h2>
 <p>
 <a href="relnotes/10.5.9.html">Mesa 10.5.9</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 35ae4c7ade0..1acbce0551a 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.2.html">10.6.2 release notes</a>
 <li><a href="relnotes/10.5.9.html">10.5.9 release notes</a>
 <li><a href="relnotes/10.6.1.html">10.6.1 release notes</a>
 <li><a href="relnotes/10.5.8.html">10.5.8 release notes</a>

From 6cc29cf5e2c21dc0937b2c794758be61d3281324 Mon Sep 17 00:00:00 2001
From: Rhys Kidd <rhyskidd@gmail.com>
Date: Sat, 27 Jun 2015 13:14:37 +1000
Subject: [PATCH 0275/1208] doxygen: Add doxygen_sqlite3.db to .gitignore

Signed-off-by: Rhys Kidd <rhyskidd@gmail.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 doxygen/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doxygen/.gitignore b/doxygen/.gitignore
index abf56ac682d..a5f3921b445 100644
--- a/doxygen/.gitignore
+++ b/doxygen/.gitignore
@@ -1,3 +1,4 @@
+*.db
 *.tag
 *.tmp
 agpgart

From 5d219908ce045805647b85d1d302b58887e63c1b Mon Sep 17 00:00:00 2001
From: Rhys Kidd <rhyskidd@gmail.com>
Date: Sat, 27 Jun 2015 13:14:38 +1000
Subject: [PATCH 0276/1208] doxygen: Remove doxygen_sqlite3.db with 'make
 clean'

Signed-off-by: Rhys Kidd <rhyskidd@gmail.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 doxygen/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doxygen/Makefile b/doxygen/Makefile
index 0a95a3516a2..01c2691cfe0 100644
--- a/doxygen/Makefile
+++ b/doxygen/Makefile
@@ -33,3 +33,4 @@ subset: $(SUBSET:.doxy=.tag)
 clean:
 	-rm -rf $(FULL:.doxy=) $(SUBSET:.doxy=)
 	-rm -rf *.tag
+	-rm -rf *.db

From f7008ebcdc4d936e8b2b1a317d870e907e4d369f Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.co.uk>
Date: Fri, 17 Apr 2015 15:13:35 +0200
Subject: [PATCH 0277/1208] dri3_open: don't leak the reply

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90073
Signed-off-by: Guillaume Desmottes <guillaume.desmottes@collabora.co.uk>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/glx/dri3_glx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index dfb0093395f..96f13e6a07b 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -1679,6 +1679,8 @@ dri3_open(Display *dpy,
    fd = xcb_dri3_open_reply_fds(c, reply)[0];
    fcntl(fd, F_SETFD, FD_CLOEXEC);
 
+   free(reply);
+
    return fd;
 }
 

From 8108de4774f2542a8fe65de71b82221821f73434 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.co.uk>
Date: Fri, 17 Apr 2015 15:13:36 +0200
Subject: [PATCH 0278/1208] loader: don't leak udev_enumerate

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90073
Signed-off-by: Guillaume Desmottes <guillaume.desmottes@collabora.co.uk>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/loader/loader.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/loader/loader.c b/src/loader/loader.c
index 8780587a725..8da1858734a 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -273,6 +273,8 @@ get_render_node_from_id_path_tag(struct udev *udev,
                (struct udev_enumerate *));
    UDEV_SYMBOL(struct udev_list_entry *, udev_enumerate_get_list_entry,
                (struct udev_enumerate *));
+   UDEV_SYMBOL(void, udev_enumerate_unref,
+               (struct udev_enumerate *));
    UDEV_SYMBOL(struct udev_list_entry *, udev_list_entry_get_next,
                (struct udev_list_entry *));
    UDEV_SYMBOL(const char *, udev_list_entry_get_name,
@@ -307,6 +309,8 @@ get_render_node_from_id_path_tag(struct udev *udev,
       udev_device_unref(device);
    }
 
+   udev_enumerate_unref(e);
+
    if (found) {
       path_res = strdup(udev_device_get_devnode(device));
       udev_device_unref(device);

From 4cbf0a0ccf2fb4545b206066b756fd9a07acab92 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 1 Jul 2015 04:58:24 +0100
Subject: [PATCH 0279/1208] radeonsi: ARB_gpu_shader_fp64 +
 ARB_vertex_attrib_64bit support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds the translation from TGSI to AMDGPU llvm backend, for the
64-bit opcodes. The backend pretty much handles everything for us
fine. There is one patch required for SI DFRAC support, that I know
off.

[airlied: fixed missing comma, updated relnotes]

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                                  |   4 +-
 docs/relnotes/10.7.0.html                     |   4 +-
 src/gallium/drivers/radeon/radeon_llvm.h      |   7 +-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 143 +++++++++++++++++-
 src/gallium/drivers/radeonsi/si_pipe.c        |   1 +
 src/gallium/drivers/radeonsi/si_shader.c      |  31 +++-
 6 files changed, 175 insertions(+), 15 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 94bbcd12dfc..33a282edd5b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -109,7 +109,7 @@ GL 4.0, GLSL 4.00:
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
   - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (nvc0, llvmpipe, softpipe)
+  GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_shader_subroutine                             started (Dave)
   GL_ARB_tessellation_shader                           started (Chris, Ilia)
@@ -127,7 +127,7 @@ GL 4.1, GLSL 4.10:
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_shader_precision                              started (Micah)
-  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, llvmpipe, softpipe)
+  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
 
 
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 7f55b067a4a..42ea807e1ff 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -47,9 +47,9 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
-<li>GL_ARB_gpu_shader_fp64 on llvmpipe</li>
+<li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
-<li>GL_ARB_vertex_attrib_64bit on llvmpipe</li>
+<li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
 <li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 6a9557b0b73..591e698d482 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -146,6 +146,8 @@ static inline LLVMTypeRef tgsi2llvmtype(
 	case TGSI_TYPE_UNSIGNED:
 	case TGSI_TYPE_SIGNED:
 		return LLVMInt32TypeInContext(ctx);
+	case TGSI_TYPE_DOUBLE:
+		return LLVMDoubleTypeInContext(ctx);
 	case TGSI_TYPE_UNTYPED:
 	case TGSI_TYPE_FLOAT:
 		return LLVMFloatTypeInContext(ctx);
@@ -205,6 +207,9 @@ build_tgsi_intrinsic_nomem(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data);
 
-
+LLVMValueRef
+radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base,
+			      LLVMValueRef ptr,
+			      LLVMValueRef ptr2);
 
 #endif /* RADEON_LLVM_H */
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index c8c980d9d32..444a41c01da 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -116,6 +116,28 @@ emit_fetch(
 	enum tgsi_opcode_type type,
 	unsigned swizzle);
 
+LLVMValueRef
+radeon_llvm_emit_fetch_double(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef ptr,
+	LLVMValueRef ptr2)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef result;
+
+	result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2));
+
+	result = LLVMBuildInsertElement(builder,
+					result,
+					bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr),
+					bld_base->int_bld.zero, "");
+	result = LLVMBuildInsertElement(builder,
+					result,
+					bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr2),
+					bld_base->int_bld.one, "");
+	return bitcast(bld_base, TGSI_TYPE_DOUBLE, result);
+}
+
 static LLVMValueRef
 emit_array_fetch(
 	struct lp_build_tgsi_context *bld_base,
@@ -160,7 +182,7 @@ emit_fetch(
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
 	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	LLVMValueRef result = NULL, ptr;
+	LLVMValueRef result = NULL, ptr, ptr2;
 
 	if (swizzle == ~0) {
 		LLVMValueRef values[TGSI_NUM_CHANNELS];
@@ -184,11 +206,27 @@ emit_fetch(
 	switch(reg->Register.File) {
 	case TGSI_FILE_IMMEDIATE: {
 		LLVMTypeRef ctype = tgsi2llvmtype(bld_base, type);
-		return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
+		if (type == TGSI_TYPE_DOUBLE) {
+			result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2));
+			result = LLVMConstInsertElement(result,
+							bld->immediates[reg->Register.Index][swizzle],
+							bld_base->int_bld.zero);
+			result = LLVMConstInsertElement(result,
+							bld->immediates[reg->Register.Index][swizzle + 1],
+							bld_base->int_bld.one);
+			return LLVMConstBitCast(result, ctype);
+		} else {
+			return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
+		}
 	}
 
 	case TGSI_FILE_INPUT:
 		result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr = result;
+			ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
+			return radeon_llvm_emit_fetch_double(bld_base, ptr, ptr2);
+		}
 		break;
 
 	case TGSI_FILE_TEMPORARY:
@@ -199,11 +237,23 @@ emit_fetch(
 			break;
 		}
 		ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle];
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1];
+			return radeon_llvm_emit_fetch_double(bld_base,
+						 LLVMBuildLoad(builder, ptr, ""),
+						 LLVMBuildLoad(builder, ptr2, ""));
+		}
 		result = LLVMBuildLoad(builder, ptr, "");
 		break;
 
 	case TGSI_FILE_OUTPUT:
 		ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle);
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr2 = lp_get_output_ptr(bld, reg->Register.Index, swizzle + 1);
+			return radeon_llvm_emit_fetch_double(bld_base,
+						 LLVMBuildLoad(builder, ptr, ""),
+						 LLVMBuildLoad(builder, ptr2, ""));
+		}
 		result = LLVMBuildLoad(builder, ptr, "");
 		break;
 
@@ -348,9 +398,10 @@ emit_store(
 	struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-	LLVMValueRef temp_ptr;
+	LLVMValueRef temp_ptr, temp_ptr2 = NULL;
 	unsigned chan, chan_index;
 	boolean is_vec_store = FALSE;
+	enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
 
 	if (dst[0]) {
 		LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0]));
@@ -371,6 +422,8 @@ emit_store(
 	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
 		LLVMValueRef value = dst[chan_index];
 
+		if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+			continue;
 		if (inst->Instruction.Saturate)
 			value = radeon_llvm_saturate(bld_base, value);
 
@@ -379,8 +432,9 @@ emit_store(
 			LLVMBuildStore(builder, value, temp_ptr);
 			continue;
 		}
-	
-		value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+
+		if (dtype != TGSI_TYPE_DOUBLE)
+			value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
 
 		if (reg->Register.Indirect) {
 			struct tgsi_declaration_range range = get_array_range(bld_base,
@@ -418,6 +472,8 @@ emit_store(
 			switch(reg->Register.File) {
 			case TGSI_FILE_OUTPUT:
 				temp_ptr = bld->outputs[reg->Register.Index][chan_index];
+				if (dtype == TGSI_TYPE_DOUBLE)
+					temp_ptr2 = bld->outputs[reg->Register.Index][chan_index + 1];
 				break;
 
 			case TGSI_FILE_TEMPORARY:
@@ -428,12 +484,28 @@ emit_store(
 					break;
 				}
 				temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
+				if (dtype == TGSI_TYPE_DOUBLE)
+					temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
+
 				break;
 
 			default:
 				return;
 			}
-			LLVMBuildStore(builder, value, temp_ptr);
+			if (dtype != TGSI_TYPE_DOUBLE)
+				LLVMBuildStore(builder, value, temp_ptr);
+			else {
+				LLVMValueRef ptr = LLVMBuildBitCast(builder, value,
+								    LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), 2), "");
+				LLVMValueRef val2;
+				value = LLVMBuildExtractElement(builder, ptr,
+								bld_base->uint_bld.zero, "");
+				val2 = LLVMBuildExtractElement(builder, ptr,
+								bld_base->uint_bld.one, "");
+
+				LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, value), temp_ptr);
+				LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, val2), temp_ptr2);
+			}
 		}
 	}
 }
@@ -996,6 +1068,35 @@ static void emit_fcmp(
 	emit_data->output[emit_data->chan] = v;
 }
 
+static void emit_dcmp(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMContextRef context = bld_base->base.gallivm->context;
+	LLVMRealPredicate pred;
+
+	/* Use ordered for everything but NE (which is usual for
+	 * float comparisons)
+	 */
+	switch (emit_data->inst->Instruction.Opcode) {
+	case TGSI_OPCODE_DSEQ: pred = LLVMRealOEQ; break;
+	case TGSI_OPCODE_DSGE: pred = LLVMRealOGE; break;
+	case TGSI_OPCODE_DSLT: pred = LLVMRealOLT; break;
+	case TGSI_OPCODE_DSNE: pred = LLVMRealUNE; break;
+	default: assert(!"unknown instruction"); pred = 0; break;
+	}
+
+	LLVMValueRef v = LLVMBuildFCmp(builder, pred,
+			emit_data->args[0], emit_data->args[1],"");
+
+	v = LLVMBuildSExtOrBitCast(builder, v,
+			LLVMInt32TypeInContext(context), "");
+
+	emit_data->output[emit_data->chan] = v;
+}
+
 static void emit_not(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1161,6 +1262,16 @@ static void emit_ineg(
 			emit_data->args[0], "");
 }
 
+static void emit_dneg(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	emit_data->output[emit_data->chan] = LLVMBuildFNeg(builder,
+			emit_data->args[0], "");
+}
+
 static void emit_f2i(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1423,6 +1534,12 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	lp_build_context_init(&bld_base->base, &ctx->gallivm, type);
 	lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type));
 	lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type));
+	{
+		struct lp_type dbl_type;
+		dbl_type = type;
+		dbl_type.width *= 2;
+		lp_build_context_init(&ctx->soa.bld_base.dbl_bld, &ctx->gallivm, dbl_type);
+	}
 
 	bld_base->soa = 1;
 	bld_base->emit_store = emit_store;
@@ -1461,10 +1578,24 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32";
+	bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "fabs";
+	bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64";
+	bld_base->op_actions[TGSI_OPCODE_DFRAC].intr_name = "llvm.AMDIL.fraction.";
+	bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg;
+	bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
 	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
 	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64";
+	bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64";
 	bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 13b67d210fd..a9dce2cdd32 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -451,6 +451,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_DOUBLES:
+		return HAVE_LLVM >= 0x0307;
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 		return 0;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 753b238e2c0..75a29aeebc9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -703,8 +703,15 @@ static LLVMValueRef fetch_constant(
 	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
 	idx = reg->Register.Index * 4 + swizzle;
 
-	if (!reg->Register.Indirect)
-		return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
+	if (!reg->Register.Indirect) {
+		if (type != TGSI_TYPE_DOUBLE)
+			return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
+		else {
+			return radeon_llvm_emit_fetch_double(bld_base,
+							     si_shader_ctx->constants[buf][idx],
+							     si_shader_ctx->constants[buf][idx + 1]);
+		}
+	}
 
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -713,9 +720,25 @@ static LLVMValueRef fetch_constant(
 			    lp_build_const_int32(base->gallivm, idx * 4));
 
 	result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
-			    addr, base->elem_type);
+				   addr, bld_base->base.elem_type);
 
-	return bitcast(bld_base, type, result);
+	if (type != TGSI_TYPE_DOUBLE)
+		result = bitcast(bld_base, type, result);
+	else {
+		LLVMValueRef addr2, result2;
+		addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
+		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
+		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
+		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
+				     lp_build_const_int32(base->gallivm, idx * 4));
+
+		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+				   addr2, bld_base->base.elem_type);
+
+		result = radeon_llvm_emit_fetch_double(bld_base,
+					               result, result2);
+	}
+	return result;
 }
 
 /* Initialize arguments for the shader export intrinsic */

From de5c2b6f2b53924bceab6f4b8255d8e9dcad21b4 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 13 Jul 2015 09:11:20 +0100
Subject: [PATCH 0280/1208] radeonsi: direct emit intrinsic for DFRAC.

Michel reported this still failed, and this fixed it

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 444a41c01da..7c0318152d1 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1582,6 +1582,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "fabs";
 	bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64";
+	bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DFRAC].intr_name = "llvm.AMDIL.fraction.";
 	bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg;
 	bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp;

From 69a1b9959e59653da262185c4e2cf57d24939b19 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 12:36:45 +0100
Subject: [PATCH 0281/1208] pipe-loader: drop support for non-render node
 devices

Render nodes have been around for quite some time. Removing support via
the master/primary node allows us to clean up the conditional
compilation and simplify the build greatly.

For example currently we the pipe-loader, which explicitly links against
xcb and friends (for X auth) if found at compile-time. That
would cause problems as one will be forced to use X/xcb, even if it's a
headless system that is used for opencl.

v2: Clarify the linking topic in the commit message.

Cc: Tom Stellard <thomas.stellard@amd.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../auxiliary/pipe-loader/pipe_loader_drm.c   | 60 +------------------
 1 file changed, 1 insertion(+), 59 deletions(-)

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index ffeb29906b5..009e1dfdf6f 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -168,14 +168,6 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
    return false;
 }
 
-static int
-open_drm_minor(int minor)
-{
-   char path[PATH_MAX];
-   snprintf(path, sizeof(path), DRM_DEV_NAME, DRM_DIR_NAME, minor);
-   return open(path, O_RDWR, 0);
-}
-
 static int
 open_drm_render_node_minor(int minor)
 {
@@ -188,15 +180,8 @@ open_drm_render_node_minor(int minor)
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
 {
-   int i, k, fd, num_render_node_devs;
-   int j = 0;
+   int i, j, fd;
 
-   struct {
-      unsigned vendor_id;
-      unsigned chip_id;
-   } render_node_devs[DRM_RENDER_NODE_MAX_NODES];
-
-   /* Look for render nodes first */
    for (i = DRM_RENDER_NODE_MIN_MINOR, j = 0;
         i <= DRM_RENDER_NODE_MAX_MINOR; i++) {
       fd = open_drm_render_node_minor(i);
@@ -209,9 +194,6 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
          continue;
       }
 
-      render_node_devs[j].vendor_id = dev->u.pci.vendor_id;
-      render_node_devs[j].chip_id = dev->u.pci.chip_id;
-
       if (j < ndev) {
          devs[j] = dev;
       } else {
@@ -221,46 +203,6 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
       j++;
    }
 
-   num_render_node_devs = j;
-
-   /* Next look for drm devices. */
-   for (i = 0; i < DRM_MAX_MINOR; i++) {
-      struct pipe_loader_device *dev;
-      boolean duplicate = FALSE;
-      fd = open_drm_minor(i);
-      if (fd < 0)
-         continue;
-
-      if (!pipe_loader_drm_probe_fd(&dev, fd, true)) {
-         close(fd);
-         continue;
-      }
-
-      /* Check to make sure we aren't already accessing this device via
-       * render nodes.
-       */
-      for (k = 0; k < num_render_node_devs; k++) {
-         if (dev->u.pci.vendor_id == render_node_devs[k].vendor_id &&
-             dev->u.pci.chip_id == render_node_devs[k].chip_id) {
-            close(fd);
-            dev->ops->release(&dev);
-            duplicate = TRUE;
-            break;
-         }
-      }
-
-      if (duplicate)
-         continue;
-
-      if (j < ndev) {
-         devs[j] = dev;
-      } else {
-         dev->ops->release(&dev);
-      }
-
-      j++;
-   }
-
    return j;
 }
 

From a27ec5dc460b91dc44675f48cddbbb2631ee824f Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 30 Jun 2015 15:53:27 +0100
Subject: [PATCH 0282/1208] pipe-loader: simplify pipe_loader_drm_probe

Do not iterate and (attempt to) open the render device, if we're over
the requested number of devices.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 009e1dfdf6f..1ff5f3e8df8 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -180,12 +180,13 @@ open_drm_render_node_minor(int minor)
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
 {
+   struct pipe_loader_device *dev;
    int i, j, fd;
 
    for (i = DRM_RENDER_NODE_MIN_MINOR, j = 0;
-        i <= DRM_RENDER_NODE_MAX_MINOR; i++) {
+        i <= DRM_RENDER_NODE_MAX_MINOR && j < ndev; i++) {
+
       fd = open_drm_render_node_minor(i);
-      struct pipe_loader_device *dev;
       if (fd < 0)
          continue;
 
@@ -194,13 +195,7 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
          continue;
       }
 
-      if (j < ndev) {
-         devs[j] = dev;
-      } else {
-         close(fd);
-         dev->ops->release(&dev);
-      }
-      j++;
+      devs[j++] = dev;
    }
 
    return j;

From 0959d7312d37dd9841cbf7a53cb40b3cfa6e5fc9 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 12:44:44 +0100
Subject: [PATCH 0283/1208] pipe-loader: remove pipe_loader_drm_probe_fd()
 x_auth argument

No longer used by anyone, as of last commit.

Cc: Tom Stellard <thomas.stellard@amd.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../auxiliary/pipe-loader/pipe_loader.h       |  6 +-
 .../auxiliary/pipe-loader/pipe_loader_drm.c   | 83 +------------------
 src/gallium/auxiliary/vl/vl_winsys_dri.c      |  2 +-
 src/gallium/state_trackers/dri/dri2.c         |  2 +-
 src/gallium/state_trackers/xa/xa_tracker.c    |  2 +-
 src/gallium/targets/d3dadapter9/drm.c         |  2 +-
 6 files changed, 7 insertions(+), 90 deletions(-)

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
index 9f43f17a6e2..95cbc262e57 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -195,13 +195,9 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
  * This function is platform-specific.
  *
  * \sa pipe_loader_probe
- *
- * \param auth_x If true, the pipe-loader will attempt to
- *               authenticate with the X server.
  */
 bool
-pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
-                         boolean auth_x);
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd);
 
 #endif
 
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 1ff5f3e8df8..05f7c57c904 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -35,12 +35,6 @@
 #include <xf86drm.h>
 #include <unistd.h>
 
-#ifdef HAVE_PIPE_LOADER_XCB
-
-#include <xcb/dri2.h>
-
-#endif
-
 #include "loader.h"
 #include "state_tracker/drm_driver.h"
 #include "pipe_loader_priv.h"
@@ -64,78 +58,8 @@ struct pipe_loader_drm_device {
 
 static struct pipe_loader_ops pipe_loader_drm_ops;
 
-#ifdef HAVE_PIPE_LOADER_XCB
-
-static xcb_screen_t *
-get_xcb_screen(xcb_screen_iterator_t iter, int screen)
-{
-    for (; iter.rem; --screen, xcb_screen_next(&iter))
-        if (screen == 0)
-            return iter.data;
-
-    return NULL;
-}
-
-#endif
-
-static void
-pipe_loader_drm_x_auth(int fd)
-{
-#ifdef HAVE_PIPE_LOADER_XCB
-   /* Try authenticate with the X server to give us access to devices that X
-    * is running on. */
-   xcb_connection_t *xcb_conn;
-   const xcb_setup_t *xcb_setup;
-   xcb_screen_iterator_t s;
-   xcb_dri2_connect_cookie_t connect_cookie;
-   xcb_dri2_connect_reply_t *connect;
-   drm_magic_t magic;
-   xcb_dri2_authenticate_cookie_t authenticate_cookie;
-   xcb_dri2_authenticate_reply_t *authenticate;
-   int screen;
-
-   xcb_conn = xcb_connect(NULL, &screen);
-
-   if(!xcb_conn)
-      return;
-
-   xcb_setup = xcb_get_setup(xcb_conn);
-
-  if (!xcb_setup)
-    goto disconnect;
-
-   s = xcb_setup_roots_iterator(xcb_setup);
-   connect_cookie = xcb_dri2_connect_unchecked(xcb_conn,
-                                               get_xcb_screen(s, screen)->root,
-                                               XCB_DRI2_DRIVER_TYPE_DRI);
-   connect = xcb_dri2_connect_reply(xcb_conn, connect_cookie, NULL);
-
-   if (!connect || connect->driver_name_length
-                   + connect->device_name_length == 0) {
-
-      goto disconnect;
-   }
-
-   if (drmGetMagic(fd, &magic))
-      goto disconnect;
-
-   authenticate_cookie = xcb_dri2_authenticate_unchecked(xcb_conn,
-                                                         s.data->root,
-                                                         magic);
-   authenticate = xcb_dri2_authenticate_reply(xcb_conn,
-                                              authenticate_cookie,
-                                              NULL);
-   FREE(authenticate);
-
-disconnect:
-   xcb_disconnect(xcb_conn);
-
-#endif
-}
-
 bool
-pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
-                         boolean auth_x)
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd)
 {
    struct pipe_loader_drm_device *ddev = CALLOC_STRUCT(pipe_loader_drm_device);
    int vendor_id, chip_id;
@@ -153,9 +77,6 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
    ddev->base.ops = &pipe_loader_drm_ops;
    ddev->fd = fd;
 
-   if (auth_x)
-      pipe_loader_drm_x_auth(fd);
-
    ddev->base.driver_name = loader_get_driver_for_fd(fd, _LOADER_GALLIUM);
    if (!ddev->base.driver_name)
       goto fail;
@@ -190,7 +111,7 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
       if (fd < 0)
          continue;
 
-      if (!pipe_loader_drm_probe_fd(&dev, fd, false)) {
+      if (!pipe_loader_drm_probe_fd(&dev, fd)) {
          close(fd);
          continue;
       }
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index 7e61b88e6b5..8e39569f515 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -379,7 +379,7 @@ vl_screen_create(Display *display, int screen)
 #if GALLIUM_STATIC_TARGETS
    scrn->base.pscreen = dd_create_screen(fd);
 #else
-   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd, false))
+   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
       scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev, PIPE_SEARCH_DIR);
 #endif // GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 1eda036750e..91b443147d6 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -1460,7 +1460,7 @@ dri2_init_screen(__DRIscreen * sPriv)
    throttle_ret = dd_configuration(DRM_CONF_THROTTLE);
    dmabuf_ret = dd_configuration(DRM_CONF_SHARE_FD);
 #else
-   if (pipe_loader_drm_probe_fd(&screen->dev, screen->fd, false)) {
+   if (pipe_loader_drm_probe_fd(&screen->dev, screen->fd)) {
       pscreen = pipe_loader_create_screen(screen->dev, PIPE_SEARCH_DIR);
 
       throttle_ret = pipe_loader_configuration(screen->dev, DRM_CONF_THROTTLE);
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index 59e8108b1b5..21ca57ca633 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -153,7 +153,7 @@ xa_tracker_create(int drm_fd)
     loader_fd = dup(drm_fd);
     if (loader_fd == -1)
         return NULL;
-    if (pipe_loader_drm_probe_fd(&xa->dev, loader_fd, false))
+    if (pipe_loader_drm_probe_fd(&xa->dev, loader_fd))
 	xa->screen = pipe_loader_create_screen(xa->dev, PIPE_SEARCH_DIR);
 #endif
     if (!xa->screen)
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 6342ab801a9..cc660606c1f 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -243,7 +243,7 @@ drm_create_adapter( int fd,
     ctx->base.hal = dd_create_screen(fd);
 #else
     /* use pipe-loader to dlopen appropriate drm driver */
-    if (!pipe_loader_drm_probe_fd(&ctx->dev, fd, FALSE)) {
+    if (!pipe_loader_drm_probe_fd(&ctx->dev, fd)) {
         ERR("Failed to probe drm fd %d.\n", fd);
         FREE(ctx);
         close(fd);

From abc20120e4aa5a3782f40f7d4a7c6a4f953fca9c Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 13:02:21 +0100
Subject: [PATCH 0284/1208] automake: pipe-loader: remove the 'client'
 pipe-loader

Was only around as opencl's pipe-loader wanted to link against xcb in
some cases.

Cc: Rob Clark <robclark@freedesktop.org>
Cc: Tom Stellard <thomas.stellard@amd.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 configure.ac                                  | 11 -----
 src/gallium/auxiliary/pipe-loader/Makefile.am | 42 ++++++++-----------
 src/gallium/targets/opencl/Makefile.am        |  4 +-
 src/gallium/tests/trivial/Makefile.am         |  4 +-
 4 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/configure.ac b/configure.ac
index d819beffa50..6926b6f986d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2242,21 +2242,10 @@ if test "x$enable_gallium_loader" = xyes; then
 
     if test "x$enable_gallium_drm_loader" = xyes; then
         GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_DRM"
-        PKG_CHECK_MODULES([GALLIUM_PIPE_LOADER_XCB], [xcb xcb-dri2],
-                          pipe_loader_have_xcb=yes, pipe_loader_have_xcb=no)
-        if test "x$pipe_loader_have_xcb" = xyes; then
-            GALLIUM_PIPE_LOADER_CLIENT_DEFINES="$GALLIUM_PIPE_LOADER_CLIENT_DEFINES -DHAVE_PIPE_LOADER_XCB"
-            GALLIUM_PIPE_LOADER_CLIENT_LIBS="$GALLIUM_PIPE_LOADER_CLIENT_LIBS $GALLIUM_PIPE_LOADER_XCB_LIBS $LIBDRM_LIBS"
-        fi
     fi
 
-    GALLIUM_PIPE_LOADER_CLIENT_DEFINES="$GALLIUM_PIPE_LOADER_CLIENT_DEFINES $GALLIUM_PIPE_LOADER_DEFINES"
-    GALLIUM_PIPE_LOADER_CLIENT_LIBS="$GALLIUM_PIPE_LOADER_CLIENT_LIBS $GALLIUM_PIPE_LOADER_LIBS"
-
     AC_SUBST([GALLIUM_PIPE_LOADER_DEFINES])
     AC_SUBST([GALLIUM_PIPE_LOADER_LIBS])
-    AC_SUBST([GALLIUM_PIPE_LOADER_CLIENT_DEFINES])
-    AC_SUBST([GALLIUM_PIPE_LOADER_CLIENT_LIBS])
 fi
 
 AM_CONDITIONAL(HAVE_I915_DRI, test x$HAVE_I915_DRI = xyes)
diff --git a/src/gallium/auxiliary/pipe-loader/Makefile.am b/src/gallium/auxiliary/pipe-loader/Makefile.am
index cb6035d85c9..bcd49b5fce5 100644
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -1,37 +1,31 @@
 include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
 
-AM_CPPFLAGS = $(DEFINES) \
-	$(VISIBILITY_CFLAGS) \
-	-I$(top_srcdir)/include \
-	-I$(top_srcdir)/src \
+# XXX: check if we need the gallium/winsys include
+AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/gallium/include \
-	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/gallium/winsys
+	-I$(top_srcdir)/src/gallium/winsys \
+	$(GALLIUM_PIPE_LOADER_DEFINES) \
+	$(GALLIUM_CFLAGS) \
+	$(VISIBILITY_CFLAGS)
 
 noinst_LTLIBRARIES = libpipe_loader.la
-noinst_LTLIBRARIES += libpipe_loader_client.la
+
+libpipe_loader_la_SOURCES = \
+	$(COMMON_SOURCES)
+
+libpipe_loader_la_LIBADD  = $(COMMON_LIBADD) \
+	$(GALLIUM_PIPE_LOADER_LIBS)
 
 if HAVE_DRM_LOADER_GALLIUM
-AM_CFLAGS = $(LIBDRM_CFLAGS)
+AM_CFLAGS += \
+	$(LIBDRM_CFLAGS)
 
-COMMON_SOURCES += $(DRM_SOURCES)
+libpipe_loader_la_SOURCES += \
+	$(DRM_SOURCES)
 
-COMMON_LIBADD = \
+libpipe_loader_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la
 
 endif
 
-libpipe_loader_la_CFLAGS  = \
-	$(GALLIUM_PIPE_LOADER_DEFINES) \
-	$(AM_CFLAGS) $(AM_CPPFLAGS)
-libpipe_loader_la_SOURCES = $(COMMON_SOURCES)
-libpipe_loader_la_LIBADD  = $(COMMON_LIBADD) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
-
-libpipe_loader_client_la_CFLAGS  = \
-	$(GALLIUM_PIPE_LOADER_CLIENT_DEFINES) \
-	$(AM_CFLAGS) $(AM_CPPFLAGS)
-libpipe_loader_client_la_SOURCES = $(COMMON_SOURCES)
-libpipe_loader_client_la_LIBADD  = $(COMMON_LIBADD) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS)
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 70e60e20052..6ea04d4ce5b 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -15,12 +15,12 @@ lib@OPENCL_LIBNAME@_la_LDFLAGS += \
 endif
 
 lib@OPENCL_LIBNAME@_la_LIBADD = \
-	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_client.la \
+	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
 	$(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS) \
+	$(GALLIUM_PIPE_LOADER_LIBS) \
 	$(ELF_LIB) \
 	-ldl \
 	-lclangCodeGen \
diff --git a/src/gallium/tests/trivial/Makefile.am b/src/gallium/tests/trivial/Makefile.am
index fcd240e85bb..82cab5ee210 100644
--- a/src/gallium/tests/trivial/Makefile.am
+++ b/src/gallium/tests/trivial/Makefile.am
@@ -12,11 +12,11 @@ AM_CPPFLAGS = \
 	$(GALLIUM_PIPE_LOADER_DEFINES)
 
 LDADD = \
-	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_client.la \
+	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS) \
+	$(GALLIUM_PIPE_LOADER_LIBS) \
 	$(GALLIUM_COMMON_LIB_DEPS)
 
 noinst_PROGRAMS = compute tri quad-tex

From c73d30dfe90d9aa096fc64024612a6543bd748c7 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 13:08:06 +0100
Subject: [PATCH 0285/1208] automake: remove empty GALLIUM_PIPE_LOADER_LIBS

Cc: Rob Clark <robclark@freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 configure.ac                                  | 1 -
 src/gallium/auxiliary/pipe-loader/Makefile.am | 3 ---
 src/gallium/targets/d3dadapter9/Makefile.am   | 3 +--
 src/gallium/targets/dri/Makefile.am           | 3 +--
 src/gallium/targets/omx/Makefile.am           | 3 +--
 src/gallium/targets/opencl/Makefile.am        | 1 -
 src/gallium/targets/va/Makefile.am            | 3 +--
 src/gallium/targets/vdpau/Makefile.am         | 3 +--
 src/gallium/targets/xa/Makefile.am            | 3 +--
 src/gallium/targets/xvmc/Makefile.am          | 4 +---
 src/gallium/tests/trivial/Makefile.am         | 1 -
 11 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/configure.ac b/configure.ac
index 6926b6f986d..0d40e2f1a90 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2245,7 +2245,6 @@ if test "x$enable_gallium_loader" = xyes; then
     fi
 
     AC_SUBST([GALLIUM_PIPE_LOADER_DEFINES])
-    AC_SUBST([GALLIUM_PIPE_LOADER_LIBS])
 fi
 
 AM_CONDITIONAL(HAVE_I915_DRI, test x$HAVE_I915_DRI = xyes)
diff --git a/src/gallium/auxiliary/pipe-loader/Makefile.am b/src/gallium/auxiliary/pipe-loader/Makefile.am
index bcd49b5fce5..8c837996539 100644
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -14,9 +14,6 @@ noinst_LTLIBRARIES = libpipe_loader.la
 libpipe_loader_la_SOURCES = \
 	$(COMMON_SOURCES)
 
-libpipe_loader_la_LIBADD  = $(COMMON_LIBADD) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
-
 if HAVE_DRM_LOADER_GALLIUM
 AM_CFLAGS += \
 	$(LIBDRM_CFLAGS)
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index 591978f1f61..fe5b0b11679 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -118,8 +118,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 d3dadapter9_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index 96483964589..7c86ea13652 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -95,8 +95,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 gallium_dri_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/omx/Makefile.am b/src/gallium/targets/omx/Makefile.am
index f52e66946ed..a4dff487dd8 100644
--- a/src/gallium/targets/omx/Makefile.am
+++ b/src/gallium/targets/omx/Makefile.am
@@ -57,8 +57,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libomx_mesa_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 6ea04d4ce5b..441b438d339 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -20,7 +20,6 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS) \
 	$(ELF_LIB) \
 	-ldl \
 	-lclangCodeGen \
diff --git a/src/gallium/targets/va/Makefile.am b/src/gallium/targets/va/Makefile.am
index 57c7e353ae9..9613f041b58 100644
--- a/src/gallium/targets/va/Makefile.am
+++ b/src/gallium/targets/va/Makefile.am
@@ -54,8 +54,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 gallium_drv_video_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/vdpau/Makefile.am b/src/gallium/targets/vdpau/Makefile.am
index 9455fc4cae5..7eb62c1cc78 100644
--- a/src/gallium/targets/vdpau/Makefile.am
+++ b/src/gallium/targets/vdpau/Makefile.am
@@ -66,8 +66,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libvdpau_gallium_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am
index 8ddb9672bd7..92173dedce3 100644
--- a/src/gallium/targets/xa/Makefile.am
+++ b/src/gallium/targets/xa/Makefile.am
@@ -81,8 +81,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libxatracker_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/xvmc/Makefile.am b/src/gallium/targets/xvmc/Makefile.am
index 3c16c8d51eb..b3285890822 100644
--- a/src/gallium/targets/xvmc/Makefile.am
+++ b/src/gallium/targets/xvmc/Makefile.am
@@ -52,11 +52,9 @@ libXvMCgallium_la_LIBADD += $(TARGET_LIB_DEPS) \
 	$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
 
 else # HAVE_GALLIUM_STATIC_TARGETS
-# XXX: Use the pipe-loader-client over pipe-loader ?
 libXvMCgallium_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/tests/trivial/Makefile.am b/src/gallium/tests/trivial/Makefile.am
index 82cab5ee210..56b7f3ffc66 100644
--- a/src/gallium/tests/trivial/Makefile.am
+++ b/src/gallium/tests/trivial/Makefile.am
@@ -16,7 +16,6 @@ LDADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS) \
 	$(GALLIUM_COMMON_LIB_DEPS)
 
 noinst_PROGRAMS = compute tri quad-tex

From 132031b110a9fd652f3c9d5727502134ef9c22c1 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 13:28:33 +0100
Subject: [PATCH 0286/1208] pipe-loader: remove pipe_loader_sw_probe_xlib

It was only useful for st/egl, although I've never got to merging the
pipe-loader and inline-helpers before it was removed. There are no users
for it ATM.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 configure.ac                                  | 13 ++-------
 src/gallium/Automake.inc                      |  7 -----
 .../auxiliary/pipe-loader/pipe_loader.h       | 19 -------------
 .../auxiliary/pipe-loader/pipe_loader_sw.c    | 27 -------------------
 4 files changed, 2 insertions(+), 64 deletions(-)

diff --git a/configure.ac b/configure.ac
index 0d40e2f1a90..11ce0e53d0c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -953,11 +953,9 @@ dnl
 dnl Driver specific build directories
 dnl
 
-case "x$enable_glx$enable_xlib_glx" in
-xyesyes)
+if test -n "$with_gallium_drivers" -a "x$enable_glx$enable_xlib_glx" = xyesyes; then
     NEED_WINSYS_XLIB="yes"
-    ;;
-esac
+fi
 
 if test "x$enable_dri" = xyes; then
     enable_gallium_loader="$enable_shared_pipe_drivers"
@@ -1806,9 +1804,6 @@ else
     EGL_NATIVE_PLATFORM="_EGL_INVALID_PLATFORM"
 fi
 
-if echo "$egl_platforms" | grep -q 'x11'; then
-    NEED_WINSYS_XLIB=yes
-fi
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_X11, echo "$egl_platforms" | grep -q 'x11')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_WAYLAND, echo "$egl_platforms" | grep -q 'wayland')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_DRM, echo "$egl_platforms" | grep -q 'drm')
@@ -2232,10 +2227,6 @@ AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers"
 #       use by XA tracker in particular, but could be used in any case
 #       where communication with xserver is not desired).
 if test "x$enable_gallium_loader" = xyes; then
-    if test "x$NEED_WINSYS_XLIB" = xyes; then
-        GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_XLIB"
-    fi
-
     if test "x$enable_dri" = xyes; then
         GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_DRI"
     fi
diff --git a/src/gallium/Automake.inc b/src/gallium/Automake.inc
index 95aae50d64b..ee07ab6c8f9 100644
--- a/src/gallium/Automake.inc
+++ b/src/gallium/Automake.inc
@@ -67,10 +67,3 @@ if HAVE_DRISW
 GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
 	$(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la
 endif
-
-if NEED_WINSYS_XLIB
-GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
-	$(top_builddir)/src/gallium/winsys/sw/xlib/libws_xlib.la \
-	-lX11 -lXext -lXfixes \
-	$(LIBDRM_LIBS)
-endif
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
index 95cbc262e57..9b8712666bb 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -36,10 +36,6 @@
 #include "pipe/p_compiler.h"
 #include "state_tracker/drm_driver.h"
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-#include <X11/Xlib.h>
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -116,21 +112,6 @@ pipe_loader_configuration(struct pipe_loader_device *dev,
 void
 pipe_loader_release(struct pipe_loader_device **devs, int ndev);
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-
-/**
- * Initialize Xlib for an associated display.
- *
- * This function is platform-specific.
- *
- * \sa pipe_loader_probe
- */
-bool
-pipe_loader_sw_probe_xlib(struct pipe_loader_device **devs, Display *display);
-
-#endif
-
-
 #ifdef HAVE_PIPE_LOADER_DRI
 
 /**
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
index 3d332645231..6794930193d 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -32,10 +32,6 @@
 #include "sw/dri/dri_sw_winsys.h"
 #include "sw/null/null_sw_winsys.h"
 #include "sw/wrapper/wrapper_sw_winsys.h"
-#ifdef HAVE_PIPE_LOADER_XLIB
-/* Explicitly wrap the header to ease build without X11 headers */
-#include "sw/xlib/xlib_sw_winsys.h"
-#endif
 #include "target-helpers/inline_sw_helper.h"
 #include "state_tracker/drisw_api.h"
 
@@ -53,29 +49,6 @@ static struct sw_winsys *(*backends[])() = {
    null_sw_create
 };
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-bool
-pipe_loader_sw_probe_xlib(struct pipe_loader_device **devs, Display *display)
-{
-   struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
-
-   if (!sdev)
-      return false;
-
-   sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
-   sdev->base.driver_name = "swrast";
-   sdev->base.ops = &pipe_loader_sw_ops;
-   sdev->ws = xlib_create_sw_winsys(display);
-   if (!sdev->ws) {
-      FREE(sdev);
-      return false;
-   }
-   *devs = &sdev->base;
-
-   return true;
-}
-#endif
-
 #ifdef HAVE_PIPE_LOADER_DRI
 bool
 pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_funcs *drisw_lf)

From cc32d25454c382a971e81ae584a4296fdf492e70 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 14:01:39 +0100
Subject: [PATCH 0287/1208] pipe-loader: use loader_open_device() rather than
 open()

The former handles O_CLOEXEC (and the lack of it) appropriately.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 05f7c57c904..cc6d74a964b 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -95,7 +95,7 @@ open_drm_render_node_minor(int minor)
    char path[PATH_MAX];
    snprintf(path, sizeof(path), DRM_RENDER_NODE_DEV_NAME_FORMAT, DRM_DIR_NAME,
             minor);
-   return open(path, O_RDWR, 0);
+   return loader_open_device(path);
 }
 
 int

From dd50ccf0f4dcba2fd586d5b5c58750259e29c357 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 29 Jun 2015 14:03:22 +0100
Subject: [PATCH 0288/1208] auxiliary/vl: use loader_open_device() over open()

The former handles O_CLOEXEC (and the lack of it) appropriately.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/auxiliary/Makefile.am        | 1 +
 src/gallium/auxiliary/vl/vl_winsys_dri.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index ab91062ef1b..04f77d002c8 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -8,6 +8,7 @@ include $(top_srcdir)/src/gallium/Automake.inc
 noinst_LTLIBRARIES = libgallium.la
 
 AM_CFLAGS = \
+	-I$(top_srcdir)/src/loader \
 	-I$(top_builddir)/src/glsl/nir \
 	-I$(top_srcdir)/src/gallium/auxiliary/util \
 	$(GALLIUM_CFLAGS) \
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index 8e39569f515..b445f68a1c7 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -37,6 +37,8 @@
 #include <xf86drm.h>
 #include <errno.h>
 
+#include "loader.h"
+
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
@@ -361,7 +363,7 @@ vl_screen_create(Display *display, int screen)
    if (!device_name)
       goto free_connect;
    memcpy(device_name, xcb_dri2_connect_device_name(connect), device_name_length);
-   fd = open(device_name, O_RDWR);
+   fd = loader_open_device(device_name);
    free(device_name);
 
    if (fd < 0)

From 9027d53b2a00b3073f904cc3cb995e8953e41036 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 30 Jun 2015 14:37:19 +0100
Subject: [PATCH 0289/1208] radeonsi: directly include radeon/* headers

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/drivers/radeonsi/cik_sdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 86111cb86e8..47b586f171e 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -27,7 +27,7 @@
 
 #include "sid.h"
 #include "si_pipe.h"
-#include "../radeon/r600_cs.h"
+#include "radeon/r600_cs.h"
 
 #include "util/u_format.h"
 

From c505064b2cea14c9da115a26e9326b9c0c7dca3b Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 20:01:39 +0100
Subject: [PATCH 0290/1208] bugzilla_mesa.sh: sort the bugs list by number

v2: Use change sed/sort based on Ilia's suggestion.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 bin/bugzilla_mesa.sh | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/bin/bugzilla_mesa.sh b/bin/bugzilla_mesa.sh
index 491ca0e7c0b..0cff4261f75 100755
--- a/bin/bugzilla_mesa.sh
+++ b/bin/bugzilla_mesa.sh
@@ -15,17 +15,14 @@
 # $ DRYRUN=yes bin/bugzilla_mesa.sh mesa-9.0.2..mesa-9.0.3 | wc -l
 
 
-# regex pattern: trim before url
-trim_before='s/.*\(http\)/\1/'
+# regex pattern: trim before bug number
+trim_before='s/.*show_bug.cgi?id=\([0-9]*\).*/\1/'
 
-# regex pattern: trim after url
-trim_after='s/\(show_bug.cgi?id=[0-9]*\).*/\1/'
-
-# regex pattern: always use https
-use_https='s/http:/https:/'
+# regex pattern: reconstruct the url
+use_after='s,^,https://bugs.freedesktop.org/show_bug.cgi?id=,'
 
 # extract fdo urls from commit log
-urls=$(git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before -e $trim_after -e $use_https | sort | uniq)
+urls=$(git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after)
 
 # if DRYRUN is set to "yes", simply print the URLs and don't fetch the
 # details from fdo bugzilla.

From a025e539e430b7bbfae9b786bd79d0d608f1acf8 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 6 Jul 2015 09:42:01 +0100
Subject: [PATCH 0291/1208] i965: bump libdrm requirement to 2.4.61 and drop
 in-tree workaround
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 configure.ac                             | 2 +-
 src/mesa/drivers/dri/i965/intel_screen.c | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index 11ce0e53d0c..bdfd13417e0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.38
 LIBDRM_RADEON_REQUIRED=2.4.56
-LIBDRM_INTEL_REQUIRED=2.4.60
+LIBDRM_INTEL_REQUIRED=2.4.61
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED=2.4.62
 LIBDRM_FREEDRENO_REQUIRED=2.4.57
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index c0f5c927ed8..fd343eeb4e1 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1330,11 +1330,6 @@ set_max_gl_versions(struct intel_screen *screen)
    }
 }
 
-/* drop when libdrm 2.4.61 is released */
-#ifndef I915_PARAM_REVISION
-#define I915_PARAM_REVISION 32
-#endif
-
 static int
 brw_get_revision(int fd)
 {

From 82b9b2e523ad53f54d5620f47f7aea4f11397b81 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 14:13:33 +0100
Subject: [PATCH 0292/1208] i915: remove unused driFd variable

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i915/intel_context.c | 1 -
 src/mesa/drivers/dri/i915/intel_context.h | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c
index 5618dcd8358..c780103228f 100644
--- a/src/mesa/drivers/dri/i915/intel_context.c
+++ b/src/mesa/drivers/dri/i915/intel_context.c
@@ -428,7 +428,6 @@ intelInitContext(struct intel_context *intel,
 
    driContextPriv->driverPrivate = intel;
    intel->driContext = driContextPriv;
-   intel->driFd = sPriv->fd;
 
    intel->gen = intelScreen->gen;
 
diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h
index 350d35d9033..4ec4015d453 100644
--- a/src/mesa/drivers/dri/i915/intel_context.h
+++ b/src/mesa/drivers/dri/i915/intel_context.h
@@ -273,8 +273,6 @@ struct intel_context
 
    bool use_early_z;
 
-   int driFd;
-
    __DRIcontext *driContext;
    struct intel_screen *intelScreen;
 

From 10a7b579fdc0e3f3b38920ae5c103c058cc63eec Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 14:44:11 +0100
Subject: [PATCH 0293/1208] radeon: remove dri_mirror state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most of the data stored(duplicated) was unused, and for the one that is
follow the approach set by other drivers.
This eliminates the use of legacy (dri1) types.

Cc: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/drivers/dri/radeon/radeon_common.c   | 18 +++++++++---------
 .../dri/radeon/radeon_common_context.c        |  7 ++-----
 .../dri/radeon/radeon_common_context.h        | 19 +++----------------
 src/mesa/drivers/dri/radeon/radeon_texture.c  |  2 +-
 4 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 2a8bd6c9edc..d834d9bde3f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -164,7 +164,7 @@ uint32_t radeonGetAge(radeonContextPtr radeon)
 
 	gp.param = RADEON_PARAM_LAST_CLEAR;
 	gp.value = (int *)&age;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+	ret = drmCommandWriteRead(radeon->radeonScreen->driScreen->fd, DRM_RADEON_GETPARAM,
 				  &gp, sizeof(gp));
 	if (ret) {
 		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __func__,
@@ -358,8 +358,8 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
        * that the front-buffer has actually been allocated.
        */
 		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
-			radeon_update_renderbuffers(radeon->dri.context,
-				radeon->dri.context->driDrawablePriv, GL_FALSE);
+			radeon_update_renderbuffers(radeon->driContext,
+				radeon->driContext->driDrawablePriv, GL_FALSE);
       }
 	}
 
@@ -375,8 +375,8 @@ void radeonReadBuffer( struct gl_context *ctx, GLenum mode )
 					|| (mode == GL_FRONT);
 
 		if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) {
-			radeon_update_renderbuffers(rmesa->dri.context,
-						    rmesa->dri.context->driReadablePriv, GL_FALSE);
+			radeon_update_renderbuffers(rmesa->driContext,
+						    rmesa->driContext->driReadablePriv, GL_FALSE);
 	 	}
 	}
 	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
@@ -399,7 +399,7 @@ void radeon_window_moved(radeonContextPtr radeon)
 void radeon_viewport(struct gl_context *ctx)
 {
 	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	__DRIcontext *driContext = radeon->dri.context;
+	__DRIcontext *driContext = radeon->driContext;
 	void (*old_viewport)(struct gl_context *ctx);
 
 	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
@@ -693,6 +693,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 {
 	GLuint size;
 	struct drm_radeon_gem_info mminfo = { 0 };
+	int fd = rmesa->radeonScreen->driScreen->fd;
 
 	/* Initialize command buffer */
 	size = 256 * driQueryOptioni(&rmesa->optionCache,
@@ -711,8 +712,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 			"Allocating %d bytes command buffer (max state is %d bytes)\n",
 			size * 4, rmesa->hw.max_state_size * 4);
 
-	rmesa->cmdbuf.csm =
-		radeon_cs_manager_gem_ctor(rmesa->radeonScreen->driScreen->fd);
+	rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
 	if (rmesa->cmdbuf.csm == NULL) {
 		/* FIXME: fatal error */
 		return;
@@ -725,7 +725,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 				  (void (*)(void *))rmesa->glCtx.Driver.Flush, &rmesa->glCtx);
 
 
-	if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO,
+	if (!drmCommandWriteRead(fd, DRM_RADEON_GEM_INFO,
 				 &mminfo, sizeof(mminfo))) {
 		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM,
 				    mminfo.vram_visible);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 3d0cedaf33a..4660d98c9a2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -162,10 +162,7 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 	_mesa_meta_init(ctx);
 
 	/* DRI fields */
-	radeon->dri.context = driContextPriv;
-	radeon->dri.screen = sPriv;
-	radeon->dri.fd = sPriv->fd;
-	radeon->dri.drmMinor = sPriv->drm_version.minor;
+	radeon->driContext = driContextPriv;
 
 	/* Setup IRQs */
 	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
@@ -325,7 +322,7 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
  */
 void radeon_prepare_render(radeonContextPtr radeon)
 {
-    __DRIcontext *driContext = radeon->dri.context;
+    __DRIcontext *driContext = radeon->driContext;
     __DRIdrawable *drawable;
     __DRIscreen *screen;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index dc72592b90c..d142a871b40 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -342,17 +342,6 @@ struct radeon_store {
 	int elts_start;
 };
 
-struct radeon_dri_mirror {
-	__DRIcontext *context;	/* DRI context */
-	__DRIscreen *screen;	/* DRI screen */
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int hwLockCount;
-	int fd;
-	int drmMinor;
-};
-
 typedef void (*radeon_tri_func) (radeonContextPtr,
 				 radeonVertex *,
 				 radeonVertex *, radeonVertex *);
@@ -385,6 +374,7 @@ struct radeon_cmdbuf {
 
 struct radeon_context {
    struct gl_context glCtx;             /**< base class, must be first */
+   __DRIcontext *driContext;               /* DRI context */
    radeonScreenPtr radeonScreen;	/* Screen private DRI data */
 
    /* Texture object bookkeeping
@@ -407,9 +397,6 @@ struct radeon_context {
    /* Drawable information */
    unsigned int lastStamp;
 
-   /* Mirrors of some DRI state */
-   struct radeon_dri_mirror dri;
-
    /* Busy waiting */
    GLuint do_usleeps;
    GLuint do_irqs;
@@ -502,12 +489,12 @@ static inline radeonContextPtr RADEON_CONTEXT(struct gl_context *ctx)
 
 static inline __DRIdrawable* radeon_get_drawable(radeonContextPtr radeon)
 {
-	return radeon->dri.context->driDrawablePriv;
+	return radeon->driContext->driDrawablePriv;
 }
 
 static inline __DRIdrawable* radeon_get_readable(radeonContextPtr radeon)
 {
-	return radeon->dri.context->driReadablePriv;
+	return radeon->driContext->driReadablePriv;
 }
 
 extern const char const *radeonVendorString;
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index edfd48b283b..1a178988b68 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -531,7 +531,7 @@ void radeon_image_target_texture_2d(struct gl_context *ctx, GLenum target,
 	__DRIscreen *screen;
 	__DRIimage *image;
 
-	screen = radeon->dri.screen;
+	screen = radeon->radeonScreen->driScreen;
 	image = screen->dri2.image->lookupEGLImage(screen, image_handle,
 						   screen->loaderPrivate);
 	if (image == NULL)

From 7a50bf6c7f7729f5eee3ddf7aa9b38a81873f2c6 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 21:27:13 +0100
Subject: [PATCH 0294/1208] auxiliary/vl: use the correct screen index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inspired (copied) from Marek's commit for egl/x11
commit 0b56e23e7f3(egl/dri2: use the correct screen index)

v2: Fix copy/pasta errors.

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/vl/vl_winsys_dri.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index b445f68a1c7..3b1b87f9523 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -295,6 +295,16 @@ vl_screen_get_private(struct vl_screen *vscreen)
    return vscreen;
 }
 
+static xcb_screen_t *
+get_xcb_screen(xcb_screen_iterator_t iter, int screen)
+{
+    for (; iter.rem; --screen, xcb_screen_next(&iter))
+        if (screen == 0)
+            return iter.data;
+
+    return NULL;
+}
+
 struct vl_screen*
 vl_screen_create(Display *display, int screen)
 {
@@ -336,8 +346,7 @@ vl_screen_create(Display *display, int screen)
       goto free_query;
 
    s = xcb_setup_roots_iterator(xcb_get_setup(scrn->conn));
-   while (screen--)
-	xcb_screen_next(&s);
+
    driverType = XCB_DRI2_DRIVER_TYPE_DRI;
 #ifdef DRI2DriverPrimeShift
    {
@@ -353,7 +362,7 @@ vl_screen_create(Display *display, int screen)
    }
 #endif
 
-   connect_cookie = xcb_dri2_connect_unchecked(scrn->conn, s.data->root, driverType);
+   connect_cookie = xcb_dri2_connect_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, driverType);
    connect = xcb_dri2_connect_reply(scrn->conn, connect_cookie, NULL);
    if (connect == NULL || connect->driver_name_length + connect->device_name_length == 0)
       goto free_connect;
@@ -372,7 +381,7 @@ vl_screen_create(Display *display, int screen)
    if (drmGetMagic(fd, &magic))
       goto free_connect;
 
-   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn, s.data->root, magic);
+   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, magic);
    authenticate = xcb_dri2_authenticate_reply(scrn->conn, authenticate_cookie, NULL);
 
    if (authenticate == NULL || !authenticate->authenticated)

From 431a0658616575953868f6d16bb9641306cceea8 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Tue, 14 Jul 2015 07:37:59 +1000
Subject: [PATCH 0295/1208] glsl: replace some more old hash_table uses

The util/hash_table was intended to be a fast hash table
replacement for the program/hash_table see 35fd61bd99c1 and 72e55bb6888ff.

This change replaces some more uses of the old hash table.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/glsl_types.cpp | 80 ++++++++++++++++++++---------------------
 src/glsl/glsl_types.h   |  2 +-
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 281ff51f3e9..4b344f1eb4a 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -25,7 +25,7 @@
 #include "main/core.h" /* for Elements, MAX2 */
 #include "glsl_parser_extras.h"
 #include "glsl_types.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 
 mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
@@ -329,12 +329,12 @@ _mesa_glsl_release_types(void)
     * necessary.
     */
    if (glsl_type::array_types != NULL) {
-      hash_table_dtor(glsl_type::array_types);
+      _mesa_hash_table_destroy(glsl_type::array_types, NULL);
       glsl_type::array_types = NULL;
    }
 
    if (glsl_type::record_types != NULL) {
-      hash_table_dtor(glsl_type::record_types);
+      _mesa_hash_table_destroy(glsl_type::record_types, NULL);
       glsl_type::record_types = NULL;
    }
 }
@@ -648,27 +648,28 @@ glsl_type::get_array_instance(const glsl_type *base, unsigned array_size)
    mtx_lock(&glsl_type::mutex);
 
    if (array_types == NULL) {
-      array_types = hash_table_ctor(64, hash_table_string_hash,
-				    hash_table_string_compare);
+      array_types = _mesa_hash_table_create(NULL, _mesa_key_hash_string,
+                                            _mesa_key_string_equal);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(array_types, key);
-
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(array_types, key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(base, array_size);
+      const glsl_type *t = new glsl_type(base, array_size);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(array_types, (void *) t, ralloc_strdup(mem_ctx, key));
+      entry = _mesa_hash_table_insert(array_types,
+                                      ralloc_strdup(mem_ctx, key),
+                                      (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_ARRAY);
-   assert(t->length == array_size);
-   assert(t->fields.array == base);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_ARRAY);
+   assert(((glsl_type *) entry->data)->length == array_size);
+   assert(((glsl_type *) entry->data)->fields.array == base);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
 }
 
 
@@ -722,19 +723,13 @@ glsl_type::record_compare(const glsl_type *b) const
 }
 
 
-int
+bool
 glsl_type::record_key_compare(const void *a, const void *b)
 {
    const glsl_type *const key1 = (glsl_type *) a;
    const glsl_type *const key2 = (glsl_type *) b;
 
-   /* Return zero is the types match (there is zero difference) or non-zero
-    * otherwise.
-    */
-   if (strcmp(key1->name, key2->name) != 0)
-      return 1;
-
-   return !key1->record_compare(key2);
+   return strcmp(key1->name, key2->name) == 0 && key1->record_compare(key2);
 }
 
 
@@ -772,25 +767,27 @@ glsl_type::get_record_instance(const glsl_struct_field *fields,
    mtx_lock(&glsl_type::mutex);
 
    if (record_types == NULL) {
-      record_types = hash_table_ctor(64, record_key_hash, record_key_compare);
+      record_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                             record_key_compare);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(record_types, & key);
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(record_types,
+                                                            &key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(fields, num_fields, name);
+      const glsl_type *t = new glsl_type(fields, num_fields, name);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(record_types, (void *) t, t);
+      entry = _mesa_hash_table_insert(record_types, t, (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_STRUCT);
-   assert(t->length == num_fields);
-   assert(strcmp(t->name, name) == 0);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_STRUCT);
+   assert(((glsl_type *) entry->data)->length == num_fields);
+   assert(strcmp(((glsl_type *) entry->data)->name, name) == 0);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
 }
 
 
@@ -805,25 +802,28 @@ glsl_type::get_interface_instance(const glsl_struct_field *fields,
    mtx_lock(&glsl_type::mutex);
 
    if (interface_types == NULL) {
-      interface_types = hash_table_ctor(64, record_key_hash, record_key_compare);
+      interface_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                                record_key_compare);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(interface_types, & key);
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(interface_types,
+                                                            &key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(fields, num_fields, packing, block_name);
+      const glsl_type *t = new glsl_type(fields, num_fields,
+                                         packing, block_name);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(interface_types, (void *) t, t);
+      entry = _mesa_hash_table_insert(interface_types, t, (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_INTERFACE);
-   assert(t->length == num_fields);
-   assert(strcmp(t->name, block_name) == 0);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_INTERFACE);
+   assert(((glsl_type *) entry->data)->length == num_fields);
+   assert(strcmp(((glsl_type *) entry->data)->name, block_name) == 0);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
 }
 
 
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index f54a9393e73..93a1d257ad8 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -688,7 +688,7 @@ private:
    /** Hash table containing the known interface types. */
    static struct hash_table *interface_types;
 
-   static int record_key_compare(const void *a, const void *b);
+   static bool record_key_compare(const void *a, const void *b);
    static unsigned record_key_hash(const void *key);
 
    /**

From 75df8f00192415eb4ad378708ff3745390931b4e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Tue, 14 Jul 2015 07:41:26 +1000
Subject: [PATCH 0296/1208] glsl: free interface_types

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/glsl_types.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 4b344f1eb4a..aaf7c7c48d6 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -337,6 +337,11 @@ _mesa_glsl_release_types(void)
       _mesa_hash_table_destroy(glsl_type::record_types, NULL);
       glsl_type::record_types = NULL;
    }
+
+   if (glsl_type::interface_types != NULL) {
+      _mesa_hash_table_destroy(glsl_type::interface_types, NULL);
+      glsl_type::interface_types = NULL;
+   }
 }
 
 

From 3095ee9b8bd4154cc63b6332c21b16954555e241 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Tue, 17 Mar 2015 12:17:27 +0100
Subject: [PATCH 0297/1208] mesa: define ARB_shader_storage_buffer_object
 extension

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/glcpp/glcpp-parse.y    |  3 ++
 src/glsl/glsl_parser_extras.cpp | 63 +++++++++++++++++----------------
 src/glsl/glsl_parser_extras.h   |  7 ++++
 src/mesa/main/extensions.c      |  1 +
 src/mesa/main/mtypes.h          |  1 +
 5 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index a11b6b2c7c8..ed1bffbde3c 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2483,6 +2483,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
               if (extensions->ARB_shader_precision)
                  add_builtin_define(parser, "GL_ARB_shader_precision", 1);
+
+	      if (extensions->ARB_shader_storage_buffer_object)
+	         add_builtin_define(parser, "GL_ARB_shader_storage_buffer_object", 1);
 	   }
 	}
 
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 046d5d7b5bf..0d7e5215672 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -551,37 +551,38 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
 
    /* ARB extensions go here, sorted alphabetically.
     */
-   EXT(ARB_arrays_of_arrays,           true,  false,     ARB_arrays_of_arrays),
-   EXT(ARB_compute_shader,             true,  false,     ARB_compute_shader),
-   EXT(ARB_conservative_depth,         true,  false,     ARB_conservative_depth),
-   EXT(ARB_derivative_control,         true,  false,     ARB_derivative_control),
-   EXT(ARB_draw_buffers,               true,  false,     dummy_true),
-   EXT(ARB_draw_instanced,             true,  false,     ARB_draw_instanced),
-   EXT(ARB_explicit_attrib_location,   true,  false,     ARB_explicit_attrib_location),
-   EXT(ARB_explicit_uniform_location,  true,  false,     ARB_explicit_uniform_location),
-   EXT(ARB_fragment_coord_conventions, true,  false,     ARB_fragment_coord_conventions),
-   EXT(ARB_fragment_layer_viewport,    true,  false,     ARB_fragment_layer_viewport),
-   EXT(ARB_gpu_shader5,                true,  false,     ARB_gpu_shader5),
-   EXT(ARB_gpu_shader_fp64,            true,  false,     ARB_gpu_shader_fp64),
-   EXT(ARB_sample_shading,             true,  false,     ARB_sample_shading),
-   EXT(ARB_separate_shader_objects,    true,  false,     dummy_true),
-   EXT(ARB_shader_atomic_counters,     true,  false,     ARB_shader_atomic_counters),
-   EXT(ARB_shader_bit_encoding,        true,  false,     ARB_shader_bit_encoding),
-   EXT(ARB_shader_image_load_store,    true,  false,     ARB_shader_image_load_store),
-   EXT(ARB_shader_precision,           true,  false,     ARB_shader_precision),
-   EXT(ARB_shader_stencil_export,      true,  false,     ARB_shader_stencil_export),
-   EXT(ARB_shader_texture_lod,         true,  false,     ARB_shader_texture_lod),
-   EXT(ARB_shading_language_420pack,   true,  false,     ARB_shading_language_420pack),
-   EXT(ARB_shading_language_packing,   true,  false,     ARB_shading_language_packing),
-   EXT(ARB_texture_cube_map_array,     true,  false,     ARB_texture_cube_map_array),
-   EXT(ARB_texture_gather,             true,  false,     ARB_texture_gather),
-   EXT(ARB_texture_multisample,        true,  false,     ARB_texture_multisample),
-   EXT(ARB_texture_query_levels,       true,  false,     ARB_texture_query_levels),
-   EXT(ARB_texture_query_lod,          true,  false,     ARB_texture_query_lod),
-   EXT(ARB_texture_rectangle,          true,  false,     dummy_true),
-   EXT(ARB_uniform_buffer_object,      true,  false,     ARB_uniform_buffer_object),
-   EXT(ARB_vertex_attrib_64bit,        true,  false,     ARB_vertex_attrib_64bit),
-   EXT(ARB_viewport_array,             true,  false,     ARB_viewport_array),
+   EXT(ARB_arrays_of_arrays,             true,  false,     ARB_arrays_of_arrays),
+   EXT(ARB_compute_shader,               true,  false,     ARB_compute_shader),
+   EXT(ARB_conservative_depth,           true,  false,     ARB_conservative_depth),
+   EXT(ARB_derivative_control,           true,  false,     ARB_derivative_control),
+   EXT(ARB_draw_buffers,                 true,  false,     dummy_true),
+   EXT(ARB_draw_instanced,               true,  false,     ARB_draw_instanced),
+   EXT(ARB_explicit_attrib_location,     true,  false,     ARB_explicit_attrib_location),
+   EXT(ARB_explicit_uniform_location,    true,  false,     ARB_explicit_uniform_location),
+   EXT(ARB_fragment_coord_conventions,   true,  false,     ARB_fragment_coord_conventions),
+   EXT(ARB_fragment_layer_viewport,      true,  false,     ARB_fragment_layer_viewport),
+   EXT(ARB_gpu_shader5,                  true,  false,     ARB_gpu_shader5),
+   EXT(ARB_gpu_shader_fp64,              true,  false,     ARB_gpu_shader_fp64),
+   EXT(ARB_sample_shading,               true,  false,     ARB_sample_shading),
+   EXT(ARB_separate_shader_objects,      true,  false,     dummy_true),
+   EXT(ARB_shader_atomic_counters,       true,  false,     ARB_shader_atomic_counters),
+   EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
+   EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
+   EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
+   EXT(ARB_shader_stencil_export,        true,  false,     ARB_shader_stencil_export),
+   EXT(ARB_shader_storage_buffer_object, true,  false,     ARB_shader_storage_buffer_object),
+   EXT(ARB_shader_texture_lod,           true,  false,     ARB_shader_texture_lod),
+   EXT(ARB_shading_language_420pack,     true,  false,     ARB_shading_language_420pack),
+   EXT(ARB_shading_language_packing,     true,  false,     ARB_shading_language_packing),
+   EXT(ARB_texture_cube_map_array,       true,  false,     ARB_texture_cube_map_array),
+   EXT(ARB_texture_gather,               true,  false,     ARB_texture_gather),
+   EXT(ARB_texture_multisample,          true,  false,     ARB_texture_multisample),
+   EXT(ARB_texture_query_levels,         true,  false,     ARB_texture_query_levels),
+   EXT(ARB_texture_query_lod,            true,  false,     ARB_texture_query_lod),
+   EXT(ARB_texture_rectangle,            true,  false,     dummy_true),
+   EXT(ARB_uniform_buffer_object,        true,  false,     ARB_uniform_buffer_object),
+   EXT(ARB_vertex_attrib_64bit,          true,  false,     ARB_vertex_attrib_64bit),
+   EXT(ARB_viewport_array,               true,  false,     ARB_viewport_array),
 
    /* KHR extensions go here, sorted alphabetically.
     */
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 02ddbbd14be..4996b845044 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -215,6 +215,11 @@ struct _mesa_glsl_parse_state {
       return ARB_uniform_buffer_object_enable || is_version(140, 300);
    }
 
+   bool has_shader_storage_buffer_objects() const
+   {
+      return ARB_shader_storage_buffer_object_enable || is_version(430, 0);
+   }
+
    bool has_separate_shader_objects() const
    {
       return ARB_separate_shader_objects_enable || is_version(410, 310)
@@ -462,6 +467,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_precision_warn;
    bool ARB_shader_stencil_export_enable;
    bool ARB_shader_stencil_export_warn;
+   bool ARB_shader_storage_buffer_object_enable;
+   bool ARB_shader_storage_buffer_object_warn;
    bool ARB_shader_texture_lod_enable;
    bool ARB_shader_texture_lod_warn;
    bool ARB_shading_language_420pack_enable;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 4176a69ed7c..d753e5fea31 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -154,6 +154,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_shader_objects",                      o(dummy_true),                              GL,             2002 },
    { "GL_ARB_shader_precision",                    o(ARB_shader_precision),                    GL,             2010 },
    { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
+   { "GL_ARB_shader_storage_buffer_object",        o(ARB_shader_storage_buffer_object),        GL,             2012 },
    { "GL_ARB_shader_texture_lod",                  o(ARB_shader_texture_lod),                  GL,             2009 },
    { "GL_ARB_shading_language_100",                o(dummy_true),                              GLL,            2003 },
    { "GL_ARB_shading_language_packing",            o(ARB_shading_language_packing),            GL,             2011 },
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 7b55677de30..9ec342bea48 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3685,6 +3685,7 @@ struct gl_extensions
    GLboolean ARB_shader_image_load_store;
    GLboolean ARB_shader_precision;
    GLboolean ARB_shader_stencil_export;
+   GLboolean ARB_shader_storage_buffer_object;
    GLboolean ARB_shader_texture_lod;
    GLboolean ARB_shading_language_packing;
    GLboolean ARB_shading_language_420pack;

From 18feaa8f36b311c443fd56666507ec1768fb9582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@bitplanet.net>
Date: Wed, 13 May 2015 10:41:55 +0200
Subject: [PATCH 0298/1208] glsl: Add ir_var_shader_storage

This will be used to identify buffer variables inside shader storage
buffer objects, which are very similar to uniforms except for a few
differences, most important of which is that they are writable.

Since buffer variables are so similar to uniforms, we will almost always
want them to go through the same paths as uniforms.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/builtin_variables.cpp                   |  5 +++--
 src/glsl/glsl_symbol_table.cpp                   | 16 +++++++++++-----
 src/glsl/ir.cpp                                  |  3 +++
 src/glsl/ir.h                                    |  5 ++++-
 src/glsl/ir_function.cpp                         |  1 +
 src/glsl/ir_print_visitor.cpp                    |  3 ++-
 src/glsl/ir_reader.cpp                           |  2 ++
 src/glsl/loop_unroll.cpp                         |  1 +
 src/glsl/lower_named_interface_blocks.cpp        |  5 +++--
 src/glsl/lower_variable_index_to_cond_assign.cpp |  1 +
 src/glsl/opt_structure_splitting.cpp             |  5 +++--
 11 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index a765d35fde0..aba1750c09d 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -436,11 +436,12 @@ builtin_variable_generator::add_variable(const char *name,
       var->data.read_only = true;
       break;
    case ir_var_shader_out:
+   case ir_var_shader_storage:
       break;
    default:
       /* The only variables that are added using this function should be
-       * uniforms, shader inputs, and shader outputs, constants (which use
-       * ir_var_auto), and system values.
+       * uniforms, shader storage, shader inputs, and shader outputs, constants
+       * (which use ir_var_auto), and system values.
        */
       assert(0);
       break;
diff --git a/src/glsl/glsl_symbol_table.cpp b/src/glsl/glsl_symbol_table.cpp
index 2294dda42c8..536f0a3a8c2 100644
--- a/src/glsl/glsl_symbol_table.cpp
+++ b/src/glsl/glsl_symbol_table.cpp
@@ -36,6 +36,9 @@ public:
       case ir_var_uniform:
          dest = &ibu;
          break;
+      case ir_var_shader_storage:
+         dest = &iss;
+         break;
       case ir_var_shader_in:
          dest = &ibi;
          break;
@@ -60,6 +63,8 @@ public:
       switch (mode) {
       case ir_var_uniform:
          return ibu;
+      case ir_var_shader_storage:
+         return iss;
       case ir_var_shader_in:
          return ibi;
       case ir_var_shader_out:
@@ -71,24 +76,25 @@ public:
    }
 
    symbol_table_entry(ir_variable *v)               :
-      v(v), f(0), t(0), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(v), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(ir_function *f)               :
-      v(0), f(f), t(0), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(0), f(f), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(const glsl_type *t)           :
-      v(0), f(0), t(t), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(0), f(0), t(t), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(const glsl_type *t, enum ir_variable_mode mode) :
-      v(0), f(0), t(0), ibu(0), ibi(0), ibo(0), a(0)
+      v(0), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0)
    {
       assert(t->is_interface());
       add_interface(t, mode);
    }
    symbol_table_entry(const class ast_type_specifier *a):
-      v(0), f(0), t(0), ibu(0), ibi(0), ibo(0), a(a) {}
+      v(0), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(a) {}
 
    ir_variable *v;
    ir_function *f;
    const glsl_type *t;
    const glsl_type *ibu;
+   const glsl_type *iss;
    const glsl_type *ibi;
    const glsl_type *ibo;
    const class ast_type_specifier *a;
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index dbd064feecc..9a25bf413f4 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1975,6 +1975,9 @@ mode_string(const ir_variable *var)
    case ir_var_uniform:
       return "uniform";
 
+   case ir_var_shader_storage:
+      return "buffer";
+
    case ir_var_shader_in:
       return "shader input";
 
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index f9045553501..2b9533a6437 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -324,6 +324,7 @@ protected:
 enum ir_variable_mode {
    ir_var_auto = 0,     /**< Function local variables and globals. */
    ir_var_uniform,      /**< Variable declared as a uniform. */
+   ir_var_shader_storage,   /**< Variable declared as an ssbo. */
    ir_var_shader_in,
    ir_var_shader_out,
    ir_var_function_in,
@@ -445,7 +446,9 @@ public:
     */
    inline bool is_in_uniform_block() const
    {
-      return this->data.mode == ir_var_uniform && this->interface_type != NULL;
+      return (this->data.mode == ir_var_uniform ||
+              this->data.mode == ir_var_shader_storage) &&
+             this->interface_type != NULL;
    }
 
    /**
diff --git a/src/glsl/ir_function.cpp b/src/glsl/ir_function.cpp
index 13194439003..93034bedb5a 100644
--- a/src/glsl/ir_function.cpp
+++ b/src/glsl/ir_function.cpp
@@ -72,6 +72,7 @@ parameter_lists_match(_mesa_glsl_parse_state *state,
       switch ((enum ir_variable_mode)(param->data.mode)) {
       case ir_var_auto:
       case ir_var_uniform:
+      case ir_var_shader_storage:
       case ir_var_temporary:
 	 /* These are all error conditions.  It is invalid for a parameter to
 	  * a function to be declared as auto (not in, out, or inout) or
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 4cbcad4ec61..922f98b6b8c 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -168,7 +168,8 @@ void ir_print_visitor::visit(ir_variable *ir)
    const char *const cent = (ir->data.centroid) ? "centroid " : "";
    const char *const samp = (ir->data.sample) ? "sample " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
-   const char *const mode[] = { "", "uniform ", "shader_in ", "shader_out ",
+   const char *const mode[] = { "", "uniform ", "shader_storage",
+                                "shader_in ", "shader_out ",
                                 "in ", "out ", "inout ",
 			        "const_in ", "sys ", "temporary " };
    STATIC_ASSERT(ARRAY_SIZE(mode) == ir_var_mode_count);
diff --git a/src/glsl/ir_reader.cpp b/src/glsl/ir_reader.cpp
index 4eae4131c57..d3ece19b22c 100644
--- a/src/glsl/ir_reader.cpp
+++ b/src/glsl/ir_reader.cpp
@@ -421,6 +421,8 @@ ir_reader::read_declaration(s_expression *expr)
 	 var->data.invariant = 1;
       } else if (strcmp(qualifier->value(), "uniform") == 0) {
 	 var->data.mode = ir_var_uniform;
+      } else if (strcmp(qualifier->value(), "shader_storage") == 0) {
+	 var->data.mode = ir_var_shader_storage;
       } else if (strcmp(qualifier->value(), "auto") == 0) {
 	 var->data.mode = ir_var_auto;
       } else if (strcmp(qualifier->value(), "in") == 0) {
diff --git a/src/glsl/loop_unroll.cpp b/src/glsl/loop_unroll.cpp
index 7a00fb8fea8..b9ea3507782 100644
--- a/src/glsl/loop_unroll.cpp
+++ b/src/glsl/loop_unroll.cpp
@@ -145,6 +145,7 @@ public:
                   unsupported_variable_indexing = true;
                break;
             case ir_var_uniform:
+            case ir_var_shader_storage:
                if (options->EmitNoIndirectUniform)
                   unsupported_variable_indexing = true;
                break;
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 7304c51399a..28d7987d3c3 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -108,7 +108,8 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
        * but, this will require changes to the other uniform block
        * support code.
        */
-      if (var->data.mode == ir_var_uniform)
+      if (var->data.mode == ir_var_uniform ||
+          var->data.mode == ir_var_shader_storage)
          continue;
 
       const glsl_type * iface_t = var->type;
@@ -212,7 +213,7 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue)
     * but, this will require changes to the other uniform block
     * support code.
     */
-   if (var->data.mode == ir_var_uniform)
+   if (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage)
       return;
 
    if (var->get_interface_type() != NULL) {
diff --git a/src/glsl/lower_variable_index_to_cond_assign.cpp b/src/glsl/lower_variable_index_to_cond_assign.cpp
index d878cb07811..4a6a76c4eba 100644
--- a/src/glsl/lower_variable_index_to_cond_assign.cpp
+++ b/src/glsl/lower_variable_index_to_cond_assign.cpp
@@ -370,6 +370,7 @@ public:
       case ir_var_temporary:
 	 return this->lower_temps;
       case ir_var_uniform:
+      case ir_var_shader_storage:
 	 return this->lower_uniforms;
       case ir_var_function_in:
       case ir_var_const_in:
diff --git a/src/glsl/opt_structure_splitting.cpp b/src/glsl/opt_structure_splitting.cpp
index 5e82fe93aa7..abf4310feb3 100644
--- a/src/glsl/opt_structure_splitting.cpp
+++ b/src/glsl/opt_structure_splitting.cpp
@@ -103,8 +103,9 @@ ir_structure_reference_visitor::get_variable_entry(ir_variable *var)
 {
    assert(var);
 
-   if (!var->type->is_record() || var->data.mode == ir_var_uniform
-       || var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out)
+   if (!var->type->is_record() ||
+       var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage ||
+       var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out)
       return NULL;
 
    foreach_in_list(variable_entry, entry, &this->variable_list) {

From 1146696f75ea0f2b49e6379c2a62602dfeb51190 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 5 Jun 2015 09:11:53 +0200
Subject: [PATCH 0299/1208] mesa: rename is_in_uniform_block to
 is_in_buffer_block

Since this now checks if a variable is inside a uniform or a shader
storage block.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast_to_hir.cpp                        | 2 +-
 src/glsl/ir.h                                  | 5 +++--
 src/glsl/link_uniform_block_active_visitor.cpp | 6 +++---
 src/glsl/link_uniform_initializers.cpp         | 4 ++--
 src/glsl/link_uniforms.cpp                     | 6 +++---
 src/glsl/linker.cpp                            | 2 +-
 src/glsl/lower_ubo_reference.cpp               | 2 +-
 src/glsl/opt_dead_code.cpp                     | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 2 +-
 src/mesa/program/ir_to_mesa.cpp                | 2 +-
 10 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index de6a86de07e..00f35eb29df 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2007,7 +2007,7 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
                                 const glsl_type *type,
                                 ir_variable *var)
 {
-   if (var && !var->is_in_uniform_block()) {
+   if (var && !var->is_in_buffer_block()) {
       /* Layout qualifiers may only apply to interface blocks and fields in
        * them.
        */
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 2b9533a6437..1c7829b0326 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -442,9 +442,10 @@ public:
    glsl_interp_qualifier determine_interpolation_mode(bool flat_shade);
 
    /**
-    * Determine whether or not a variable is part of a uniform block.
+    * Determine whether or not a variable is part of a uniform or
+    * shader storage block.
     */
-   inline bool is_in_uniform_block() const
+   inline bool is_in_buffer_block() const
    {
       return (this->data.mode == ir_var_uniform ||
               this->data.mode == ir_var_shader_storage) &&
diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index 292cde343f9..ddfd2b23748 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -73,7 +73,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 ir_visitor_status
 link_uniform_block_active_visitor::visit(ir_variable *var)
 {
-   if (!var->is_in_uniform_block())
+   if (!var->is_in_buffer_block())
       return visit_continue;
 
    const glsl_type *const block_type = var->is_interface_instance()
@@ -124,7 +124,7 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
     * function.
     */
    if (var == NULL
-       || !var->is_in_uniform_block()
+       || !var->is_in_buffer_block()
        || !var->is_interface_instance())
       return visit_continue;
 
@@ -194,7 +194,7 @@ link_uniform_block_active_visitor::visit(ir_dereference_variable *ir)
 {
    ir_variable *var = ir->var;
 
-   if (!var->is_in_uniform_block())
+   if (!var->is_in_buffer_block())
       return visit_continue;
 
    assert(!var->is_interface_instance() || !var->type->is_array());
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 204acfa22b2..d1f904e9972 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -267,7 +267,7 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
 
             if (type->without_array()->is_sampler()) {
                linker::set_sampler_binding(prog, var->name, var->data.binding);
-            } else if (var->is_in_uniform_block()) {
+            } else if (var->is_in_buffer_block()) {
                const glsl_type *const iface_type = var->get_interface_type();
 
                /* If the variable is an array and it is an interface instance,
@@ -280,7 +280,7 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
                 *         float f[4];
                 *     };
                 *
-                * In this case "f" would pass is_in_uniform_block (above) and
+                * In this case "f" would pass is_in_buffer_block (above) and
                 * type->is_array(), but it will fail is_interface_instance().
                 */
                if (var->is_interface_instance() && var->type->is_array()) {
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 11ae06f9bfb..5fdf25e0a66 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -298,7 +298,7 @@ public:
 
    void process(ir_variable *var)
    {
-      this->is_ubo_var = var->is_in_uniform_block();
+      this->is_ubo_var = var->is_in_buffer_block();
       if (var->is_interface_instance())
          program_resource_visitor::process(var->get_interface_type(),
                                            var->get_interface_type()->name);
@@ -431,7 +431,7 @@ public:
       field_counter = 0;
 
       ubo_block_index = -1;
-      if (var->is_in_uniform_block()) {
+      if (var->is_in_buffer_block()) {
          if (var->is_interface_instance() && var->type->is_array()) {
             unsigned l = strlen(var->get_interface_type()->name);
 
@@ -763,7 +763,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
    foreach_in_list(ir_instruction, node, shader->ir) {
       ir_variable *const var = node->as_variable();
 
-      if ((var == NULL) || !var->is_in_uniform_block())
+      if ((var == NULL) || !var->is_in_buffer_block())
 	 continue;
 
       assert(var->data.mode == ir_var_uniform);
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index b7a783c0984..3005b70a4f6 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1873,7 +1873,7 @@ update_array_sizes(struct gl_shader_program *prog)
           * locations assigned based on the declaration ordering and
           * sizes, array compaction would mess that up.
 	  */
-	 if (var->is_in_uniform_block() || var->type->contains_atomic())
+	 if (var->is_in_buffer_block() || var->type->contains_atomic())
 	    continue;
 
 	 unsigned int size = var->data.max_array_access;
diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 4ea4ccb03f2..a61ff29be3e 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -228,7 +228,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
       return;
 
    ir_variable *var = deref->variable_referenced();
-   if (!var || !var->is_in_uniform_block())
+   if (!var || !var->is_in_buffer_block())
       return;
 
    mem_ctx = ralloc_parent(*rvalue);
diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp
index f45bf5dfdf8..7b4730a39bb 100644
--- a/src/glsl/opt_dead_code.cpp
+++ b/src/glsl/opt_dead_code.cpp
@@ -115,7 +115,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
              * If the variable is in a uniform block with one of those
              * layouts, do not eliminate it.
              */
-            if (entry->var->is_in_uniform_block()) {
+            if (entry->var->is_in_buffer_block()) {
                const glsl_type *const block_type =
                   entry->var->is_interface_instance()
                   ? entry->var->type : entry->var->get_interface_type();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index c9c26611d95..67f2b5c29ff 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1063,7 +1063,7 @@ vec4_visitor::visit(ir_variable *ir)
        * Some uniforms, such as samplers and atomic counters, have no actual
        * storage, so we should ignore them.
        */
-      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
+      if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
          return;
 
       /* Track how big the whole uniform variable is, in case we need to put a
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 0b2eb122364..2bd212efaa1 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2384,7 +2384,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_shader_program
       ir_variable *var = node->as_variable();
 
       if ((var == NULL) || (var->data.mode != ir_var_uniform)
-	  || var->is_in_uniform_block() || (strncmp(var->name, "gl_", 3) == 0))
+	  || var->is_in_buffer_block() || (strncmp(var->name, "gl_", 3) == 0))
 	 continue;
 
       add.process(var);

From 6b09598d63b8b6069b230fbe8283c75cf86f711a Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 18 May 2015 15:47:18 +0200
Subject: [PATCH 0300/1208] nir: add nir_var_shader_storage

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/nir/glsl_to_nir.cpp     | 4 ++++
 src/glsl/nir/nir.h               | 1 +
 src/glsl/nir/nir_lower_atomics.c | 3 ++-
 src/glsl/nir/nir_lower_io.c      | 9 ++++++---
 src/glsl/nir/nir_print.c         | 5 +++--
 src/glsl/nir/nir_validate.c      | 6 ++++--
 6 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 95531bbcd8f..66430f39995 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -280,6 +280,9 @@ nir_visitor::visit(ir_variable *ir)
       var->data.mode = nir_var_uniform;
       break;
 
+   case ir_var_shader_storage:
+      var->data.mode = nir_var_shader_storage;
+      break;
 
    case ir_var_system_value:
       var->data.mode = nir_var_system_value;
@@ -371,6 +374,7 @@ nir_visitor::visit(ir_variable *ir)
       break;
 
    case nir_var_uniform:
+   case nir_var_shader_storage:
       exec_list_push_tail(&shader->uniforms, &var->node);
       break;
 
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 9e2a2819199..e9a506c5971 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -87,6 +87,7 @@ typedef enum {
    nir_var_global,
    nir_var_local,
    nir_var_uniform,
+   nir_var_shader_storage,
    nir_var_system_value
 } nir_variable_mode;
 
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 0457de60d9a..ce3615a3aa1 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -55,7 +55,8 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
       return;
    }
 
-   if (instr->variables[0]->var->data.mode != nir_var_uniform)
+   if (instr->variables[0]->var->data.mode != nir_var_uniform &&
+       instr->variables[0]->var->data.mode != nir_var_shader_storage)
       return; /* atomics passed as function arguments can't be lowered */
 
    void *mem_ctx = ralloc_parent(instr);
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 6761d5bad33..a9dd77691b2 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -86,7 +86,8 @@ nir_assign_var_locations_scalar(struct exec_list *var_list, unsigned *size)
        * UBO's have their own address spaces, so don't count them towards the
        * number of global uniforms
        */
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       var->data.driver_location = location;
@@ -153,7 +154,8 @@ nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
    unsigned location = 0;
 
    foreach_list_typed(nir_variable, var, node, var_list) {
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       if (_mesa_set_search(indirect_set, var))
@@ -166,7 +168,8 @@ nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
    *direct_size = location;
 
    foreach_list_typed(nir_variable, var, node, var_list) {
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       if (!_mesa_set_search(indirect_set, var))
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index eb4045cec8e..f591c4b5f8d 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -214,7 +214,7 @@ print_var_decl(nir_variable *var, print_var_state *state, FILE *fp)
    const char *const samp = (var->data.sample) ? "sample " : "";
    const char *const inv = (var->data.invariant) ? "invariant " : "";
    const char *const mode[] = { "shader_in ", "shader_out ", "", "",
-                                "uniform ", "system " };
+                                "uniform ", "shader_storage", "system " };
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
 
    fprintf(fp, "%s%s%s%s%s ",
@@ -239,7 +239,8 @@ print_var_decl(nir_variable *var, print_var_state *state, FILE *fp)
 
    if (var->data.mode == nir_var_shader_in ||
        var->data.mode == nir_var_shader_out ||
-       var->data.mode == nir_var_uniform) {
+       var->data.mode == nir_var_uniform ||
+       var->data.mode == nir_var_shader_storage) {
       fprintf(fp, " (%u, %u)", var->data.location, var->data.driver_location);
    }
 
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index da92ed90472..dc799414d24 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -400,11 +400,13 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       break;
    case nir_intrinsic_store_var:
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform);
+             instr->variables[0]->var->data.mode != nir_var_uniform &&
+             instr->variables[0]->var->data.mode != nir_var_shader_storage);
       break;
    case nir_intrinsic_copy_var:
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform);
+             instr->variables[0]->var->data.mode != nir_var_uniform &&
+             instr->variables[0]->var->data.mode != nir_var_shader_storage);
       assert(instr->variables[1]->var->data.mode != nir_var_shader_out);
       break;
    default:

From 84fc5fece006f2bd95287496e32482ac08bfd399 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@bitplanet.net>
Date: Wed, 13 May 2015 10:53:46 +0200
Subject: [PATCH 0301/1208] glsl: Implement parser support for 'buffer'
 qualifier

This is used to identify shader storage buffer interface blocks where
buffer variables are declared.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast.h                  |  1 +
 src/glsl/ast_to_hir.cpp         | 14 ++++++++++----
 src/glsl/ast_type.cpp           |  3 ++-
 src/glsl/glsl_lexer.ll          |  1 +
 src/glsl/glsl_parser.yy         | 30 ++++++++++++++++++++++++++----
 src/glsl/glsl_parser_extras.cpp |  2 ++
 6 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index ef74e5137b2..49212298e16 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -435,6 +435,7 @@ struct ast_type_qualifier {
 	 unsigned centroid:1;
          unsigned sample:1;
 	 unsigned uniform:1;
+	 unsigned buffer:1;
 	 unsigned smooth:1;
 	 unsigned flat:1;
 	 unsigned noperspective:1;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 00f35eb29df..f9f1c083f72 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2501,6 +2501,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->data.mode = ir_var_shader_out;
    else if (qual->flags.q.uniform)
       var->data.mode = ir_var_uniform;
+   else if (qual->flags.q.buffer)
+      var->data.mode = ir_var_shader_storage;
 
    if (!is_parameter && is_varying_var(var, state->stage)) {
       /* User-defined ins/outs are not permitted in compute shaders. */
@@ -5265,8 +5267,9 @@ ast_type_specifier::hir(exec_list *instructions,
  * \c glsl_struct_field to describe the members.
  *
  * If we're processing an interface block, var_mode should be the type of the
- * interface block (ir_var_shader_in, ir_var_shader_out, or ir_var_uniform).
- * If we're processing a structure, var_mode should be ir_var_auto.
+ * interface block (ir_var_shader_in, ir_var_shader_out, ir_var_uniform or
+ * ir_var_shader_storage).  If we're processing a structure, var_mode should be
+ * ir_var_auto.
  *
  * \return
  * The number of fields processed.  A pointer to the array structure fields is
@@ -5396,10 +5399,10 @@ ast_process_structure_or_interface_block(exec_list *instructions,
          fields[i].stream = qual->flags.q.explicit_stream ? qual->stream : -1;
 
          if (qual->flags.q.row_major || qual->flags.q.column_major) {
-            if (!qual->flags.q.uniform) {
+            if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
                _mesa_glsl_error(&loc, state,
                                 "row_major and column_major can only be "
-                                "applied to uniform interface blocks");
+                                "applied to interface blocks");
             } else
                validate_matrix_layout_for_type(state, &loc, field_type, NULL);
          }
@@ -5596,6 +5599,9 @@ ast_interface_block::hir(exec_list *instructions,
    } else if (this->layout.flags.q.uniform) {
       var_mode = ir_var_uniform;
       iface_type_name = "uniform";
+   } else if (this->layout.flags.q.buffer) {
+      var_mode = ir_var_shader_storage;
+      iface_type_name = "buffer";
    } else {
       var_mode = ir_var_auto;
       iface_type_name = "UNKNOWN";
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 1bcf6a2e81f..fa4806af1c0 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -78,7 +78,8 @@ ast_type_qualifier::has_storage() const
           || this->flags.q.varying
           || this->flags.q.in
           || this->flags.q.out
-          || this->flags.q.uniform;
+          || this->flags.q.uniform
+          || this->flags.q.buffer;
 }
 
 bool
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 10db5b8b632..845deebcd6f 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -308,6 +308,7 @@ in		return IN_TOK;
 out		return OUT_TOK;
 inout		return INOUT_TOK;
 uniform		return UNIFORM;
+buffer		return BUFFER;
 varying		DEPRECATED_ES_KEYWORD(VARYING);
 centroid	KEYWORD(120, 300, 120, 300, CENTROID);
 invariant	KEYWORD(120, 100, 120, 100, INVARIANT);
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 3ce9e103f20..8564cb99374 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -134,7 +134,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 }
 
 %token ATTRIBUTE CONST_TOK BOOL_TOK FLOAT_TOK INT_TOK UINT_TOK DOUBLE_TOK
-%token BREAK CONTINUE DO ELSE FOR IF DISCARD RETURN SWITCH CASE DEFAULT
+%token BREAK BUFFER CONTINUE DO ELSE FOR IF DISCARD RETURN SWITCH CASE DEFAULT
 %token BVEC2 BVEC3 BVEC4 IVEC2 IVEC3 IVEC4 UVEC2 UVEC3 UVEC4 VEC2 VEC3 VEC4 DVEC2 DVEC3 DVEC4
 %token CENTROID IN_TOK OUT_TOK INOUT_TOK UNIFORM VARYING SAMPLE
 %token NOPERSPECTIVE FLAT SMOOTH
@@ -1805,6 +1805,11 @@ storage_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.uniform = 1;
    }
+   | BUFFER
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.buffer = 1;
+   }
    ;
 
 memory_qualifier:
@@ -2507,7 +2512,17 @@ basic_interface_block:
       block->block_name = $2;
       block->declarations.push_degenerate_list_at_head(& $4->link);
 
-      if ($1.flags.q.uniform) {
+      if ($1.flags.q.buffer) {
+         if (!state->has_shader_storage_buffer_objects()) {
+            _mesa_glsl_error(& @1, state,
+                             "#version 430 / GL_ARB_shader_storage_buffer_object "
+                             "required for defining shader storage blocks");
+         } else if (state->ARB_shader_storage_buffer_object_warn) {
+            _mesa_glsl_warning(& @1, state,
+                               "#version 430 / GL_ARB_shader_storage_buffer_object "
+                               "required for defining shader storage blocks");
+         }
+      } else if ($1.flags.q.uniform) {
          if (!state->has_uniform_buffer_objects()) {
             _mesa_glsl_error(& @1, state,
                              "#version 140 / GL_ARB_uniform_buffer_object "
@@ -2551,11 +2566,13 @@ basic_interface_block:
       uint64_t interface_type_mask;
       struct ast_type_qualifier temp_type_qualifier;
 
-      /* Get a bitmask containing only the in/out/uniform flags, allowing us
-       * to ignore other irrelevant flags like interpolation qualifiers.
+      /* Get a bitmask containing only the in/out/uniform/buffer
+       * flags, allowing us to ignore other irrelevant flags like
+       * interpolation qualifiers.
        */
       temp_type_qualifier.flags.i = 0;
       temp_type_qualifier.flags.q.uniform = true;
+      temp_type_qualifier.flags.q.buffer = true;
       temp_type_qualifier.flags.q.in = true;
       temp_type_qualifier.flags.q.out = true;
       interface_type_mask = temp_type_qualifier.flags.i;
@@ -2642,6 +2659,11 @@ interface_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.uniform = 1;
    }
+   | BUFFER
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.buffer = 1;
+   }
    ;
 
 instance_name_opt:
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 0d7e5215672..5412f0b7887 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -854,6 +854,8 @@ _mesa_ast_type_qualifier_print(const struct ast_type_qualifier *q)
       printf("sample ");
    if (q->flags.q.uniform)
       printf("uniform ");
+   if (q->flags.q.buffer)
+      printf("buffer ");
    if (q->flags.q.smooth)
       printf("smooth ");
    if (q->flags.q.flat)

From a78a589efc5440443439d474e45fa1ef8b79178c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@bitplanet.net>
Date: Wed, 13 May 2015 11:17:23 +0200
Subject: [PATCH 0302/1208] glsl: link buffer variables and shader storage
 buffer interface blocks

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/link_interface_blocks.cpp     | 15 ++++++++++++---
 src/glsl/link_uniform_initializers.cpp |  3 ++-
 src/glsl/link_uniforms.cpp             |  8 +++++---
 src/glsl/linker.cpp                    |  4 ++--
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/glsl/link_interface_blocks.cpp b/src/glsl/link_interface_blocks.cpp
index 07f5b4223a8..f9ddb13cd56 100644
--- a/src/glsl/link_interface_blocks.cpp
+++ b/src/glsl/link_interface_blocks.cpp
@@ -112,7 +112,8 @@ intrastage_match(interface_block_definition *a,
     * it's not clear from the spec whether they need to match, but
     * Mesa's implementation relies on them matching.
     */
-   if (a->instance_name != NULL && mode != ir_var_uniform &&
+   if (a->instance_name != NULL &&
+       mode != ir_var_uniform && mode != ir_var_shader_storage &&
        strcmp(a->instance_name, b->instance_name) != 0) {
       return false;
    }
@@ -253,6 +254,7 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
    interface_block_definitions in_interfaces;
    interface_block_definitions out_interfaces;
    interface_block_definitions uniform_interfaces;
+   interface_block_definitions buffer_interfaces;
 
    for (unsigned int i = 0; i < num_shaders; i++) {
       if (shader_list[i] == NULL)
@@ -279,6 +281,9 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
          case ir_var_uniform:
             definitions = &uniform_interfaces;
             break;
+         case ir_var_shader_storage:
+            definitions = &buffer_interfaces;
+            break;
          default:
             /* Only in, out, and uniform interfaces are legal, so we should
              * never get here.
@@ -361,7 +366,9 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
       const gl_shader *stage = stages[i];
       foreach_in_list(ir_instruction, node, stage->ir) {
          ir_variable *var = node->as_variable();
-         if (!var || !var->get_interface_type() || var->data.mode != ir_var_uniform)
+         if (!var || !var->get_interface_type() ||
+             (var->data.mode != ir_var_uniform &&
+              var->data.mode != ir_var_shader_storage))
             continue;
 
          interface_block_definition *old_def =
@@ -374,7 +381,9 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
              * uniform matchin rules (for uniforms, it is as though all
              * shaders are in the same shader stage).
              */
-            if (!intrastage_match(old_def, &new_def, ir_var_uniform, prog)) {
+            if (!intrastage_match(old_def, &new_def,
+                                  (ir_variable_mode) var->data.mode,
+                                  prog)) {
                linker_error(prog, "definitions of interface block `%s' do not "
                             "match\n", var->get_interface_type()->name);
                return;
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index d1f904e9972..6322a2d52cc 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -256,7 +256,8 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, shader->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if (!var || var->data.mode != ir_var_uniform)
+	 if (!var || (var->data.mode != ir_var_uniform &&
+	     var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 if (!mem_ctx)
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 5fdf25e0a66..e786ddcaa90 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -766,7 +766,8 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
       if ((var == NULL) || !var->is_in_buffer_block())
 	 continue;
 
-      assert(var->data.mode == ir_var_uniform);
+      assert(var->data.mode == ir_var_uniform ||
+             var->data.mode == ir_var_shader_storage);
 
       if (var->is_interface_instance()) {
          var->data.location = 0;
@@ -943,7 +944,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, sh->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
+	 if ((var == NULL) || (var->data.mode != ir_var_uniform &&
+	                       var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 uniform_size.process(var);
@@ -987,7 +989,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
+	 if ((var == NULL) || (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 parcel.set_and_process(prog, var);
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 3005b70a4f6..28ad2f38e0e 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -777,7 +777,7 @@ cross_validate_globals(struct gl_shader_program *prog,
 	 if (var == NULL)
 	    continue;
 
-	 if (uniforms_only && (var->data.mode != ir_var_uniform))
+	 if (uniforms_only && (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 /* Don't cross validate temporaries that are at global scope.  These
@@ -2574,7 +2574,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if ((var && var->data.mode == ir_var_uniform) &&
+         if (var && (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage) &&
              var->data.explicit_location) {
             if (!reserve_explicit_locations(prog, uniform_map, var)) {
                delete uniform_map;

From df89ed1591c9d1c55e79fe8effb976c21b172a7d Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 23 Mar 2015 11:19:12 +0100
Subject: [PATCH 0303/1208] glsl: Identify active uniform blocks that are
 buffer blocks as such.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/link_uniform_block_active_visitor.cpp | 1 +
 src/glsl/link_uniform_block_active_visitor.h   | 1 +
 src/glsl/link_uniform_blocks.cpp               | 4 ++++
 src/mesa/main/mtypes.h                         | 5 +++++
 4 files changed, 11 insertions(+)

diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index ddfd2b23748..510294783a0 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -44,6 +44,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 
       b->type = block_type;
       b->has_instance_name = var->is_interface_instance();
+      b->is_shader_storage = var->data.mode == ir_var_shader_storage;
 
       if (var->data.explicit_binding) {
          b->has_binding = true;
diff --git a/src/glsl/link_uniform_block_active_visitor.h b/src/glsl/link_uniform_block_active_visitor.h
index e5ea501553c..b663a884db4 100644
--- a/src/glsl/link_uniform_block_active_visitor.h
+++ b/src/glsl/link_uniform_block_active_visitor.h
@@ -38,6 +38,7 @@ struct link_uniform_block_active {
 
    bool has_instance_name;
    bool has_binding;
+   bool is_shader_storage;
 };
 
 class link_uniform_block_active_visitor : public ir_hierarchical_visitor {
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index 898544bea82..4df39e200d5 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -293,6 +293,8 @@ link_uniform_blocks(void *mem_ctx,
             blocks[i].NumUniforms =
                (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
 
+            blocks[i].IsShaderStorage = b->is_shader_storage;
+
             i++;
          }
       } else {
@@ -311,6 +313,8 @@ link_uniform_blocks(void *mem_ctx,
          blocks[i].NumUniforms =
             (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
 
+         blocks[i].IsShaderStorage = b->is_shader_storage;
+
          i++;
       }
    }
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 9ec342bea48..86508c36f6a 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2564,6 +2564,11 @@ struct gl_uniform_block
     */
    GLuint UniformBufferSize;
 
+   /**
+    * Is this actually an interface block for a shader storage buffer?
+    */
+   bool IsShaderStorage;
+
    /**
     * Layout specified in the shader
     *

From cd50906e0334d7ad0102e5733a152d55d672776b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 10:15:30 +0100
Subject: [PATCH 0304/1208] mesa: Add shader storage buffer support to struct
 gl_context

This includes the array of bindings, the current buffer bound to the
GL_SHADER_STORAGE_BUFFER target and a set of general limits and default
values for shader storage buffers.

v2:
- Use spec values for the new defined constants (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c |  5 +++++
 src/mesa/main/config.h    |  2 ++
 src/mesa/main/context.c   |  6 ++++++
 src/mesa/main/mtypes.h    | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 66dee680258..c5d4adada48 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -112,6 +112,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
          return &ctx->UniformBuffer;
       }
       break;
+   case GL_SHADER_STORAGE_BUFFER:
+      if (ctx->Extensions.ARB_shader_storage_buffer_object) {
+         return &ctx->ShaderStorageBuffer;
+      }
+      break;
    case GL_ATOMIC_COUNTER_BUFFER:
       if (ctx->Extensions.ARB_shader_atomic_counters) {
          return &ctx->AtomicBuffer;
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 9c3baf4c6aa..177f1766016 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -171,8 +171,10 @@
 #define MAX_PROGRAM_LOCAL_PARAMS       4096
 #define MAX_UNIFORMS                   4096
 #define MAX_UNIFORM_BUFFERS            15 /* + 1 default uniform buffer */
+#define MAX_SHADER_STORAGE_BUFFERS     7  /* + 1 default shader storage buffer */
 /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. */
 #define MAX_COMBINED_UNIFORM_BUFFERS   (MAX_UNIFORM_BUFFERS * 6)
+#define MAX_COMBINED_SHADER_STORAGE_BUFFERS   (MAX_SHADER_STORAGE_BUFFERS * 6)
 #define MAX_ATOMIC_COUNTERS            4096
 /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. */
 #define MAX_COMBINED_ATOMIC_BUFFERS    (MAX_UNIFORM_BUFFERS * 6)
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index faa1de739d9..5470c56bdea 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -597,6 +597,12 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api)
    consts->MaxUniformBlockSize = 16384;
    consts->UniformBufferOffsetAlignment = 1;
 
+   /** GL_ARB_shader_storage_buffer_object */
+   consts->MaxCombinedShaderStorageBlocks = 8;
+   consts->MaxShaderStorageBufferBindings = 8;
+   consts->MaxShaderStorageBlockSize = 16 * 1024 * 1024;
+   consts->ShaderStorageBufferOffsetAlignment = 256;
+
    /* GL_ARB_explicit_uniform_location, GL_MAX_UNIFORM_LOCATIONS */
    consts->MaxUserAssignableUniformLocations =
       4 * MESA_SHADER_STAGES * MAX_UNIFORMS;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 86508c36f6a..f1ab4eb0ab3 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3403,6 +3403,15 @@ struct gl_constants
    GLuint UniformBufferOffsetAlignment;
    /** @} */
 
+   /** @{
+    * GL_ARB_shader_storage_buffer_object
+    */
+   GLuint MaxCombinedShaderStorageBlocks;
+   GLuint MaxShaderStorageBufferBindings;
+   GLuint MaxShaderStorageBlockSize;
+   GLuint ShaderStorageBufferOffsetAlignment;
+   /** @} */
+
    /**
     * GL_ARB_explicit_uniform_location
     */
@@ -4049,6 +4058,20 @@ struct gl_uniform_buffer_binding
    GLboolean AutomaticSize;
 };
 
+struct gl_shader_storage_buffer_binding
+{
+   struct gl_buffer_object *BufferObject;
+   /** Start of shader storage block data in the buffer */
+   GLintptr Offset;
+   /** Size of data allowed to be referenced from the buffer (in bytes) */
+   GLsizeiptr Size;
+   /**
+    * glBindBufferBase() indicates that the Size should be ignored and only
+    * limited by the current size of the BufferObject.
+    */
+   GLboolean AutomaticSize;
+};
+
 /**
  * ARB_shader_image_load_store image unit.
  */
@@ -4295,6 +4318,12 @@ struct gl_context
     */
    struct gl_buffer_object *UniformBuffer;
 
+   /**
+    * Current GL_ARB_shader_storage_buffer_object binding referenced by
+    * GL_SHADER_STORAGE_BUFFER target for glBufferData, glMapBuffer, etc.
+    */
+   struct gl_buffer_object *ShaderStorageBuffer;
+
    /**
     * Array of uniform buffers for GL_ARB_uniform_buffer_object and GL 3.1.
     * This is set up using glBindBufferRange() or glBindBufferBase().  They are
@@ -4304,6 +4333,15 @@ struct gl_context
    struct gl_uniform_buffer_binding
       UniformBufferBindings[MAX_COMBINED_UNIFORM_BUFFERS];
 
+   /**
+    * Array of shader storage buffers for ARB_shader_storage_buffer_object
+    * and GL 4.3. This is set up using glBindBufferRange() or
+    * glBindBufferBase().  They are associated with shader storage blocks by
+    * glShaderStorageBlockBinding()'s state in the shader program.
+    */
+   struct gl_shader_storage_buffer_binding
+      ShaderStorageBufferBindings[MAX_COMBINED_SHADER_STORAGE_BUFFERS];
+
    /**
     * Object currently associated with the GL_ATOMIC_COUNTER_BUFFER
     * target.

From c717604dc4b5119fa9091241535c3efd1370438c Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Thu, 14 May 2015 12:37:07 +0200
Subject: [PATCH 0305/1208] mesa: add MaxShaderStorageBlocks to struct
 gl_program_constants

v2:
- Set MaxShaderStorageBlocks to 8.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/context.c | 2 ++
 src/mesa/main/mtypes.h  | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 5470c56bdea..f4dc4e3c4da 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -536,6 +536,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
 
    prog->MaxAtomicBuffers = 0;
    prog->MaxAtomicCounters = 0;
+
+   prog->MaxShaderStorageBlocks = 8;
 }
 
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index f1ab4eb0ab3..8a4ad766d05 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3306,6 +3306,9 @@ struct gl_program_constants
 
    /* GL_ARB_shader_image_load_store */
    GLuint MaxImageUniforms;
+
+   /* GL_ARB_shader_storage_buffer_object */
+   GLuint MaxShaderStorageBlocks;
 };
 
 

From fa0a86c057ac9bff9b208f93db75c5ce5bd7136f Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Wed, 18 Mar 2015 09:02:51 +0100
Subject: [PATCH 0306/1208] glsl: enable binding layout qualifier usage for
 shader storage buffer objects

See GLSL 4.30 spec, section 4.4.5 "Uniform and Shader Storage Block
Layout Qualifiers".

v2:
- Add whitespace in an error message. Delete period '.' at the end of that
error message (Jordan).

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast_to_hir.cpp | 29 ++++++++++++++++++++++++-----
 src/glsl/glsl_parser.yy |  3 ++-
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index f9f1c083f72..e887ac25fbb 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2044,9 +2044,10 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
                            ir_variable *var,
                            const ast_type_qualifier *qual)
 {
-   if (var->data.mode != ir_var_uniform) {
+   if (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage) {
       _mesa_glsl_error(loc, state,
-                       "the \"binding\" qualifier only applies to uniforms");
+                       "the \"binding\" qualifier only applies to uniforms and "
+                       "shader storage buffer objects");
       return false;
    }
 
@@ -2070,13 +2071,31 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
        *
        * The implementation-dependent maximum is GL_MAX_UNIFORM_BUFFER_BINDINGS.
        */
-      if (max_index >= ctx->Const.MaxUniformBufferBindings) {
+      if (var->data.mode == ir_var_uniform &&
+         max_index >= ctx->Const.MaxUniformBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) for %d UBOs exceeds "
                           "the maximum number of UBO binding points (%d)",
                           qual->binding, elements,
                           ctx->Const.MaxUniformBufferBindings);
          return false;
       }
+      /* SSBOs. From page 67 of the GLSL 4.30 specification:
+       * "If the binding point for any uniform or shader storage block instance
+       *  is less than zero, or greater than or equal to the
+       *  implementation-dependent maximum number of uniform buffer bindings, a
+       *  compile-time error will occur. When the binding identifier is used
+       *  with a uniform or shader storage block instanced as an array of size
+       *  N, all elements of the array from binding through binding + N – 1 must
+       *  be within this range."
+       */
+      if (var->data.mode == ir_var_shader_storage &&
+         max_index >= ctx->Const.MaxShaderStorageBufferBindings) {
+         _mesa_glsl_error(loc, state, "layout(binding = %d) for %d SSBOs exceeds "
+                          "the maximum number of SSBO binding points (%d)",
+                          qual->binding, elements,
+                          ctx->Const.MaxShaderStorageBufferBindings);
+         return false;
+      }
    } else if (var->type->is_sampler() ||
               (var->type->is_array() && var->type->fields.array->is_sampler())) {
       /* Samplers.  From page 63 of the GLSL 4.20 specification:
@@ -5955,8 +5974,8 @@ ast_interface_block::hir(exec_list *instructions,
          if (state->symbols->get_variable(var->name) != NULL)
             _mesa_glsl_error(&loc, state, "`%s' redeclared", var->name);
 
-         /* Propagate the "binding" keyword into this UBO's fields;
-          * the UBO declaration itself doesn't get an ir_variable unless it
+         /* Propagate the "binding" keyword into this UBO/SSBO's fields.
+          * The UBO declaration itself doesn't get an ir_variable unless it
           * has an instance name.  This is ugly.
           */
          var->data.explicit_binding = this->layout.flags.q.explicit_binding;
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 8564cb99374..37c44014ba7 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1425,7 +1425,8 @@ layout_qualifier_id:
       }
 
       if ((state->ARB_shading_language_420pack_enable ||
-           state->has_atomic_counters()) &&
+           state->has_atomic_counters() ||
+           state->ARB_shader_storage_buffer_object_enable) &&
           match_layout_qualifier("binding", $1, state) == 0) {
          $$.flags.q.explicit_binding = 1;
          $$.binding = $3;

From 20b2907db7b93656cbafe1d24302498e5817dbe2 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Wed, 18 Mar 2015 10:25:10 +0100
Subject: [PATCH 0307/1208] glsl: shader buffer variables cannot have
 initializers

Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:

    "Buffer variables cannot have initializers."

v2:
- Rewrite error message (Jordan)

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast_to_hir.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index e887ac25fbb..6299bf09a1a 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2995,6 +2995,15 @@ process_initializer(ir_variable *var, ast_declaration *decl,
                            "cannot initialize uniforms");
    }
 
+   /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:
+    *
+    *    "Buffer variables cannot have initializers."
+    */
+   if (var->data.mode == ir_var_shader_storage) {
+      _mesa_glsl_error(& initializer_loc, state,
+                       "SSBO variables cannot have initializers");
+   }
+
    /* From section 4.1.7 of the GLSL 4.40 spec:
     *
     *    "Opaque variables [...] are initialized only through the

From 9f651dbf7924938a8aa2c9c940ae3ed1366d6198 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Wed, 18 Mar 2015 10:52:53 +0100
Subject: [PATCH 0308/1208] glsl: buffer variables cannot be defined outside
 interface blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Section 4.3.7 "Buffer Variables", GLSL 4.30 spec:

"Buffer variables may only be declared inside interface blocks
(section 4.3.9 “Interface Blocks”), which are then referred to as
shader storage blocks. It is a compile-time error to declare buffer
variables at global scope (outside a block)."

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast_to_hir.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 6299bf09a1a..61020cf015f 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3378,6 +3378,18 @@ ast_declarator_list::hir(exec_list *instructions,
 
    decl_type = this->type->glsl_type(& type_name, state);
 
+   /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:
+    *    "Buffer variables may only be declared inside interface blocks
+    *    (section 4.3.9 “Interface Blocks”), which are then referred to as
+    *    shader storage blocks. It is a compile-time error to declare buffer
+    *    variables at global scope (outside a block)."
+    */
+   if (type->qualifier.flags.q.buffer && !decl_type->is_interface()) {
+      _mesa_glsl_error(&loc, state,
+                       "buffer variables cannot be declared outside "
+                       "interface blocks");
+   }
+
    /* An offset-qualified atomic counter declaration sets the default
     * offset for the next declaration within the same atomic counter
     * buffer.

From 2747d566f187cdab5d6bdc508e460a76e5cbd6c4 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Thu, 19 Mar 2015 10:22:00 +0100
Subject: [PATCH 0309/1208] glsl: fix error messages in invalid declarations of
 shader storage blocks

Due to GL_ARB_shader_storage_buffer_object extension, shader storage blocks
have the same limitations as uniform blocks.

This patch fixes the corresponding error messages.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/ast_to_hir.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 61020cf015f..ca30dbc499f 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -5382,7 +5382,7 @@ ast_process_structure_or_interface_block(exec_list *instructions,
          if (is_interface && field_type->contains_opaque()) {
             YYLTYPE loc = decl_list->get_location();
             _mesa_glsl_error(&loc, state,
-                             "uniform in non-default uniform block contains "
+                             "uniform/buffer in non-default interface block contains "
                              "opaque variable");
          }
 
@@ -5393,8 +5393,8 @@ ast_process_structure_or_interface_block(exec_list *instructions,
              * FINISHME: structures.
              */
             YYLTYPE loc = decl_list->get_location();
-            _mesa_glsl_error(&loc, state, "atomic counter in structure or "
-                             "uniform block");
+            _mesa_glsl_error(&loc, state, "atomic counter in structure, "
+                             "shader storage block or uniform block");
          }
 
          if (field_type->contains_image()) {
@@ -5404,7 +5404,8 @@ ast_process_structure_or_interface_block(exec_list *instructions,
              */
             YYLTYPE loc = decl_list->get_location();
             _mesa_glsl_error(&loc, state,
-                             "image in structure or uniform block");
+                             "image in structure, shader storage block or "
+                             "uniform block");
          }
 
          const struct ast_type_qualifier *const qual =
@@ -5413,9 +5414,9 @@ ast_process_structure_or_interface_block(exec_list *instructions,
              qual->flags.q.packed ||
              qual->flags.q.shared) {
             _mesa_glsl_error(&loc, state,
-                             "uniform block layout qualifiers std140, packed, and "
-                             "shared can only be applied to uniform blocks, not "
-                             "members");
+                             "uniform/shader storage block layout qualifiers "
+                             "std140, packed, and shared can only be applied "
+                             "to uniform/shader storage blocks, not members");
          }
 
          if (qual->flags.q.constant) {

From 98a1a2c7302526d649a727d63400407727d7aad9 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 11:50:51 +0100
Subject: [PATCH 0310/1208] mesa: Initialize and free shader storage buffers

v2:
- Fix indention, used tabs instead of whitespaces. (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index c5d4adada48..2d70f7b092f 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -836,6 +836,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer,
 				 ctx->Shared->NullBufferObj);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer,
+                                 ctx->Shared->NullBufferObj);
+
    _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer,
 				 ctx->Shared->NullBufferObj);
 
@@ -850,6 +853,14 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
       ctx->UniformBufferBindings[i].Size = -1;
    }
 
+   for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) {
+      _mesa_reference_buffer_object(ctx,
+                                    &ctx->ShaderStorageBufferBindings[i].BufferObject,
+                                    ctx->Shared->NullBufferObj);
+      ctx->ShaderStorageBufferBindings[i].Offset = -1;
+      ctx->ShaderStorageBufferBindings[i].Size = -1;
+   }
+
    for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) {
       _mesa_reference_buffer_object(ctx,
 				    &ctx->AtomicBufferBindings[i].BufferObject,
@@ -872,6 +883,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, NULL);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, NULL);
+
    _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, NULL);
 
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL);
@@ -882,6 +895,12 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 				    NULL);
    }
 
+   for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) {
+      _mesa_reference_buffer_object(ctx,
+                                    &ctx->ShaderStorageBufferBindings[i].BufferObject,
+                                    NULL);
+   }
+
    for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) {
       _mesa_reference_buffer_object(ctx,
 				    &ctx->AtomicBufferBindings[i].BufferObject,

From e72f5ef50211c3ce31abaab4ed1bf82df2884157 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 10:31:23 +0100
Subject: [PATCH 0311/1208] mesa: Implement _mesa_DeleteBuffers for target
 GL_SHADER_STORAGE_BUFFER

v2:
- Remove the extra spaces (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 2d70f7b092f..ee920a98b0b 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -1264,6 +1264,17 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
             _mesa_BindBuffer( GL_UNIFORM_BUFFER, 0 );
          }
 
+         /* unbind SSBO binding points */
+         for (j = 0; j < ctx->Const.MaxShaderStorageBufferBindings; j++) {
+            if (ctx->ShaderStorageBufferBindings[j].BufferObject == bufObj) {
+               _mesa_BindBufferBase(GL_SHADER_STORAGE_BUFFER, j, 0);
+            }
+         }
+
+         if (ctx->ShaderStorageBuffer == bufObj) {
+            _mesa_BindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+         }
+
          /* unbind Atomci Buffer binding points */
          for (j = 0; j < ctx->Const.MaxAtomicBufferBindings; j++) {
             if (ctx->AtomicBufferBindings[j].BufferObject == bufObj) {

From 0aa83f3e90a5ca547593631bc1557412e5305bdd Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 10:47:17 +0100
Subject: [PATCH 0312/1208] mesa: Implement _mesa_BindBuffersBase for target
 GL_SHADER_STORAGE_BUFFER

v2:
- Add space before const (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 142 ++++++++++++++++++++++++++++++++++++++
 src/mesa/main/mtypes.h    |   7 ++
 2 files changed, 149 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index ee920a98b0b..27638caf9fc 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3033,6 +3033,33 @@ set_ubo_binding(struct gl_context *ctx,
       bufObj->UsageHistory |= USAGE_UNIFORM_BUFFER;
 }
 
+/**
+ * Binds a buffer object to a shader storage buffer binding point.
+ *
+ * The caller is responsible for flushing vertices and updating
+ * NewDriverState.
+ */
+static void
+set_ssbo_binding(struct gl_context *ctx,
+                 struct gl_shader_storage_buffer_binding *binding,
+                 struct gl_buffer_object *bufObj,
+                 GLintptr offset,
+                 GLsizeiptr size,
+                 GLboolean autoSize)
+{
+   _mesa_reference_buffer_object(ctx, &binding->BufferObject, bufObj);
+
+   binding->Offset = offset;
+   binding->Size = size;
+   binding->AutomaticSize = autoSize;
+
+   /* If this is a real buffer object, mark it has having been used
+    * at some point as a SSBO.
+    */
+   if (size >= 0)
+      bufObj->UsageHistory |= USAGE_SHADER_STORAGE_BUFFER;
+}
+
 /**
  * Binds a buffer object to a uniform buffer binding point.
  *
@@ -3254,6 +3281,35 @@ error_check_bind_uniform_buffers(struct gl_context *ctx,
    return true;
 }
 
+static bool
+error_check_bind_shader_storage_buffers(struct gl_context *ctx,
+                                        GLuint first, GLsizei count,
+                                        const char *caller)
+{
+   if (!ctx->Extensions.ARB_shader_storage_buffer_object) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(target=GL_SHADER_STORAGE_BUFFER)", caller);
+      return false;
+   }
+
+   /* The ARB_multi_bind_spec says:
+    *
+    *     "An INVALID_OPERATION error is generated if <first> + <count> is
+    *      greater than the number of target-specific indexed binding points,
+    *      as described in section 6.7.1."
+    */
+   if (first + count > ctx->Const.MaxShaderStorageBufferBindings) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(first=%u + count=%d > the value of "
+                  "GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS=%u)",
+                  caller, first, count,
+                  ctx->Const.MaxShaderStorageBufferBindings);
+      return false;
+   }
+
+   return true;
+}
+
 /**
  * Unbind all uniform buffers in the range
  * <first> through <first>+<count>-1
@@ -3269,6 +3325,22 @@ unbind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
                       bufObj, -1, -1, GL_TRUE);
 }
 
+/**
+ * Unbind all shader storage buffers in the range
+ * <first> through <first>+<count>-1
+ */
+static void
+unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
+                              GLsizei count)
+{
+   struct gl_buffer_object *bufObj = ctx->Shared->NullBufferObj;
+   GLint i;
+
+   for (i = 0; i < count; i++)
+      set_ssbo_binding(ctx, &ctx->ShaderStorageBufferBindings[first + i],
+                       bufObj, -1, -1, GL_TRUE);
+}
+
 static void
 bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
                           const GLuint *buffers)
@@ -3335,6 +3407,73 @@ bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
    _mesa_end_bufferobj_lookups(ctx);
 }
 
+static void
+bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
+                                 GLsizei count, const GLuint *buffers)
+{
+   GLint i;
+
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
+                                                "glBindBuffersBase"))
+      return;
+
+   /* Assume that at least one binding will be changed */
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   if (!buffers) {
+      /* The ARB_multi_bind spec says:
+       *
+       *   "If <buffers> is NULL, all bindings from <first> through
+       *    <first>+<count>-1 are reset to their unbound (zero) state."
+       */
+      unbind_shader_storage_buffers(ctx, first, count);
+      return;
+   }
+
+   /* Note that the error semantics for multi-bind commands differ from
+    * those of other GL commands.
+    *
+    * The Issues section in the ARB_multi_bind spec says:
+    *
+    *    "(11) Typically, OpenGL specifies that if an error is generated by a
+    *          command, that command has no effect.  This is somewhat
+    *          unfortunate for multi-bind commands, because it would require a
+    *          first pass to scan the entire list of bound objects for errors
+    *          and then a second pass to actually perform the bindings.
+    *          Should we have different error semantics?
+    *
+    *       RESOLVED:  Yes.  In this specification, when the parameters for
+    *       one of the <count> binding points are invalid, that binding point
+    *       is not updated and an error will be generated.  However, other
+    *       binding points in the same command will be updated if their
+    *       parameters are valid and no other error occurs."
+    */
+
+   _mesa_begin_bufferobj_lookups(ctx);
+
+   for (i = 0; i < count; i++) {
+      struct gl_shader_storage_buffer_binding *binding =
+          &ctx->ShaderStorageBufferBindings[first + i];
+      struct gl_buffer_object *bufObj;
+
+      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
+         bufObj = binding->BufferObject;
+      else
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
+                                                    "glBindBuffersBase");
+
+      if (bufObj) {
+         if (bufObj == ctx->Shared->NullBufferObj)
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
+         else
+            set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
+      }
+   }
+
+   _mesa_end_bufferobj_lookups(ctx);
+}
+
 static void
 bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
                            const GLuint *buffers,
@@ -4043,6 +4182,9 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_base(ctx, first, count, buffers);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_shader_storage_buffers_base(ctx, first, count, buffers);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_base(ctx, first, count, buffers);
       return;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 8a4ad766d05..4b0a995aec9 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1459,6 +1459,7 @@ typedef enum {
    USAGE_UNIFORM_BUFFER = 0x1,
    USAGE_TEXTURE_BUFFER = 0x2,
    USAGE_ATOMIC_COUNTER_BUFFER = 0x4,
+   USAGE_SHADER_STORAGE_BUFFER = 0x8,
 } gl_buffer_usage;
 
 
@@ -4034,6 +4035,12 @@ struct gl_driver_flags
     */
    uint64_t NewUniformBuffer;
 
+   /**
+    * gl_context::ShaderStorageBufferBindings
+    * gl_shader_program::ShaderStorageBlocks
+    */
+   uint64_t NewShaderStorageBuffer;
+
    uint64_t NewTextureBuffer;
 
    /**

From 7b0d0a2bf2d147c6024ff1a4b1eaaad955e7d297 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 11:21:52 +0100
Subject: [PATCH 0313/1208] mesa: Implement _mesa_BindBuffersRange for target
 GL_SHADER_STORAGE_BUFFER

v2:
- Fix error message (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 110 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 27638caf9fc..0a9ffe41043 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3579,6 +3579,112 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
    _mesa_end_bufferobj_lookups(ctx);
 }
 
+static void
+bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
+                                  GLsizei count, const GLuint *buffers,
+                                  const GLintptr *offsets,
+                                  const GLsizeiptr *sizes)
+{
+   GLint i;
+
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
+                                                "glBindBuffersRange"))
+      return;
+
+   /* Assume that at least one binding will be changed */
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   if (!buffers) {
+      /* The ARB_multi_bind spec says:
+       *
+       *    "If <buffers> is NULL, all bindings from <first> through
+       *     <first>+<count>-1 are reset to their unbound (zero) state.
+       *     In this case, the offsets and sizes associated with the
+       *     binding points are set to default values, ignoring
+       *     <offsets> and <sizes>."
+       */
+      unbind_shader_storage_buffers(ctx, first, count);
+      return;
+   }
+
+   /* Note that the error semantics for multi-bind commands differ from
+    * those of other GL commands.
+    *
+    * The Issues section in the ARB_multi_bind spec says:
+    *
+    *    "(11) Typically, OpenGL specifies that if an error is generated by a
+    *          command, that command has no effect.  This is somewhat
+    *          unfortunate for multi-bind commands, because it would require a
+    *          first pass to scan the entire list of bound objects for errors
+    *          and then a second pass to actually perform the bindings.
+    *          Should we have different error semantics?
+    *
+    *       RESOLVED:  Yes.  In this specification, when the parameters for
+    *       one of the <count> binding points are invalid, that binding point
+    *       is not updated and an error will be generated.  However, other
+    *       binding points in the same command will be updated if their
+    *       parameters are valid and no other error occurs."
+    */
+
+   _mesa_begin_bufferobj_lookups(ctx);
+
+   for (i = 0; i < count; i++) {
+      struct gl_shader_storage_buffer_binding *binding =
+         &ctx->ShaderStorageBufferBindings[first + i];
+      struct gl_buffer_object *bufObj;
+
+      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+         continue;
+
+      /* The ARB_multi_bind spec says:
+       *
+       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+       *      pair of values in <offsets> and <sizes> does not respectively
+       *      satisfy the constraints described for those parameters for the
+       *      specified target, as described in section 6.7.1 (per binding)."
+       *
+       * Section 6.7.1 refers to table 6.5, which says:
+       *
+       *     "┌───────────────────────────────────────────────────────────────┐
+       *      │ Shader storage buffer array bindings (see sec. 7.8)           │
+       *      ├─────────────────────┬─────────────────────────────────────────┤
+       *      │  ...                │  ...                                    │
+       *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
+       *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
+       *      │  ...                │  ...                                    │
+       *      │  size restriction   │  none                                   │
+       *      └─────────────────────┴─────────────────────────────────────────┘"
+       */
+      if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glBindBuffersRange(offsets[%u]=%" PRId64
+                     " is misaligned; it must be a multiple of the value of "
+                     "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
+                     "target=GL_SHADER_STORAGE_BUFFER)",
+                     i, (int64_t) offsets[i],
+                     ctx->Const.ShaderStorageBufferOffsetAlignment);
+         continue;
+      }
+
+      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
+         bufObj = binding->BufferObject;
+      else
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
+                                                    "glBindBuffersRange");
+
+      if (bufObj) {
+         if (bufObj == ctx->Shared->NullBufferObj)
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+         else
+            set_ssbo_binding(ctx, binding, bufObj,
+                             offsets[i], sizes[i], GL_FALSE);
+      }
+   }
+
+   _mesa_end_bufferobj_lookups(ctx);
+}
+
 static bool
 error_check_bind_xfb_buffers(struct gl_context *ctx,
                              struct gl_transform_feedback_object *tfObj,
@@ -4158,6 +4264,10 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
+                                        sizes);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_range(ctx, first, count, buffers,
                                 offsets, sizes);

From 8a1d58bd6129d61ec4efb79cc6f2b61ac777b85b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 11:37:43 +0100
Subject: [PATCH 0314/1208] mesa: Implement _mesa_BindBufferBase for target
 GL_SHADER_STORAGE_BUFFER

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 56 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 0a9ffe41043..c3548b5908e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3091,6 +3091,37 @@ bind_uniform_buffer(struct gl_context *ctx,
    set_ubo_binding(ctx, binding, bufObj, offset, size, autoSize);
 }
 
+/**
+ * Binds a buffer object to a shader storage buffer binding point.
+ *
+ * Unlike set_ssbo_binding(), this function also flushes vertices
+ * and updates NewDriverState.  It also checks if the binding
+ * has actually changed before updating it.
+ */
+static void
+bind_shader_storage_buffer(struct gl_context *ctx,
+                           GLuint index,
+                           struct gl_buffer_object *bufObj,
+                           GLintptr offset,
+                           GLsizeiptr size,
+                           GLboolean autoSize)
+{
+   struct gl_shader_storage_buffer_binding *binding =
+      &ctx->ShaderStorageBufferBindings[index];
+
+   if (binding->BufferObject == bufObj &&
+       binding->Offset == offset &&
+       binding->Size == size &&
+       binding->AutomaticSize == autoSize) {
+      return;
+   }
+
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   set_ssbo_binding(ctx, binding, bufObj, offset, size, autoSize);
+}
+
 /**
  * Bind a region of a buffer object to a uniform block binding point.
  * \param index  the uniform buffer binding point index
@@ -3149,6 +3180,28 @@ bind_buffer_base_uniform_buffer(struct gl_context *ctx,
       bind_uniform_buffer(ctx, index, bufObj, 0, 0, GL_TRUE);
 }
 
+/**
+ * Bind a buffer object to a shader storage block binding point.
+ * As above, but offset = 0.
+ */
+static void
+bind_buffer_base_shader_storage_buffer(struct gl_context *ctx,
+                                       GLuint index,
+                                       struct gl_buffer_object *bufObj)
+{
+   if (index >= ctx->Const.MaxShaderStorageBufferBindings) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferBase(index=%d)", index);
+      return;
+   }
+
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
+
+   if (bufObj == ctx->Shared->NullBufferObj)
+      bind_shader_storage_buffer(ctx, index, bufObj, -1, -1, GL_TRUE);
+   else
+      bind_shader_storage_buffer(ctx, index, bufObj, 0, 0, GL_TRUE);
+}
+
 /**
  * Binds a buffer object to an atomic buffer binding point.
  *
@@ -4240,6 +4293,9 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer)
    case GL_UNIFORM_BUFFER:
       bind_buffer_base_uniform_buffer(ctx, index, bufObj);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_buffer_base_shader_storage_buffer(ctx, index, bufObj);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffer(ctx, index, bufObj, 0, 0,
                          "glBindBufferBase");

From 173ed05a6d9e851b2b7b2f9f2d8993e5da115c40 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 19 Mar 2015 11:42:33 +0100
Subject: [PATCH 0315/1208] mesa: Implement _mesa_BindBufferRange for target
 GL_SHADER_STORAGE_BUFFER

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index c3548b5908e..4e25a72cbdb 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3157,6 +3157,40 @@ bind_buffer_range_uniform_buffer(struct gl_context *ctx,
    bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
 }
 
+/**
+ * Bind a region of a buffer object to a shader storage block binding point.
+ * \param index  the shader storage buffer binding point index
+ * \param bufObj  the buffer object
+ * \param offset  offset to the start of buffer object region
+ * \param size  size of the buffer object region
+ */
+static void
+bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
+                                        GLuint index,
+                                        struct gl_buffer_object *bufObj,
+                                        GLintptr offset,
+                                        GLsizeiptr size)
+{
+   if (index >= ctx->Const.MaxShaderStorageBufferBindings) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(index=%d)", index);
+      return;
+   }
+
+   if (offset & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glBindBufferRange(offset misaligned %d/%d)", (int) offset,
+                  ctx->Const.ShaderStorageBufferOffsetAlignment);
+      return;
+   }
+
+   if (bufObj == ctx->Shared->NullBufferObj) {
+      offset = -1;
+      size = -1;
+   }
+
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
+   bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+}
 
 /**
  * Bind a buffer object to a uniform block binding point.
@@ -4227,6 +4261,9 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    case GL_UNIFORM_BUFFER:
       bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset, size);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffer(ctx, index, bufObj, offset, size,
                          "glBindBufferRange");

From 3ad92589f29466383c0218aa4a73bff52019c4be Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 6 Apr 2015 09:37:58 +0200
Subject: [PATCH 0316/1208] glsl: Don't do tree grafting on buffer variables

Otherwise we can lose writes into the buffers backing the variables.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/opt_tree_grafting.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp
index d47613c2190..7f2ee6cee34 100644
--- a/src/glsl/opt_tree_grafting.cpp
+++ b/src/glsl/opt_tree_grafting.cpp
@@ -359,10 +359,11 @@ tree_grafting_basic_block(ir_instruction *bb_first,
       if (!lhs_var)
 	 continue;
 
-      if (lhs_var->data.mode == ir_var_function_out ||
-	  lhs_var->data.mode == ir_var_function_inout ||
-          lhs_var->data.mode == ir_var_shader_out)
-	 continue;
+   if (lhs_var->data.mode == ir_var_function_out ||
+       lhs_var->data.mode == ir_var_function_inout ||
+       lhs_var->data.mode == ir_var_shader_out ||
+       lhs_var->data.mode == ir_var_shader_storage)
+      continue;
 
       ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
 

From 5360ff30c4de966422fde6a574e3959c81bf5037 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 6 Apr 2015 10:19:50 +0200
Subject: [PATCH 0317/1208] glsl: Do not kill dead assignments to buffer
 variables or SSBO declarations.

If we kill dead assignments we lose the buffer writes.

Also, we never kill UBO declarations even if they are never referenced
by the shader, they are always considered active. Although the spec
does not seem say this specifically for SSBOs, it is probably implied
since SSBOs are pretty much the same as UBOs, only that you can write
to them.

v2:
- Fix the comment (Jordan)

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/opt_dead_code.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp
index 7b4730a39bb..04e4d5673d2 100644
--- a/src/glsl/opt_dead_code.cpp
+++ b/src/glsl/opt_dead_code.cpp
@@ -77,11 +77,13 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 
       if (entry->assign) {
 	 /* Remove a single dead assignment to the variable we found.
-	  * Don't do so if it's a shader or function output, though.
+	  * Don't do so if it's a shader or function output or a shader
+	  * storage variable though.
 	  */
 	 if (entry->var->data.mode != ir_var_function_out &&
 	     entry->var->data.mode != ir_var_function_inout &&
-             entry->var->data.mode != ir_var_shader_out) {
+             entry->var->data.mode != ir_var_shader_out &&
+             entry->var->data.mode != ir_var_shader_storage) {
 	    entry->assign->remove();
 	    progress = true;
 
@@ -99,7 +101,8 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 	  * stage.  Also, once uniform locations have been assigned, the
 	  * declaration cannot be deleted.
 	  */
-         if (entry->var->data.mode == ir_var_uniform) {
+         if (entry->var->data.mode == ir_var_uniform ||
+             entry->var->data.mode == ir_var_shader_storage) {
             if (uniform_locations_assigned || entry->var->constant_value)
                continue;
 

From 0b1111d985714816fad20c99b4e6ea762df17b46 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 24 Apr 2015 11:14:17 +0200
Subject: [PATCH 0318/1208] glsl: Don't do constant propagation on buffer
 variables

Since the backing storage for these is shared we cannot ensure that
the value won't change by writes from other threads. Normally SSBO
accesses are not guaranteed to be syncronized with other threads,
except when memoryBarrier is used. So, we might be able to optimize
some SSBO accesses, but for now we always take the safe path and emit
the SSBO access.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/opt_constant_propagation.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/glsl/opt_constant_propagation.cpp b/src/glsl/opt_constant_propagation.cpp
index 90cc0c89b65..10be8e800ff 100644
--- a/src/glsl/opt_constant_propagation.cpp
+++ b/src/glsl/opt_constant_propagation.cpp
@@ -444,6 +444,14 @@ ir_constant_propagation_visitor::add_constant(ir_assignment *ir)
    if (!deref->var->type->is_vector() && !deref->var->type->is_scalar())
       return;
 
+   /* We can't do copy propagation on buffer variables, since the underlying
+    * memory storage is shared across multiple threads we can't be sure that
+    * the variable value isn't modified between this assignment and the next
+    * instruction where its value is read.
+    */
+   if (deref->var->data.mode == ir_var_shader_storage)
+      return;
+
    entry = new(this->mem_ctx) acp_entry(deref->var, ir->write_mask, constant);
    this->acp->push_tail(entry);
 }

From 5dfea83ee6bf85fb3962679d043eb06b33bfd4c1 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 24 Apr 2015 11:15:48 +0200
Subject: [PATCH 0319/1208] glsl: Don't do constant variable on buffer
 variables

Since the backing storage for these is shared we cannot ensure that
the value won't change by writes from other threads. Normally SSBO
accesses are not guaranteed to be syncronized with other threads,
except when memoryBarrier is used. So, we might be able to optimize
some SSBO accesses, but for now we always take the safe path and emit
the SSBO access.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/opt_constant_variable.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/glsl/opt_constant_variable.cpp b/src/glsl/opt_constant_variable.cpp
index 7222eb92a7d..7aaaeedf98d 100644
--- a/src/glsl/opt_constant_variable.cpp
+++ b/src/glsl/opt_constant_variable.cpp
@@ -115,6 +115,13 @@ ir_constant_variable_visitor::visit_enter(ir_assignment *ir)
    if (!var)
       return visit_continue;
 
+   /* Ignore buffer variables, since the underlying storage is shared
+    * and we can't be sure that this variable won't be written by another
+    * thread.
+    */
+   if (var->data.mode == ir_var_shader_storage)
+      return visit_continue;
+
    constval = ir->rhs->constant_expression_value();
    if (!constval)
       return visit_continue;

From 2a66ee6fc1fa1e64f2d9a22271187d4462d9e042 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 24 Apr 2015 11:17:15 +0200
Subject: [PATCH 0320/1208] glsl: Don't do copy propagation on buffer variables

Since the backing storage for these is shared we cannot ensure that
the value won't change by writes from other threads. Normally SSBO
accesses are not guaranteed to be syncronized with other threads,
except when memoryBarrier is used. So, we might be able to optimize
some SSBO accesses, but for now we always take the safe path and emit
the SSBO access.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/opt_copy_propagation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/opt_copy_propagation.cpp b/src/glsl/opt_copy_propagation.cpp
index 806027b280e..f20699563fd 100644
--- a/src/glsl/opt_copy_propagation.cpp
+++ b/src/glsl/opt_copy_propagation.cpp
@@ -330,7 +330,7 @@ ir_copy_propagation_visitor::add_copy(ir_assignment *ir)
 	  */
 	 ir->condition = new(ralloc_parent(ir)) ir_constant(false);
 	 this->progress = true;
-      } else {
+      } else if (lhs_var->data.mode != ir_var_shader_storage) {
 	 entry = new(this->acp) acp_entry(lhs_var, rhs_var);
 	 this->acp->push_tail(entry);
       }

From 1966ea57728a1c05300982ddd83de989e363613c Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Wed, 8 Jul 2015 17:03:06 +0200
Subject: [PATCH 0321/1208] glsl: Lower shader storage buffer object writes to
 GLSL IR instrinsics

Extend the existing lower_ubo_reference pass to also detect SSBO writes
and lower them to __intrinsic_store_ssbo intrinsics.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/lower_ubo_reference.cpp | 439 ++++++++++++++++++++++---------
 1 file changed, 310 insertions(+), 129 deletions(-)

diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index a61ff29be3e..ab161cca8b6 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -37,6 +37,7 @@
 #include "ir_builder.h"
 #include "ir_rvalue_visitor.h"
 #include "main/macros.h"
+#include "glsl_parser_extras.h"
 
 using namespace ir_builder;
 
@@ -139,12 +140,31 @@ public:
    }
 
    void handle_rvalue(ir_rvalue **rvalue);
-   void emit_ubo_loads(ir_dereference *deref, ir_variable *base_offset,
-                       unsigned int deref_offset, bool row_major,
-                       int matrix_columns);
+   ir_visitor_status visit_enter(ir_assignment *ir);
+
+   void setup_for_load_or_store(ir_variable *var,
+                                ir_dereference *deref,
+                                ir_rvalue **offset,
+                                unsigned *const_offset,
+                                bool *row_major,
+                                int *matrix_columns);
    ir_expression *ubo_load(const struct glsl_type *type,
 			   ir_rvalue *offset);
 
+
+   void check_for_ssbo_store(ir_assignment *ir);
+   void write_to_memory(ir_dereference *deref,
+                        ir_variable *var,
+                        ir_variable *write_var,
+                        unsigned write_mask);
+   ir_call *ssbo_store(ir_rvalue *deref, ir_rvalue *offset,
+                       unsigned write_mask);
+
+   void emit_access(bool is_write, ir_dereference *deref,
+                    ir_variable *base_offset, unsigned int deref_offset,
+                    bool row_major, int matrix_columns,
+                    unsigned write_mask);
+
    void *mem_ctx;
    struct gl_shader *shader;
    struct gl_uniform_buffer_variable *ubo_var;
@@ -218,26 +238,20 @@ interface_field_name(void *mem_ctx, char *base_name, ir_dereference *d,
 }
 
 void
-lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
+lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
+                                                     ir_dereference *deref,
+                                                     ir_rvalue **offset,
+                                                     unsigned *const_offset,
+                                                     bool *row_major,
+                                                     int *matrix_columns)
 {
-   if (!*rvalue)
-      return;
-
-   ir_dereference *deref = (*rvalue)->as_dereference();
-   if (!deref)
-      return;
-
-   ir_variable *var = deref->variable_referenced();
-   if (!var || !var->is_in_buffer_block())
-      return;
-
-   mem_ctx = ralloc_parent(*rvalue);
-
+   /* Determine the name of the interface block */
    ir_rvalue *nonconst_block_index;
    const char *const field_name =
       interface_field_name(mem_ctx, (char *) var->get_interface_type()->name,
                            deref, &nonconst_block_index);
 
+   /* Locate the ubo block by interface name */
    this->uniform_block = NULL;
    for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
       if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) {
@@ -263,10 +277,10 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 
    assert(this->uniform_block);
 
-   ir_rvalue *offset = new(mem_ctx) ir_constant(0u);
-   unsigned const_offset = 0;
-   bool row_major = is_dereferenced_thing_row_major(deref);
-   int matrix_columns = 1;
+   *offset = new(mem_ctx) ir_constant(0u);
+   *const_offset = 0;
+   *row_major = is_dereferenced_thing_row_major(deref);
+   *matrix_columns = 1;
 
    /* Calculate the offset to the start of the region of the UBO
     * dereferenced by *rvalue.  This may be a variable offset if an
@@ -275,76 +289,76 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
    while (deref) {
       switch (deref->ir_type) {
       case ir_type_dereference_variable: {
-	 const_offset += ubo_var->Offset;
-	 deref = NULL;
-	 break;
+         *const_offset += ubo_var->Offset;
+         deref = NULL;
+         break;
       }
 
       case ir_type_dereference_array: {
-	 ir_dereference_array *deref_array = (ir_dereference_array *)deref;
-	 unsigned array_stride;
-	 if (deref_array->array->type->is_matrix() && row_major) {
-	    /* When loading a vector out of a row major matrix, the
-	     * step between the columns (vectors) is the size of a
-	     * float, while the step between the rows (elements of a
-	     * vector) is handled below in emit_ubo_loads.
-	     */
-	    array_stride = 4;
+         ir_dereference_array *deref_array = (ir_dereference_array *) deref;
+         unsigned array_stride;
+         if (deref_array->array->type->is_matrix() && *row_major) {
+            /* When loading a vector out of a row major matrix, the
+             * step between the columns (vectors) is the size of a
+             * float, while the step between the rows (elements of a
+             * vector) is handled below in emit_ubo_loads.
+             */
+            array_stride = 4;
             if (deref_array->array->type->is_double())
                array_stride *= 2;
-            matrix_columns = deref_array->array->type->matrix_columns;
+            *matrix_columns = deref_array->array->type->matrix_columns;
          } else if (deref_array->type->is_interface()) {
             /* We're processing an array dereference of an interface instance
-	     * array.  The thing being dereferenced *must* be a variable
-	     * dereference because intefaces cannot be embedded an other
-	     * types.  In terms of calculating the offsets for the lowering
-	     * pass, we don't care about the array index.  All elements of an
-	     * interface instance array will have the same offsets relative to
-	     * the base of the block that backs them.
+             * array. The thing being dereferenced *must* be a variable
+             * dereference because interfaces cannot be embedded in other
+             * types. In terms of calculating the offsets for the lowering
+             * pass, we don't care about the array index. All elements of an
+             * interface instance array will have the same offsets relative to
+             * the base of the block that backs them.
              */
             assert(deref_array->array->as_dereference_variable());
             deref = deref_array->array->as_dereference();
             break;
-	 } else {
+         } else {
             /* Whether or not the field is row-major (because it might be a
-             * bvec2 or something) does not affect the array itself.  We need
+             * bvec2 or something) does not affect the array itself. We need
              * to know whether an array element in its entirety is row-major.
              */
             const bool array_row_major =
                is_dereferenced_thing_row_major(deref_array);
 
-	    array_stride = deref_array->type->std140_size(array_row_major);
-	    array_stride = glsl_align(array_stride, 16);
-	 }
+            array_stride = deref_array->type->std140_size(array_row_major);
+            array_stride = glsl_align(array_stride, 16);
+         }
 
          ir_rvalue *array_index = deref_array->array_index;
          if (array_index->type->base_type == GLSL_TYPE_INT)
             array_index = i2u(array_index);
 
-	 ir_constant *const_index =
+         ir_constant *const_index =
             array_index->constant_expression_value(NULL);
-	 if (const_index) {
-	    const_offset += array_stride * const_index->value.u[0];
-	 } else {
-	    offset = add(offset,
-			 mul(array_index,
-			     new(mem_ctx) ir_constant(array_stride)));
-	 }
-	 deref = deref_array->array->as_dereference();
-	 break;
+         if (const_index) {
+            *const_offset += array_stride * const_index->value.u[0];
+         } else {
+            *offset = add(*offset,
+                          mul(array_index,
+                              new(mem_ctx) ir_constant(array_stride)));
+         }
+         deref = deref_array->array->as_dereference();
+         break;
       }
 
       case ir_type_dereference_record: {
-	 ir_dereference_record *deref_record = (ir_dereference_record *)deref;
-	 const glsl_type *struct_type = deref_record->record->type;
-	 unsigned intra_struct_offset = 0;
+         ir_dereference_record *deref_record = (ir_dereference_record *) deref;
+         const glsl_type *struct_type = deref_record->record->type;
+         unsigned intra_struct_offset = 0;
 
-	 for (unsigned int i = 0; i < struct_type->length; i++) {
-	    const glsl_type *type = struct_type->fields.structure[i].type;
+         for (unsigned int i = 0; i < struct_type->length; i++) {
+            const glsl_type *type = struct_type->fields.structure[i].type;
 
-            ir_dereference_record *field_deref =
-               new(mem_ctx) ir_dereference_record(deref_record->record,
-                                                  struct_type->fields.structure[i].name);
+            ir_dereference_record *field_deref = new(mem_ctx)
+               ir_dereference_record(deref_record->record,
+                                     struct_type->fields.structure[i].name);
             const bool field_row_major =
                is_dereferenced_thing_row_major(field_deref);
 
@@ -352,11 +366,12 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 
             unsigned field_align = type->std140_base_alignment(field_row_major);
 
-	    intra_struct_offset = glsl_align(intra_struct_offset, field_align);
+            intra_struct_offset = glsl_align(intra_struct_offset, field_align);
+
+            if (strcmp(struct_type->fields.structure[i].name,
+                       deref_record->field) == 0)
+               break;
 
-	    if (strcmp(struct_type->fields.structure[i].name,
-		       deref_record->field) == 0)
-	       break;
             intra_struct_offset += type->std140_size(field_row_major);
 
             /* If the field just examined was itself a structure, apply rule
@@ -371,19 +386,49 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
                                                 field_align);
 
             }
-	 }
+         }
 
-	 const_offset += intra_struct_offset;
-
-	 deref = deref_record->record->as_dereference();
-	 break;
+         *const_offset += intra_struct_offset;
+         deref = deref_record->record->as_dereference();
+         break;
       }
+
       default:
-	 assert(!"not reached");
-	 deref = NULL;
-	 break;
+         assert(!"not reached");
+         deref = NULL;
+         break;
       }
    }
+}
+
+void
+lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
+{
+   if (!*rvalue)
+      return;
+
+   ir_dereference *deref = (*rvalue)->as_dereference();
+   if (!deref)
+      return;
+
+   ir_variable *var = deref->variable_referenced();
+   if (!var || !var->is_in_buffer_block())
+      return;
+
+   mem_ctx = ralloc_parent(*rvalue);
+
+   ir_rvalue *offset = NULL;
+   unsigned const_offset;
+   bool row_major;
+   int matrix_columns;
+
+   /* Compute the offset to the start if the dereference as well as other
+    * information we need to configure the write
+    */
+   setup_for_load_or_store(var, deref,
+                           &offset, &const_offset,
+                           &row_major, &matrix_columns);
+   assert(offset);
 
    /* Now that we've calculated the offset to the start of the
     * dereference, walk over the type and emit loads into a temporary.
@@ -401,7 +446,8 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
    base_ir->insert_before(assign(load_offset, offset));
 
    deref = new(mem_ctx) ir_dereference_variable(load_var);
-   emit_ubo_loads(deref, load_offset, const_offset, row_major, matrix_columns);
+   emit_access(false, deref, load_offset, const_offset,
+               row_major, matrix_columns, 0);
    *rvalue = deref;
 
    progress = true;
@@ -420,74 +466,127 @@ lower_ubo_reference_visitor::ubo_load(const glsl_type *type,
 
 }
 
+static bool
+shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_storage_buffer_object_enable;
+}
+
+ir_call *
+lower_ubo_reference_visitor::ssbo_store(ir_rvalue *deref,
+                                        ir_rvalue *offset,
+                                        unsigned write_mask)
+{
+   exec_list sig_params;
+
+   ir_variable *block_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "block_ref" , ir_var_function_in);
+   sig_params.push_tail(block_ref);
+
+   ir_variable *offset_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "offset" , ir_var_function_in);
+   sig_params.push_tail(offset_ref);
+
+   ir_variable *val_ref = new(mem_ctx)
+      ir_variable(deref->type, "value" , ir_var_function_in);
+   sig_params.push_tail(val_ref);
+
+   ir_variable *writemask_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "write_mask" , ir_var_function_in);
+   sig_params.push_tail(writemask_ref);
+
+   ir_function_signature *sig = new(mem_ctx)
+      ir_function_signature(glsl_type::void_type, shader_storage_buffer_object);
+   assert(sig);
+   sig->replace_parameters(&sig_params);
+   sig->is_intrinsic = true;
+
+   ir_function *f = new(mem_ctx) ir_function("__intrinsic_store_ssbo");
+   f->add_signature(sig);
+
+   exec_list call_params;
+   call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
+   call_params.push_tail(offset->clone(mem_ctx, NULL));
+   call_params.push_tail(deref->clone(mem_ctx, NULL));
+   call_params.push_tail(new(mem_ctx) ir_constant(write_mask));
+   return new(mem_ctx) ir_call(sig, NULL, &call_params);
+}
+
+static inline int
+writemask_for_size(unsigned n)
+{
+   return ((1 << n) - 1);
+}
+
 /**
- * Takes LHS and emits a series of assignments into its components
- * from the UBO variable at variable_offset + deref_offset.
- *
- * Recursively calls itself to break the deref down to the point that
- * the ir_binop_ubo_load expressions generated are contiguous scalars
- * or vectors.
+ * Takes a deref and recursively calls itself to break the deref down to the
+ * point that the reads or writes generated are contiguous scalars or vectors.
  */
 void
-lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
-					    ir_variable *base_offset,
-                                            unsigned int deref_offset,
-                                            bool row_major,
-                                            int matrix_columns)
+lower_ubo_reference_visitor::emit_access(bool is_write,
+                                         ir_dereference *deref,
+                                         ir_variable *base_offset,
+                                         unsigned int deref_offset,
+                                         bool row_major,
+                                         int matrix_columns,
+                                         unsigned write_mask)
 {
    if (deref->type->is_record()) {
       unsigned int field_offset = 0;
 
       for (unsigned i = 0; i < deref->type->length; i++) {
-	 const struct glsl_struct_field *field =
-	    &deref->type->fields.structure[i];
-	 ir_dereference *field_deref =
-	    new(mem_ctx) ir_dereference_record(deref->clone(mem_ctx, NULL),
-					       field->name);
+         const struct glsl_struct_field *field =
+            &deref->type->fields.structure[i];
+         ir_dereference *field_deref =
+            new(mem_ctx) ir_dereference_record(deref->clone(mem_ctx, NULL),
+                                               field->name);
 
-	 field_offset =
-	    glsl_align(field_offset,
+         field_offset =
+            glsl_align(field_offset,
                        field->type->std140_base_alignment(row_major));
 
-	 emit_ubo_loads(field_deref, base_offset, deref_offset + field_offset,
-                        row_major, 1);
+         emit_access(is_write, field_deref, base_offset,
+                     deref_offset + field_offset,
+                     row_major, 1,
+                     writemask_for_size(field_deref->type->vector_elements));
 
-	 field_offset += field->type->std140_size(row_major);
+         field_offset += field->type->std140_size(row_major);
       }
       return;
    }
 
    if (deref->type->is_array()) {
       unsigned array_stride =
-	 glsl_align(deref->type->fields.array->std140_size(row_major),
-		    16);
+         glsl_align(deref->type->fields.array->std140_size(row_major), 16);
 
       for (unsigned i = 0; i < deref->type->length; i++) {
-	 ir_constant *element = new(mem_ctx) ir_constant(i);
-	 ir_dereference *element_deref =
-	    new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
-					      element);
-	 emit_ubo_loads(element_deref, base_offset,
-			deref_offset + i * array_stride,
-                        row_major, 1);
+         ir_constant *element = new(mem_ctx) ir_constant(i);
+         ir_dereference *element_deref =
+            new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
+                                              element);
+         emit_access(is_write, element_deref, base_offset,
+                     deref_offset + i * array_stride,
+                     row_major, 1,
+                     writemask_for_size(element_deref->type->vector_elements));
       }
       return;
    }
 
    if (deref->type->is_matrix()) {
       for (unsigned i = 0; i < deref->type->matrix_columns; i++) {
-	 ir_constant *col = new(mem_ctx) ir_constant(i);
-	 ir_dereference *col_deref =
-	    new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
-					      col);
+         ir_constant *col = new(mem_ctx) ir_constant(i);
+         ir_dereference *col_deref =
+            new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL), col);
 
          if (row_major) {
             /* For a row-major matrix, the next column starts at the next
              * element.
              */
             int size_mul = deref->type->is_double() ? 8 : 4;
-            emit_ubo_loads(col_deref, base_offset, deref_offset + i * size_mul,
-                           row_major, deref->type->matrix_columns);
+            emit_access(is_write, col_deref, base_offset,
+                        deref_offset + i * size_mul,
+                        row_major, deref->type->matrix_columns,
+                        writemask_for_size(col_deref->type->vector_elements));
          } else {
             /* std140 always rounds the stride of arrays (and matrices) to a
              * vec4, so matrices are always 16 between columns/rows. With
@@ -495,21 +594,25 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
              */
             int size_mul = (deref->type->is_double() &&
                             deref->type->vector_elements > 2) ? 32 : 16;
-            emit_ubo_loads(col_deref, base_offset, deref_offset + i * size_mul,
-                           row_major, deref->type->matrix_columns);
+            emit_access(is_write, col_deref, base_offset,
+                        deref_offset + i * size_mul,
+                        row_major, deref->type->matrix_columns,
+                        writemask_for_size(col_deref->type->vector_elements));
          }
       }
       return;
    }
 
-   assert(deref->type->is_scalar() ||
-	  deref->type->is_vector());
+   assert(deref->type->is_scalar() || deref->type->is_vector());
 
    if (!row_major) {
-      ir_rvalue *offset = add(base_offset,
-			      new(mem_ctx) ir_constant(deref_offset));
-      base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-				    ubo_load(deref->type, offset)));
+      ir_rvalue *offset =
+         add(base_offset, new(mem_ctx) ir_constant(deref_offset));
+      if (is_write)
+         base_ir->insert_after(ssbo_store(deref, offset, write_mask));
+      else
+         base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                       ubo_load(deref->type, offset)));
    } else {
       unsigned N = deref->type->is_double() ? 8 : 4;
 
@@ -527,22 +630,100 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
       assert(matrix_columns <= 4);
       unsigned matrix_stride = glsl_align(matrix_columns * N, 16);
 
-      const glsl_type *ubo_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
+      const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
          glsl_type::float_type : glsl_type::double_type;
 
       for (unsigned i = 0; i < deref->type->vector_elements; i++) {
-	 ir_rvalue *chan_offset =
-	    add(base_offset,
-		new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));
-
-	 base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-				       ubo_load(ubo_type,
-						chan_offset),
-				       (1U << i)));
+         ir_rvalue *chan_offset =
+            add(base_offset,
+                new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));
+         if (is_write) {
+            base_ir->insert_after(ssbo_store(swizzle(deref, i, 1), chan_offset, 1));
+         } else {
+            base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                          ubo_load(deref_type, chan_offset),
+                                          (1U << i)));
+         }
       }
    }
 }
 
+void
+lower_ubo_reference_visitor::write_to_memory(ir_dereference *deref,
+                                             ir_variable *var,
+                                             ir_variable *write_var,
+                                             unsigned write_mask)
+{
+   ir_rvalue *offset = NULL;
+   unsigned const_offset;
+   bool row_major;
+   int matrix_columns;
+
+   /* Compute the offset to the start if the dereference as well as other
+    * information we need to configure the write
+    */
+   setup_for_load_or_store(var, deref,
+                           &offset, &const_offset,
+                           &row_major, &matrix_columns);
+   assert(offset);
+
+   /* Now emit writes from the temporary to memory */
+   ir_variable *write_offset =
+      new(mem_ctx) ir_variable(glsl_type::uint_type,
+                               "ssbo_store_temp_offset",
+                               ir_var_temporary);
+
+   base_ir->insert_before(write_offset);
+   base_ir->insert_before(assign(write_offset, offset));
+
+   deref = new(mem_ctx) ir_dereference_variable(write_var);
+   emit_access(true, deref, write_offset, const_offset,
+               row_major, matrix_columns, write_mask);
+}
+
+void
+lower_ubo_reference_visitor::check_for_ssbo_store(ir_assignment *ir)
+{
+   if (!ir || !ir->lhs)
+      return;
+
+   ir_rvalue *rvalue = ir->lhs->as_rvalue();
+   if (!rvalue)
+      return;
+
+   ir_dereference *deref = ir->lhs->as_dereference();
+   if (!deref)
+      return;
+
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (!var || !var->is_in_buffer_block())
+      return;
+
+   /* We have a write to a buffer variable, so declare a temporary and rewrite
+    * the assignment so that the temporary is the LHS.
+    */
+   mem_ctx = ralloc_parent(shader->ir);
+
+   const glsl_type *type = rvalue->type;
+   ir_variable *write_var = new(mem_ctx) ir_variable(type,
+                                                     "ssbo_store_temp",
+                                                     ir_var_temporary);
+   base_ir->insert_before(write_var);
+   ir->lhs = new(mem_ctx) ir_dereference_variable(write_var);
+
+   /* Now we have to write the value assigned to the temporary back to memory */
+   write_to_memory(deref, var, write_var, ir->write_mask);
+   progress = true;
+}
+
+
+ir_visitor_status
+lower_ubo_reference_visitor::visit_enter(ir_assignment *ir)
+{
+   check_for_ssbo_store(ir);
+   return rvalue_visit(ir);
+}
+
 } /* unnamed namespace */
 
 void

From ea633db65ffa684ea5237b8cb5bd96fbc1a7769a Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Wed, 8 Jul 2015 17:30:44 +0200
Subject: [PATCH 0322/1208] glsl: Lower shader storage buffer object loads to
 GLSL IR instrinsics

Extend the existing lower_ubo_reference pass to also detect SSBO loads
and lower them to __intrinsic_load_ssbo intrinsics.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/glsl/lower_ubo_reference.cpp | 73 ++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index ab161cca8b6..8b0810781fe 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -150,7 +150,8 @@ public:
                                 int *matrix_columns);
    ir_expression *ubo_load(const struct glsl_type *type,
 			   ir_rvalue *offset);
-
+   ir_call *ssbo_load(const struct glsl_type *type,
+                      ir_rvalue *offset);
 
    void check_for_ssbo_store(ir_assignment *ir);
    void write_to_memory(ir_dereference *deref,
@@ -170,6 +171,7 @@ public:
    struct gl_uniform_buffer_variable *ubo_var;
    ir_rvalue *uniform_block;
    bool progress;
+   bool is_shader_storage;
 };
 
 /**
@@ -266,6 +268,8 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
             this->uniform_block = index;
          }
 
+         this->is_shader_storage = shader->UniformBlocks[i].IsShaderStorage;
+
          struct gl_uniform_block *block = &shader->UniformBlocks[i];
 
          this->ubo_var = var->is_interface_instance()
@@ -415,7 +419,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
    if (!var || !var->is_in_buffer_block())
       return;
 
-   mem_ctx = ralloc_parent(*rvalue);
+   mem_ctx = ralloc_parent(shader->ir);
 
    ir_rvalue *offset = NULL;
    unsigned const_offset;
@@ -512,6 +516,42 @@ lower_ubo_reference_visitor::ssbo_store(ir_rvalue *deref,
    return new(mem_ctx) ir_call(sig, NULL, &call_params);
 }
 
+ir_call *
+lower_ubo_reference_visitor::ssbo_load(const struct glsl_type *type,
+                                       ir_rvalue *offset)
+{
+   exec_list sig_params;
+
+   ir_variable *block_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "block_ref" , ir_var_function_in);
+   sig_params.push_tail(block_ref);
+
+   ir_variable *offset_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "offset_ref" , ir_var_function_in);
+   sig_params.push_tail(offset_ref);
+
+   ir_function_signature *sig =
+      new(mem_ctx) ir_function_signature(type, shader_storage_buffer_object);
+   assert(sig);
+   sig->replace_parameters(&sig_params);
+   sig->is_intrinsic = true;
+
+   ir_function *f = new(mem_ctx) ir_function("__intrinsic_load_ssbo");
+   f->add_signature(sig);
+
+   ir_variable *result = new(mem_ctx)
+      ir_variable(type, "ssbo_load_result", ir_var_temporary);
+   base_ir->insert_before(result);
+   ir_dereference_variable *deref_result = new(mem_ctx)
+      ir_dereference_variable(result);
+
+   exec_list call_params;
+   call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
+   call_params.push_tail(offset->clone(mem_ctx, NULL));
+
+   return new(mem_ctx) ir_call(sig, deref_result, &call_params);
+}
+
 static inline int
 writemask_for_size(unsigned n)
 {
@@ -610,9 +650,17 @@ lower_ubo_reference_visitor::emit_access(bool is_write,
          add(base_offset, new(mem_ctx) ir_constant(deref_offset));
       if (is_write)
          base_ir->insert_after(ssbo_store(deref, offset, write_mask));
-      else
-         base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-                                       ubo_load(deref->type, offset)));
+      else {
+         if (!this->is_shader_storage) {
+             base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                           ubo_load(deref->type, offset)));
+         } else {
+            ir_call *load_ssbo = ssbo_load(deref->type, offset);
+            base_ir->insert_before(load_ssbo);
+            ir_rvalue *value = load_ssbo->return_deref->as_rvalue()->clone(mem_ctx, NULL);
+            base_ir->insert_before(assign(deref->clone(mem_ctx, NULL), value));
+         }
+      }
    } else {
       unsigned N = deref->type->is_double() ? 8 : 4;
 
@@ -640,9 +688,18 @@ lower_ubo_reference_visitor::emit_access(bool is_write,
          if (is_write) {
             base_ir->insert_after(ssbo_store(swizzle(deref, i, 1), chan_offset, 1));
          } else {
-            base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-                                          ubo_load(deref_type, chan_offset),
-                                          (1U << i)));
+            if (!this->is_shader_storage) {
+               base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                             ubo_load(deref_type, chan_offset),
+                                             (1U << i)));
+            } else {
+               ir_call *load_ssbo = ssbo_load(deref_type, chan_offset);
+               base_ir->insert_before(load_ssbo);
+               ir_rvalue *value = load_ssbo->return_deref->as_rvalue()->clone(mem_ctx, NULL);
+               base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                             value,
+                                             (1U << i)));
+            }
          }
       }
    }

From 759ed0bd03818c912e7f1fa62bafc50ef52ef291 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 13 Jul 2015 15:40:41 -0700
Subject: [PATCH 0323/1208] i965: Mark constant static data as const.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_curbe.c       |  2 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c | 44 ++++++++++-----------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index befd7a9538c..a149ce3ba12 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -176,7 +176,7 @@ void brw_upload_cs_urb_state(struct brw_context *brw)
    ADVANCE_BATCH();
 }
 
-static GLfloat fixed_plane[6][4] = {
+static const GLfloat fixed_plane[6][4] = {
    { 0,    0,   -1, 1 },
    { 0,    0,    1, 1 },
    { 0,   -1,    0, 1 },
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 320e40e1007..e0b55c6b620 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -40,7 +40,7 @@
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
-static GLuint double_types[5] = {
+static const GLuint double_types[5] = {
    0,
    BRW_SURFACEFORMAT_R64_FLOAT,
    BRW_SURFACEFORMAT_R64G64_FLOAT,
@@ -48,7 +48,7 @@ static GLuint double_types[5] = {
    BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
 };
 
-static GLuint float_types[5] = {
+static const GLuint float_types[5] = {
    0,
    BRW_SURFACEFORMAT_R32_FLOAT,
    BRW_SURFACEFORMAT_R32G32_FLOAT,
@@ -56,7 +56,7 @@ static GLuint float_types[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
 };
 
-static GLuint half_float_types[5] = {
+static const GLuint half_float_types[5] = {
    0,
    BRW_SURFACEFORMAT_R16_FLOAT,
    BRW_SURFACEFORMAT_R16G16_FLOAT,
@@ -64,7 +64,7 @@ static GLuint half_float_types[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_FLOAT
 };
 
-static GLuint fixed_point_types[5] = {
+static const GLuint fixed_point_types[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SFIXED,
    BRW_SURFACEFORMAT_R32G32_SFIXED,
@@ -72,7 +72,7 @@ static GLuint fixed_point_types[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SFIXED,
 };
 
-static GLuint uint_types_direct[5] = {
+static const GLuint uint_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R32_UINT,
    BRW_SURFACEFORMAT_R32G32_UINT,
@@ -80,7 +80,7 @@ static GLuint uint_types_direct[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_UINT
 };
 
-static GLuint uint_types_norm[5] = {
+static const GLuint uint_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R32_UNORM,
    BRW_SURFACEFORMAT_R32G32_UNORM,
@@ -88,7 +88,7 @@ static GLuint uint_types_norm[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_UNORM
 };
 
-static GLuint uint_types_scale[5] = {
+static const GLuint uint_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R32_USCALED,
    BRW_SURFACEFORMAT_R32G32_USCALED,
@@ -96,7 +96,7 @@ static GLuint uint_types_scale[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_USCALED
 };
 
-static GLuint int_types_direct[5] = {
+static const GLuint int_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SINT,
    BRW_SURFACEFORMAT_R32G32_SINT,
@@ -104,7 +104,7 @@ static GLuint int_types_direct[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SINT
 };
 
-static GLuint int_types_norm[5] = {
+static const GLuint int_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SNORM,
    BRW_SURFACEFORMAT_R32G32_SNORM,
@@ -112,7 +112,7 @@ static GLuint int_types_norm[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SNORM
 };
 
-static GLuint int_types_scale[5] = {
+static const GLuint int_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SSCALED,
    BRW_SURFACEFORMAT_R32G32_SSCALED,
@@ -120,7 +120,7 @@ static GLuint int_types_scale[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
 };
 
-static GLuint ushort_types_direct[5] = {
+static const GLuint ushort_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R16_UINT,
    BRW_SURFACEFORMAT_R16G16_UINT,
@@ -128,7 +128,7 @@ static GLuint ushort_types_direct[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_UINT
 };
 
-static GLuint ushort_types_norm[5] = {
+static const GLuint ushort_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R16_UNORM,
    BRW_SURFACEFORMAT_R16G16_UNORM,
@@ -136,7 +136,7 @@ static GLuint ushort_types_norm[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_UNORM
 };
 
-static GLuint ushort_types_scale[5] = {
+static const GLuint ushort_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R16_USCALED,
    BRW_SURFACEFORMAT_R16G16_USCALED,
@@ -144,7 +144,7 @@ static GLuint ushort_types_scale[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_USCALED
 };
 
-static GLuint short_types_direct[5] = {
+static const GLuint short_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SINT,
    BRW_SURFACEFORMAT_R16G16_SINT,
@@ -152,7 +152,7 @@ static GLuint short_types_direct[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SINT
 };
 
-static GLuint short_types_norm[5] = {
+static const GLuint short_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SNORM,
    BRW_SURFACEFORMAT_R16G16_SNORM,
@@ -160,7 +160,7 @@ static GLuint short_types_norm[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SNORM
 };
 
-static GLuint short_types_scale[5] = {
+static const GLuint short_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SSCALED,
    BRW_SURFACEFORMAT_R16G16_SSCALED,
@@ -168,7 +168,7 @@ static GLuint short_types_scale[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
 };
 
-static GLuint ubyte_types_direct[5] = {
+static const GLuint ubyte_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R8_UINT,
    BRW_SURFACEFORMAT_R8G8_UINT,
@@ -176,7 +176,7 @@ static GLuint ubyte_types_direct[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_UINT
 };
 
-static GLuint ubyte_types_norm[5] = {
+static const GLuint ubyte_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R8_UNORM,
    BRW_SURFACEFORMAT_R8G8_UNORM,
@@ -184,7 +184,7 @@ static GLuint ubyte_types_norm[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_UNORM
 };
 
-static GLuint ubyte_types_scale[5] = {
+static const GLuint ubyte_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R8_USCALED,
    BRW_SURFACEFORMAT_R8G8_USCALED,
@@ -192,7 +192,7 @@ static GLuint ubyte_types_scale[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_USCALED
 };
 
-static GLuint byte_types_direct[5] = {
+static const GLuint byte_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SINT,
    BRW_SURFACEFORMAT_R8G8_SINT,
@@ -200,7 +200,7 @@ static GLuint byte_types_direct[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_SINT
 };
 
-static GLuint byte_types_norm[5] = {
+static const GLuint byte_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SNORM,
    BRW_SURFACEFORMAT_R8G8_SNORM,
@@ -208,7 +208,7 @@ static GLuint byte_types_norm[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_SNORM
 };
 
-static GLuint byte_types_scale[5] = {
+static const GLuint byte_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SSCALED,
    BRW_SURFACEFORMAT_R8G8_SSCALED,

From ab80519b3cd08401dff2d07343064a27f32b33ca Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 29 Jun 2015 22:32:03 -0700
Subject: [PATCH 0324/1208] vc4: Add perf debug for when we wait on BOs.

---
 src/gallium/drivers/vc4/vc4_bufmgr.c | 110 +++++++++++++++++----------
 src/gallium/drivers/vc4/vc4_bufmgr.h |   5 +-
 src/gallium/drivers/vc4/vc4_fence.c  |   2 +-
 src/gallium/drivers/vc4/vc4_job.c    |   2 +-
 4 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index cbdb9e89cf6..499b5ce146e 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
                  * allocate something new instead, since we assume that the
                  * user will proceed to CPU map it and fill it with stuff.
                  */
-                if (!vc4_bo_wait(bo, 0)) {
+                if (!vc4_bo_wait(bo, 0, NULL)) {
                         pipe_mutex_unlock(cache->lock);
                         return NULL;
                 }
@@ -413,63 +413,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
         return true;
 }
 
+static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns)
+{
+        if (using_vc4_simulator)
+                return 0;
+
+        struct drm_vc4_wait_seqno wait = {
+                .seqno = seqno,
+                .timeout_ns = timeout_ns,
+        };
+        int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
+        if (ret == -1)
+                return -errno;
+        else
+                return 0;
+
+}
+
 bool
-vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns)
+vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+               const char *reason)
 {
         if (screen->finished_seqno >= seqno)
                 return true;
 
-        struct drm_vc4_wait_seqno wait;
-        memset(&wait, 0, sizeof(wait));
-        wait.seqno = seqno;
-        wait.timeout_ns = timeout_ns;
-
-        int ret;
-        if (!using_vc4_simulator)
-                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
-        else {
-                wait.seqno = screen->finished_seqno;
-                ret = 0;
+        if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) {
+                if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) == -ETIME) {
+                        fprintf(stderr, "Blocking on seqno %lld for %s\n",
+                                (long long)seqno, reason);
+                }
         }
 
-        if (ret == 0) {
-                screen->finished_seqno = wait.seqno;
-                return true;
+        int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns);
+        if (ret) {
+                if (ret != -ETIME) {
+                        fprintf(stderr, "wait failed: %d\n", ret);
+                        abort();
+                }
+
+                return false;
         }
 
-        if (errno != ETIME) {
-                fprintf(stderr, "wait failed: %d\n", ret);
-                abort();
-        }
+        screen->finished_seqno = seqno;
+        return true;
+}
+
+static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
+{
+        if (using_vc4_simulator)
+                return 0;
+
+        struct drm_vc4_wait_bo wait = {
+                .handle = handle,
+                .timeout_ns = timeout_ns,
+        };
+        int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
+        if (ret == -1)
+                return -errno;
+        else
+                return 0;
 
-        return false;
 }
 
 bool
-vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns)
+vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason)
 {
         struct vc4_screen *screen = bo->screen;
 
-        struct drm_vc4_wait_bo wait;
-        memset(&wait, 0, sizeof(wait));
-        wait.handle = bo->handle;
-        wait.timeout_ns = timeout_ns;
-
-        int ret;
-        if (!using_vc4_simulator)
-                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
-        else
-                ret = 0;
-
-        if (ret == 0)
-                return true;
-
-        if (errno != ETIME) {
-                fprintf(stderr, "wait failed: %d\n", ret);
-                abort();
+        if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) {
+                if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) {
+                        fprintf(stderr, "Blocking on %s BO for %s\n",
+                                bo->name, reason);
+                }
         }
 
-        return false;
+        int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns);
+        if (ret) {
+                if (ret != -ETIME) {
+                        fprintf(stderr, "wait failed: %d\n", ret);
+                        abort();
+                }
+
+                return false;
+        }
+
+        return true;
 }
 
 void *
@@ -515,7 +543,7 @@ vc4_bo_map(struct vc4_bo *bo)
 {
         void *map = vc4_bo_map_unsynchronized(bo);
 
-        bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE);
+        bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map");
         if (!ok) {
                 fprintf(stderr, "BO wait for map failed\n");
                 abort();
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index 7320695ca8e..eb8409afb68 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -113,10 +113,11 @@ void *
 vc4_bo_map_unsynchronized(struct vc4_bo *bo);
 
 bool
-vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns);
+vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason);
 
 bool
-vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns);
+vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+               const char *reason);
 
 void
 vc4_bufmgr_destroy(struct pipe_screen *pscreen);
diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c
index f644bf9a04e..b6fb2a8a460 100644
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -67,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen,
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_fence *f = (struct vc4_fence *)pf;
 
-        return vc4_wait_seqno(screen, f->seqno, timeout_ns);
+        return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait");
 }
 
 struct vc4_fence *
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index dcade15443a..6435dbb333e 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -207,7 +207,7 @@ vc4_job_submit(struct vc4_context *vc4)
 
         if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) {
                 if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno,
-                                    PIPE_TIMEOUT_INFINITE)) {
+                                    PIPE_TIMEOUT_INFINITE, "sync")) {
                         fprintf(stderr, "Wait failed.\n");
                         abort();
                 }

From e4c540f6d09390013a9cb66060a29f236ad7dcfc Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 9 Jul 2015 22:42:22 -0700
Subject: [PATCH 0325/1208] vc4: Store reloc pointers as pointers, not offsets.

Now that we don't resize the CL as we build (it's set up at the top by
vc4_start_draw()), we can store the pointers instead of offsets from
the base.  Saves a bit of math in emitting relocs (about 60 bytes of
code).
---
 src/gallium/drivers/vc4/vc4_cl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 4a50e790942..4974da1530f 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -36,8 +36,8 @@ struct vc4_bo;
 struct vc4_cl {
         void *base;
         void *next;
+        void *reloc_next;
         uint32_t size;
-        uint32_t reloc_next;
         uint32_t reloc_count;
 };
 
@@ -128,7 +128,7 @@ cl_start_reloc(struct vc4_cl *cl, uint32_t n)
         cl->reloc_count = n;
 
         cl_u8(cl, VC4_PACKET_GEM_HANDLES);
-        cl->reloc_next = cl->next - cl->base;
+        cl->reloc_next = cl->next;
         cl_u32(cl, 0); /* Space where hindex will be written. */
         cl_u32(cl, 0); /* Space where hindex will be written. */
 }
@@ -138,7 +138,7 @@ cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 {
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
-        cl->reloc_next = cl->next - cl->base;
+        cl->reloc_next = cl->next;
 
         /* Space where hindex will be written. */
         cl->next += n * 4;
@@ -147,7 +147,7 @@ cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
 {
-        *(uint32_t *)(cl->base + cl->reloc_next) = hindex;
+        *(uint32_t *)cl->reloc_next = hindex;
         cl->reloc_next += 4;
 
         cl->reloc_count--;
@@ -158,7 +158,7 @@ cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
 static inline void
 cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
 {
-        *(uint32_t *)(cl->base + cl->reloc_next) = hindex;
+        *(uint32_t *)cl->reloc_next = hindex;
         cl->reloc_next += 4;
 
         cl->reloc_count--;

From 748bf459b46b44e184ee1d425ce612da61a0800e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 9 Jul 2015 22:48:17 -0700
Subject: [PATCH 0326/1208] vc4: Drop separate cl*_reloc_hindex().

Now that RCL generation is in the kernel, we don't have any other
callers.  Oddly, the compiler generates another 8 bytes of code for
this, but the simplification is worth it.
---
 src/gallium/drivers/vc4/vc4_cl.h | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 4974da1530f..3aa4721a414 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -145,9 +145,10 @@ cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 }
 
 static inline void
-cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
+cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
+         struct vc4_bo *bo, uint32_t offset)
 {
-        *(uint32_t *)cl->reloc_next = hindex;
+        *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
         cl->reloc_next += 4;
 
         cl->reloc_count--;
@@ -156,9 +157,10 @@ cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
 }
 
 static inline void
-cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
+cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
+         struct vc4_bo *bo, uint32_t offset)
 {
-        *(uint32_t *)cl->reloc_next = hindex;
+        *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
         cl->reloc_next += 4;
 
         cl->reloc_count--;
@@ -166,20 +168,6 @@ cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
         cl_aligned_u32(cl, offset);
 }
 
-static inline void
-cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
-         struct vc4_bo *bo, uint32_t offset)
-{
-        cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
-}
-
-static inline void
-cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
-         struct vc4_bo *bo, uint32_t offset)
-{
-        cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
-}
-
 void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
 
 #endif /* VC4_CL_H */

From a0d3915663fb7cbd3c1a5561450e256e00ecf11b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 14:46:42 -0700
Subject: [PATCH 0327/1208] vc4: Make a helper function for getting the current
 offset in the CL.

I needed to rewrite this a bit for safety checking in the next commit.
Despite being a static inline of the same thing that was being done, we
lose 36 bytes of code for some reason.
---
 src/gallium/drivers/vc4/vc4_cl.c      |  9 ++++-----
 src/gallium/drivers/vc4/vc4_cl.h      | 15 ++++++++++-----
 src/gallium/drivers/vc4/vc4_context.c |  3 +--
 src/gallium/drivers/vc4/vc4_job.c     | 14 ++++++--------
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 0700e885cbf..97f6b89024c 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl)
 void
 cl_ensure_space(struct vc4_cl *cl, uint32_t space)
 {
-        if ((cl->next - cl->base) + space <= cl->size)
+        uint32_t offset = cl_offset(cl);
+
+        if (offset + space <= cl->size)
                 return;
 
         uint32_t size = MAX2(cl->size + space, cl->size * 2);
-        uint32_t offset = cl->next -cl->base;
 
         cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size);
         cl->size = size;
@@ -60,9 +61,7 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo)
         uint32_t hindex;
         uint32_t *current_handles = vc4->bo_handles.base;
 
-        for (hindex = 0;
-             hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4;
-             hindex++) {
+        for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) {
                 if (current_handles[hindex] == bo->handle)
                         return hindex;
         }
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 3aa4721a414..b914745ed4f 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -49,6 +49,11 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
 struct PACKED unaligned_16 { uint16_t x; };
 struct PACKED unaligned_32 { uint32_t x; };
 
+static inline uint32_t cl_offset(struct vc4_cl *cl)
+{
+        return (char *)cl->next - (char *)cl->base;
+}
+
 static inline void
 put_unaligned_32(void *ptr, uint32_t val)
 {
@@ -66,7 +71,7 @@ put_unaligned_16(void *ptr, uint16_t val)
 static inline void
 cl_u8(struct vc4_cl *cl, uint8_t n)
 {
-        assert((cl->next - cl->base) + 1 <= cl->size);
+        assert(cl_offset(cl) + 1 <= cl->size);
 
         *(uint8_t *)cl->next = n;
         cl->next++;
@@ -75,7 +80,7 @@ cl_u8(struct vc4_cl *cl, uint8_t n)
 static inline void
 cl_u16(struct vc4_cl *cl, uint16_t n)
 {
-        assert((cl->next - cl->base) + 2 <= cl->size);
+        assert(cl_offset(cl) + 2 <= cl->size);
 
         put_unaligned_16(cl->next, n);
         cl->next += 2;
@@ -84,7 +89,7 @@ cl_u16(struct vc4_cl *cl, uint16_t n)
 static inline void
 cl_u32(struct vc4_cl *cl, uint32_t n)
 {
-        assert((cl->next - cl->base) + 4 <= cl->size);
+        assert(cl_offset(cl) + 4 <= cl->size);
 
         put_unaligned_32(cl->next, n);
         cl->next += 4;
@@ -93,7 +98,7 @@ cl_u32(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_aligned_u32(struct vc4_cl *cl, uint32_t n)
 {
-        assert((cl->next - cl->base) + 4 <= cl->size);
+        assert(cl_offset(cl) + 4 <= cl->size);
 
         *(uint32_t *)cl->next = n;
         cl->next += 4;
@@ -102,7 +107,7 @@ cl_aligned_u32(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_ptr(struct vc4_cl *cl, void *ptr)
 {
-        assert((cl->next - cl->base) + sizeof(void *) <= cl->size);
+        assert(cl_offset(cl) + sizeof(void *) <= cl->size);
 
         *(void **)cl->next = ptr;
         cl->next += sizeof(void *);
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 316598f0a9d..60da218e59e 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -128,8 +128,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo)
          * they match.
          */
         struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
-        for (int i = 0; i < (vc4->bo_handles.next -
-                             vc4->bo_handles.base) / 4; i++) {
+        for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) {
                 if (referenced_bos[i] == bo) {
                         return true;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 6435dbb333e..7ebd9f160eb 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -44,8 +44,7 @@ void
 vc4_job_reset(struct vc4_context *vc4)
 {
         struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
-        for (int i = 0; i < (vc4->bo_handles.next -
-                             vc4->bo_handles.base) / 4; i++) {
+        for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) {
                 vc4_bo_unreference(&referenced_bos[i]);
         }
         vc4_reset_cl(&vc4->bcl);
@@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4)
 {
         if (vc4_debug & VC4_DEBUG_CL) {
                 fprintf(stderr, "BCL:\n");
-                vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false);
+                vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false);
         }
 
         struct drm_vc4_submit_cl submit;
@@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4)
                                      vc4->zs_write, true, true);
 
         submit.bo_handles = (uintptr_t)vc4->bo_handles.base;
-        submit.bo_handle_count = (vc4->bo_handles.next -
-                                  vc4->bo_handles.base) / 4;
+        submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4;
         submit.bin_cl = (uintptr_t)vc4->bcl.base;
-        submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base;
+        submit.bin_cl_size = cl_offset(&vc4->bcl);
         submit.shader_rec = (uintptr_t)vc4->shader_rec.base;
-        submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base;
+        submit.shader_rec_size = cl_offset(&vc4->shader_rec);
         submit.shader_rec_count = vc4->shader_rec_count;
         submit.uniforms = (uintptr_t)vc4->uniforms.base;
-        submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base;
+        submit.uniforms_size = cl_offset(&vc4->uniforms);
 
         assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
         submit.min_x_tile = vc4->draw_min_x / 64;

From 7432017f65174e82a3de7afef3e4e6f60932356c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 9 Jul 2015 22:51:06 -0700
Subject: [PATCH 0328/1208] vc4: Rework cl handling to be friendlier to the
 compiler.

Drops 680 bytes of code, from avoiding a bunch of extra updates to the
next pointer in the struct.
---
 src/gallium/drivers/vc4/vc4_cl.c      |  11 ++-
 src/gallium/drivers/vc4/vc4_cl.h      | 113 +++++++++++++++-----------
 src/gallium/drivers/vc4/vc4_context.c |   6 +-
 src/gallium/drivers/vc4/vc4_draw.c    | 105 +++++++++++++-----------
 src/gallium/drivers/vc4/vc4_emit.c    |  57 +++++++------
 src/gallium/drivers/vc4/vc4_program.c |  59 ++++++++------
 6 files changed, 201 insertions(+), 150 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 97f6b89024c..ced4f2dfa86 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -66,8 +66,15 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo)
                         return hindex;
         }
 
-        cl_u32(&vc4->bo_handles, bo->handle);
-        cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo));
+        struct vc4_cl_out *out;
+
+        out = cl_start(&vc4->bo_handles);
+        cl_u32(&out, bo->handle);
+        cl_end(&vc4->bo_handles, out);
+
+        out = cl_start(&vc4->bo_pointers);
+        cl_ptr(&out, vc4_bo_reference(bo));
+        cl_end(&vc4->bo_pointers, out);
 
         return hindex;
 }
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index b914745ed4f..95f1a531d34 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -33,10 +33,16 @@
 
 struct vc4_bo;
 
+/**
+ * Undefined structure, used for typechecking that you're passing the pointers
+ * to these functions correctly.
+ */
+struct vc4_cl_out;
+
 struct vc4_cl {
         void *base;
-        void *next;
-        void *reloc_next;
+        struct vc4_cl_out *next;
+        struct vc4_cl_out *reloc_next;
         uint32_t size;
         uint32_t reloc_count;
 };
@@ -55,122 +61,135 @@ static inline uint32_t cl_offset(struct vc4_cl *cl)
 }
 
 static inline void
-put_unaligned_32(void *ptr, uint32_t val)
+cl_advance(struct vc4_cl_out **cl, uint32_t n)
 {
-        struct unaligned_32 *p = ptr;
+        (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n);
+}
+
+static inline struct vc4_cl_out *
+cl_start(struct vc4_cl *cl)
+{
+        return cl->next;
+}
+
+static inline void
+cl_end(struct vc4_cl *cl, struct vc4_cl_out *next)
+{
+        cl->next = next;
+        assert(cl_offset(cl) <= cl->size);
+}
+
+
+static inline void
+put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val)
+{
+        struct unaligned_32 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-put_unaligned_16(void *ptr, uint16_t val)
+put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val)
 {
-        struct unaligned_16 *p = ptr;
+        struct unaligned_16 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-cl_u8(struct vc4_cl *cl, uint8_t n)
+cl_u8(struct vc4_cl_out **cl, uint8_t n)
 {
-        assert(cl_offset(cl) + 1 <= cl->size);
-
-        *(uint8_t *)cl->next = n;
-        cl->next++;
+        *(uint8_t *)(*cl) = n;
+        cl_advance(cl, 1);
 }
 
 static inline void
-cl_u16(struct vc4_cl *cl, uint16_t n)
+cl_u16(struct vc4_cl_out **cl, uint16_t n)
 {
-        assert(cl_offset(cl) + 2 <= cl->size);
-
-        put_unaligned_16(cl->next, n);
-        cl->next += 2;
+        put_unaligned_16(*cl, n);
+        cl_advance(cl, 2);
 }
 
 static inline void
-cl_u32(struct vc4_cl *cl, uint32_t n)
+cl_u32(struct vc4_cl_out **cl, uint32_t n)
 {
-        assert(cl_offset(cl) + 4 <= cl->size);
-
-        put_unaligned_32(cl->next, n);
-        cl->next += 4;
+        put_unaligned_32(*cl, n);
+        cl_advance(cl, 4);
 }
 
 static inline void
-cl_aligned_u32(struct vc4_cl *cl, uint32_t n)
+cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n)
 {
-        assert(cl_offset(cl) + 4 <= cl->size);
-
-        *(uint32_t *)cl->next = n;
-        cl->next += 4;
+        *(uint32_t *)(*cl) = n;
+        cl_advance(cl, 4);
 }
 
 static inline void
-cl_ptr(struct vc4_cl *cl, void *ptr)
+cl_ptr(struct vc4_cl_out **cl, void *ptr)
 {
-        assert(cl_offset(cl) + sizeof(void *) <= cl->size);
-
-        *(void **)cl->next = ptr;
-        cl->next += sizeof(void *);
+        *(struct vc4_cl_out **)(*cl) = ptr;
+        cl_advance(cl, sizeof(void *));
 }
 
 static inline void
-cl_f(struct vc4_cl *cl, float f)
+cl_f(struct vc4_cl_out **cl, float f)
 {
         cl_u32(cl, fui(f));
 }
 
 static inline void
-cl_aligned_f(struct vc4_cl *cl, float f)
+cl_aligned_f(struct vc4_cl_out **cl, float f)
 {
         cl_aligned_u32(cl, fui(f));
 }
 
 static inline void
-cl_start_reloc(struct vc4_cl *cl, uint32_t n)
+cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
 {
         assert(n == 1 || n == 2);
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
 
-        cl_u8(cl, VC4_PACKET_GEM_HANDLES);
-        cl->reloc_next = cl->next;
-        cl_u32(cl, 0); /* Space where hindex will be written. */
-        cl_u32(cl, 0); /* Space where hindex will be written. */
+        cl_u8(out, VC4_PACKET_GEM_HANDLES);
+        cl->reloc_next = *out;
+        cl_u32(out, 0); /* Space where hindex will be written. */
+        cl_u32(out, 0); /* Space where hindex will be written. */
 }
 
-static inline void
+static inline struct vc4_cl_out *
 cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 {
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
         cl->reloc_next = cl->next;
 
-        /* Space where hindex will be written. */
-        cl->next += n * 4;
+        /* Reserve the space where hindex will be written. */
+        cl_advance(&cl->next, n * 4);
+
+        return cl->next;
 }
 
 static inline void
-cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
+cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out,
          struct vc4_bo *bo, uint32_t offset)
 {
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
-        cl->reloc_next += 4;
+        cl_advance(&cl->reloc_next, 4);
 
         cl->reloc_count--;
 
-        cl_u32(cl, offset);
+        cl_u32(cl_out, offset);
 }
 
 static inline void
 cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
-         struct vc4_bo *bo, uint32_t offset)
+                 struct vc4_cl_out **cl_out,
+                 struct vc4_bo *bo, uint32_t offset)
 {
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
-        cl->reloc_next += 4;
+        cl_advance(&cl->reloc_next, 4);
 
         cl->reloc_count--;
 
-        cl_aligned_u32(cl, offset);
+        cl_aligned_u32(cl_out, offset);
 }
 
 void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 60da218e59e..fff63158c9d 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx)
          * FLUSH completes.
          */
         cl_ensure_space(&vc4->bcl, 8);
-        cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
+        cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
         /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */
-        cl_u8(&vc4->bcl, VC4_PACKET_FLUSH);
+        cl_u8(&bcl, VC4_PACKET_FLUSH);
+        cl_end(&vc4->bcl, bcl);
 
         if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
                 pipe_surface_reference(&vc4->color_write, cbuf);
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 5e6d70d6f33..fc3c2321abb 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4)
         uint32_t height = vc4->framebuffer.height;
         uint32_t tilew = align(width, 64) / 64;
         uint32_t tileh = align(height, 64) / 64;
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
 
         //   Tile state data is 48 bytes per tile, I think it can be thrown away
         //   as soon as binning is finished.
-        cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
-        cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */
-        cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */
-        cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */
-        cl_u8(&vc4->bcl, tilew);
-        cl_u8(&vc4->bcl, tileh);
-        cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */
+        cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
+        cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */
+        cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */
+        cl_u32(&bcl, 0); /* tile state addr, filled by kernel */
+        cl_u8(&bcl, tilew);
+        cl_u8(&bcl, tileh);
+        cl_u8(&bcl, 0); /* flags, filled by kernel. */
 
         /* START_TILE_BINNING resets the statechange counters in the hardware,
          * which are what is used when a primitive is binned to a tile to
          * figure out what new state packets need to be written to that tile's
          * command list.
          */
-        cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING);
+        cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING);
 
         /* Reset the current compressed primitives format.  This gets modified
          * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
          * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
          * of every tile.
          */
-        cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
-        cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
-                          VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
+        cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
+        cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
+                     VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
 
         vc4->needs_flush = true;
         vc4->draw_call_queued = true;
         vc4->draw_width = width;
         vc4->draw_height = height;
+
+        cl_end(&vc4->bcl, bcl);
 }
 
 static void
@@ -167,28 +170,29 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
          */
         uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
         /* Emit the shader record. */
-        cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit);
-        cl_u16(&vc4->shader_rec,
+        struct vc4_cl_out *shader_rec =
+                cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit);
+        cl_u16(&shader_rec,
                VC4_SHADER_FLAG_ENABLE_CLIPPING |
                ((info->mode == PIPE_PRIM_POINTS &&
                  vc4->rasterizer->base.point_size_per_vertex) ?
                 VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
-        cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */
-        cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
+        cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */
+        cl_u8(&shader_rec, vc4->prog.fs->num_inputs);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
-        cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */
-        cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
+        cl_u16(&shader_rec, 0); /* vs num uniforms */
+        cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
-        cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */
-        cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
+        cl_u16(&shader_rec, 0); /* cs num uniforms */
+        cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         uint32_t max_index = 0xffff;
         uint32_t vpm_offset = 0;
@@ -202,11 +206,11 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 uint32_t elem_size =
                         util_format_get_blocksize(elem->src_format);
 
-                cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset);
-                cl_u8(&vc4->shader_rec, elem_size - 1);
-                cl_u8(&vc4->shader_rec, vb->stride);
-                cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]);
-                cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]);
+                cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset);
+                cl_u8(&shader_rec, elem_size - 1);
+                cl_u8(&shader_rec, vb->stride);
+                cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]);
+                cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]);
 
                 vpm_offset += align(elem_size, 4);
 
@@ -219,21 +223,23 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         if (vtx->num_elements == 0) {
                 assert(num_elements_emit == 1);
                 struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
-                cl_reloc(vc4, &vc4->shader_rec, bo, 0);
-                cl_u8(&vc4->shader_rec, 16 - 1); /* element size */
-                cl_u8(&vc4->shader_rec, 0); /* stride */
-                cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */
-                cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */
+                cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0);
+                cl_u8(&shader_rec, 16 - 1); /* element size */
+                cl_u8(&shader_rec, 0); /* stride */
+                cl_u8(&shader_rec, 0); /* VS VPM offset */
+                cl_u8(&shader_rec, 0); /* CS VPM offset */
                 vc4_bo_unreference(&bo);
         }
+        cl_end(&vc4->shader_rec, shader_rec);
 
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         /* the actual draw call. */
-        cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE);
+        cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE);
         assert(vtx->num_elements <= 8);
         /* Note that number of attributes == 0 in the packet means 8
          * attributes.  This field also contains the offset into shader_rec.
          */
-        cl_u32(&vc4->bcl, num_elements_emit & 0x7);
+        cl_u32(&bcl, num_elements_emit & 0x7);
 
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
@@ -251,25 +257,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 }
                 struct vc4_resource *rsc = vc4_resource(prsc);
 
-                cl_start_reloc(&vc4->bcl, 1);
-                cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
-                cl_u8(&vc4->bcl,
+                cl_start_reloc(&vc4->bcl, &bcl, 1);
+                cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
+                cl_u8(&bcl,
                       info->mode |
                       (index_size == 2 ?
                        VC4_INDEX_BUFFER_U16:
                        VC4_INDEX_BUFFER_U8));
-                cl_u32(&vc4->bcl, info->count);
-                cl_reloc(vc4, &vc4->bcl, rsc->bo, offset);
-                cl_u32(&vc4->bcl, max_index);
+                cl_u32(&bcl, info->count);
+                cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset);
+                cl_u32(&bcl, max_index);
 
                 if (vc4->indexbuf.index_size == 4)
                         pipe_resource_reference(&prsc, NULL);
         } else {
-                cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
-                cl_u8(&vc4->bcl, info->mode);
-                cl_u32(&vc4->bcl, info->count);
-                cl_u32(&vc4->bcl, info->start);
+                cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
+                cl_u8(&bcl, info->mode);
+                cl_u32(&bcl, info->count);
+                cl_u32(&bcl, info->start);
         }
+        cl_end(&vc4->bcl, bcl);
 
         if (vc4->zsa && vc4->zsa->base.depth.enabled) {
                 vc4->resolve |= PIPE_CLEAR_DEPTH;
diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c
index d2b54fccf91..f5925734415 100644
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -28,6 +28,7 @@ vc4_emit_state(struct pipe_context *pctx)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
 
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) {
                 float *vpscale = vc4->viewport.scale;
                 float *vptranslate = vc4->viewport.translate;
@@ -40,11 +41,11 @@ vc4_emit_state(struct pipe_context *pctx)
                 uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx);
                 uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW);
-                cl_u16(&vc4->bcl, minx);
-                cl_u16(&vc4->bcl, miny);
-                cl_u16(&vc4->bcl, maxx - minx);
-                cl_u16(&vc4->bcl, maxy - miny);
+                cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW);
+                cl_u16(&bcl, minx);
+                cl_u16(&bcl, miny);
+                cl_u16(&bcl, maxx - minx);
+                cl_u16(&bcl, maxy - miny);
 
                 vc4->draw_min_x = MIN2(vc4->draw_min_x, minx);
                 vc4->draw_min_y = MIN2(vc4->draw_min_y, miny);
@@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx)
         }
 
         if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) {
-                cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS);
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[0] |
                       vc4->zsa->config_bits[0]);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[1] |
                       vc4->zsa->config_bits[1]);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[2] |
                       vc4->zsa->config_bits[2]);
         }
 
         if (vc4->dirty & VC4_DIRTY_RASTERIZER) {
-                cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET);
-                cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor);
-                cl_u16(&vc4->bcl, vc4->rasterizer->offset_units);
+                cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET);
+                cl_u16(&bcl, vc4->rasterizer->offset_factor);
+                cl_u16(&bcl, vc4->rasterizer->offset_units);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE);
-                cl_f(&vc4->bcl, vc4->rasterizer->point_size);
+                cl_u8(&bcl, VC4_PACKET_POINT_SIZE);
+                cl_f(&bcl, vc4->rasterizer->point_size);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH);
-                cl_f(&vc4->bcl, vc4->rasterizer->base.line_width);
+                cl_u8(&bcl, VC4_PACKET_LINE_WIDTH);
+                cl_f(&bcl, vc4->rasterizer->base.line_width);
         }
 
         if (vc4->dirty & VC4_DIRTY_VIEWPORT) {
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING);
-                cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f);
-                cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f);
+                cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING);
+                cl_f(&bcl, vc4->viewport.scale[0] * 16.0f);
+                cl_f(&bcl, vc4->viewport.scale[1] * 16.0f);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING);
-                cl_f(&vc4->bcl, vc4->viewport.translate[2]);
-                cl_f(&vc4->bcl, vc4->viewport.scale[2]);
+                cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING);
+                cl_f(&bcl, vc4->viewport.translate[2]);
+                cl_f(&bcl, vc4->viewport.scale[2]);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET);
-                cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]);
-                cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]);
+                cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET);
+                cl_u16(&bcl, 16 * vc4->viewport.translate[0]);
+                cl_u16(&bcl, 16 * vc4->viewport.translate[1]);
         }
 
         if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) {
-                cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS);
-                cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ?
+                cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS);
+                cl_u32(&bcl, vc4->rasterizer->base.flatshade ?
                        vc4->prog.fs->color_inputs : 0);
         }
+
+        cl_end(&vc4->bcl, bcl);
 }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index a7aa3172a75..e61ea2170ff 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2530,13 +2530,14 @@ static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
 
 static void
 write_texture_p0(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
                  struct vc4_texture_stateobj *texstate,
                  uint32_t unit)
 {
         struct pipe_sampler_view *texture = texstate->textures[unit];
         struct vc4_resource *rsc = vc4_resource(texture->texture);
 
-        cl_reloc(vc4, &vc4->uniforms, rsc->bo,
+        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo,
                  VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
                  VC4_SET_FIELD(texture->u.tex.last_level -
                                texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
@@ -2547,6 +2548,7 @@ write_texture_p0(struct vc4_context *vc4,
 
 static void
 write_texture_p1(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
                  struct vc4_texture_stateobj *texstate,
                  uint32_t unit)
 {
@@ -2570,7 +2572,7 @@ write_texture_p1(struct vc4_context *vc4,
                 (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
                  sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
 
-        cl_aligned_u32(&vc4->uniforms,
+        cl_aligned_u32(uniforms,
                VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
                VC4_SET_FIELD(texture->texture->height0 & 2047,
                              VC4_TEX_P1_HEIGHT) |
@@ -2589,6 +2591,7 @@ write_texture_p1(struct vc4_context *vc4,
 
 static void
 write_texture_p2(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
                  struct vc4_texture_stateobj *texstate,
                  uint32_t data)
 {
@@ -2596,7 +2599,7 @@ write_texture_p2(struct vc4_context *vc4,
         struct pipe_sampler_view *texture = texstate->textures[unit];
         struct vc4_resource *rsc = vc4_resource(texture->texture);
 
-        cl_aligned_u32(&vc4->uniforms,
+        cl_aligned_u32(uniforms,
                VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
                              VC4_TEX_P2_PTYPE) |
                VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
@@ -2613,6 +2616,7 @@ write_texture_p2(struct vc4_context *vc4,
 
 static void
 write_texture_border_color(struct vc4_context *vc4,
+                           struct vc4_cl_out **uniforms,
                            struct vc4_texture_stateobj *texstate,
                            uint32_t unit)
 {
@@ -2673,7 +2677,7 @@ write_texture_border_color(struct vc4_context *vc4,
                 }
         }
 
-        cl_aligned_u32(&vc4->uniforms, uc.ui[0]);
+        cl_aligned_u32(uniforms, uc.ui[0]);
 }
 
 static uint32_t
@@ -2693,7 +2697,8 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate,
 }
 
 static struct vc4_bo *
-vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
+vc4_upload_ubo(struct vc4_context *vc4,
+               struct vc4_compiled_shader *shader,
                const uint32_t *gallium_uniforms)
 {
         if (!shader->ubo_size)
@@ -2722,72 +2727,78 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         cl_ensure_space(&vc4->uniforms, (uinfo->count +
                                          uinfo->num_texture_samples) * 4);
 
-        cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
+        struct vc4_cl_out *uniforms =
+                cl_start_shader_reloc(&vc4->uniforms,
+                                      uinfo->num_texture_samples);
 
         for (int i = 0; i < uinfo->count; i++) {
 
                 switch (uinfo->contents[i]) {
                 case QUNIFORM_CONSTANT:
-                        cl_aligned_u32(&vc4->uniforms, uinfo->data[i]);
+                        cl_aligned_u32(&uniforms, uinfo->data[i]);
                         break;
                 case QUNIFORM_UNIFORM:
-                        cl_aligned_u32(&vc4->uniforms,
+                        cl_aligned_u32(&uniforms,
                                        gallium_uniforms[uinfo->data[i]]);
                         break;
                 case QUNIFORM_VIEWPORT_X_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f);
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
                         break;
                 case QUNIFORM_VIEWPORT_Y_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f);
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
                         break;
 
                 case QUNIFORM_VIEWPORT_Z_OFFSET:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]);
+                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
                         break;
                 case QUNIFORM_VIEWPORT_Z_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]);
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
                         break;
 
                 case QUNIFORM_USER_CLIP_PLANE:
-                        cl_aligned_f(&vc4->uniforms,
+                        cl_aligned_f(&uniforms,
                                      vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
                         break;
 
                 case QUNIFORM_TEXTURE_CONFIG_P0:
-                        write_texture_p0(vc4, texstate, uinfo->data[i]);
+                        write_texture_p0(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
                         break;
 
                 case QUNIFORM_TEXTURE_CONFIG_P1:
-                        write_texture_p1(vc4, texstate, uinfo->data[i]);
+                        write_texture_p1(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
                         break;
 
                 case QUNIFORM_TEXTURE_CONFIG_P2:
-                        write_texture_p2(vc4, texstate, uinfo->data[i]);
+                        write_texture_p2(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
                         break;
 
                 case QUNIFORM_UBO_ADDR:
-                        cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0);
+                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                         break;
 
                 case QUNIFORM_TEXTURE_BORDER_COLOR:
-                        write_texture_border_color(vc4, texstate, uinfo->data[i]);
+                        write_texture_border_color(vc4, &uniforms,
+                                                   texstate, uinfo->data[i]);
                         break;
 
                 case QUNIFORM_TEXRECT_SCALE_X:
                 case QUNIFORM_TEXRECT_SCALE_Y:
-                        cl_aligned_u32(&vc4->uniforms,
+                        cl_aligned_u32(&uniforms,
                                        get_texrect_scale(texstate,
                                                          uinfo->contents[i],
                                                          uinfo->data[i]));
                         break;
 
                 case QUNIFORM_BLEND_CONST_COLOR:
-                        cl_aligned_f(&vc4->uniforms,
+                        cl_aligned_f(&uniforms,
                                      CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
                         break;
 
                 case QUNIFORM_STENCIL:
-                        cl_aligned_u32(&vc4->uniforms,
+                        cl_aligned_u32(&uniforms,
                                        vc4->zsa->stencil_uniforms[uinfo->data[i]] |
                                        (uinfo->data[i] <= 1 ?
                                         (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
@@ -2795,16 +2806,18 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                         break;
 
                 case QUNIFORM_ALPHA_REF:
-                        cl_aligned_f(&vc4->uniforms,
+                        cl_aligned_f(&uniforms,
                                      vc4->zsa->base.alpha.ref_value);
                         break;
                 }
 #if 0
-                uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4);
+                uint32_t written_val = *((uint32_t *)uniforms - 1);
                 fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
                         shader, i, written_val, uif(written_val));
 #endif
         }
+
+        cl_end(&vc4->uniforms, uniforms);
 }
 
 static void

From 3df78928786134874eafa6f68186c8edbbdd3ae7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 16:11:23 -0700
Subject: [PATCH 0329/1208] vc4: Drop reloc_count tracking for debug asserts on
 non-debug builds.

Cuts another 88 bytes of compiled code.
---
 src/gallium/drivers/vc4/vc4_cl.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 95f1a531d34..bf4be0efc29 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -44,7 +44,9 @@ struct vc4_cl {
         struct vc4_cl_out *next;
         struct vc4_cl_out *reloc_next;
         uint32_t size;
+#ifdef DEBUG
         uint32_t reloc_count;
+#endif
 };
 
 void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl);
@@ -145,8 +147,10 @@ static inline void
 cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
 {
         assert(n == 1 || n == 2);
+#ifdef DEBUG
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
+#endif
 
         cl_u8(out, VC4_PACKET_GEM_HANDLES);
         cl->reloc_next = *out;
@@ -157,8 +161,10 @@ cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
 static inline struct vc4_cl_out *
 cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 {
+#ifdef DEBUG
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
+#endif
         cl->reloc_next = cl->next;
 
         /* Reserve the space where hindex will be written. */
@@ -174,7 +180,9 @@ cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out,
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
         cl_advance(&cl->reloc_next, 4);
 
+#ifdef DEBUG
         cl->reloc_count--;
+#endif
 
         cl_u32(cl_out, offset);
 }
@@ -187,7 +195,9 @@ cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
         cl_advance(&cl->reloc_next, 4);
 
+#ifdef DEBUG
         cl->reloc_count--;
+#endif
 
         cl_aligned_u32(cl_out, offset);
 }

From 1e80c9fab98d7de216937a47f8e231f3beb78403 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 17:01:37 -0700
Subject: [PATCH 0330/1208] vc4: Add better debug for register allocation
 failure.

---
 src/gallium/drivers/vc4/vc4_register_allocate.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 3b0b890b66a..73964b48dca 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -270,7 +270,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         }
 
         bool ok = ra_allocate(g);
-        assert(ok);
+        if (!ok) {
+                fprintf(stderr, "Failed to register allocate:\n");
+                qir_dump(c);
+                abort();
+        }
 
         for (uint32_t i = 0; i < c->num_temps; i++) {
                 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];

From cd7dd45bfec9ad68719c5e4e04b66ea4bcc1a2c1 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 17:00:34 -0700
Subject: [PATCH 0331/1208] vc4: Fix compiler warnings on release builds.

---
 src/gallium/drivers/vc4/vc4_program.c      |  2 ++
 src/gallium/drivers/vc4/vc4_qpu_emit.c     |  1 +
 src/gallium/drivers/vc4/vc4_qpu_validate.c |  7 +++++++
 src/gallium/drivers/vc4/vc4_tiling.c       | 11 ++++-------
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index e61ea2170ff..df440f627ec 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1748,6 +1748,7 @@ ntq_setup_inputs(struct vc4_compile *c)
                 unsigned loc = var->data.driver_location;
 
                 assert(array_len == 1);
+                (void)array_len;
                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                   (loc + 1) * 4);
 
@@ -1782,6 +1783,7 @@ ntq_setup_outputs(struct vc4_compile *c)
                 unsigned loc = var->data.driver_location * 4;
 
                 assert(array_len == 1);
+                (void)array_len;
 
                 /* NIR hack to pass through
                  * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 99afe4b8798..e1b3f3ce99a 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         case QFILE_VPM:
                                 assert((int)qinst->src[i].index >=
                                        last_vpm_read_index);
+                                (void)last_vpm_read_index;
                                 last_vpm_read_index = qinst->src[i].index;
                                 src[i] = qpu_ra(QPU_R_VPM);
                                 break;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c
index 8471edbf62c..9cf6841f41c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_validate.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c
@@ -23,6 +23,13 @@
 
 #include "vc4_qpu.h"
 
+#ifdef NDEBUG
+/* Since most of our code is used in assert()s, don't warn about dead code. */
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
 static bool
 writes_reg(uint64_t inst, uint32_t w)
 {
diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c
index f9801c9cefd..cf86eb0fa31 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.c
+++ b/src/gallium/drivers/vc4/vc4_tiling.c
@@ -127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp)
 static void
 check_box_utile_alignment(const struct pipe_box *box, int cpp)
 {
-        uint32_t utile_w = vc4_utile_width(cpp);
-        uint32_t utile_h = vc4_utile_height(cpp);
-
-        assert(!(box->x & (utile_w - 1)));
-        assert(!(box->y & (utile_h - 1)));
-        assert(!(box->width & (utile_w - 1)));
-        assert(!(box->height & (utile_h - 1)));
+        assert(!(box->x & (vc4_utile_width(cpp) - 1)));
+        assert(!(box->y & (vc4_utile_height(cpp) - 1)));
+        assert(!(box->width & (vc4_utile_width(cpp) - 1)));
+        assert(!(box->height & (vc4_utile_height(cpp) - 1)));
 }
 
 static void

From 320089dbd63de3ac1bd3d42ee8cec41837486d8c Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Thu, 11 Jun 2015 19:17:03 -0700
Subject: [PATCH 0332/1208] i965/cs: Initialize GPGPU Thread Count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This field should always be set for gen8. In the bdw PRM, Volume 2d:
Command Reference: Structures under INTERFACE_DESCRIPTOR_DATA, DWORD
6, Bits 9:0, Number of Threads in GPGPU Thread Group:

"This field should not be set to 0 even if the barrier is disabled,
since an accurate value is needed for proper pre-emption."

In the HSW PRM, the it doesn't mention that it must always be set, but
it should not hurt.

Reported-by: Kristian Høgsberg <krh@bitplanet.net>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_cs.cpp    | 20 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_defines.h |  5 +++++
 2 files changed, 25 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 4c5082c82c4..d61bba002c4 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -291,6 +291,17 @@ brw_cs_precompile(struct gl_context *ctx,
 }
 
 
+static unsigned
+get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data)
+{
+   const unsigned simd_size = cs_prog_data->simd_size;
+   unsigned group_size = cs_prog_data->local_size[0] *
+      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+
+   return (group_size + simd_size - 1) / simd_size;
+}
+
+
 static void
 brw_upload_cs_state(struct brw_context *brw)
 {
@@ -316,6 +327,8 @@ brw_upload_cs_state(struct brw_context *brw)
                                             prog_data->binding_table.size_bytes,
                                             32, &stage_state->bind_bo_offset);
 
+   unsigned threads = get_cs_thread_count(cs_prog_data);
+
    uint32_t dwords = brw->gen < 8 ? 8 : 9;
    BEGIN_BATCH(dwords);
    OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
@@ -365,6 +378,13 @@ brw_upload_cs_state(struct brw_context *brw)
    desc[dw++] = 0;
    desc[dw++] = 0;
    desc[dw++] = stage_state->bind_bo_offset;
+   desc[dw++] = 0;
+   const uint32_t media_threads =
+      brw->gen >= 8 ?
+      SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
+      SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
+   assert(threads <= brw->max_cs_threads);
+   desc[dw++] = media_threads;
 
    BEGIN_BATCH(4);
    OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 19489aba5be..b1a1c11b3ae 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2518,6 +2518,11 @@ enum brw_wm_barycentric_interp_mode {
 # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK       INTEL_MASK(15, 0)
 
 #define MEDIA_INTERFACE_DESCRIPTOR_LOAD         0x7002
+/* GEN7 DW5, GEN8+ DW6 */
+# define MEDIA_GPGPU_THREAD_COUNT_SHIFT         0
+# define MEDIA_GPGPU_THREAD_COUNT_MASK          INTEL_MASK(7, 0)
+# define GEN8_MEDIA_GPGPU_THREAD_COUNT_SHIFT    0
+# define GEN8_MEDIA_GPGPU_THREAD_COUNT_MASK     INTEL_MASK(9, 0)
 #define MEDIA_STATE_FLUSH                       0x7004
 #define GPGPU_WALKER                            0x7105
 /* GEN8+ DW2 */

From 9476b11d6edc67403dd7c5aaddbc375400e02425 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 14 Jul 2015 11:54:15 -0700
Subject: [PATCH 0333/1208] vc4: Fix some -Wdouble-promotion warnings.

No code generation changes from this, but it'll be useful to have this
next time I go checking -Wdouble-promotion.
---
 src/gallium/drivers/vc4/vc4_emit.c    | 8 ++++----
 src/gallium/drivers/vc4/vc4_program.c | 2 +-
 src/gallium/drivers/vc4/vc4_state.c   | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c
index f5925734415..ba064ff889b 100644
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -32,10 +32,10 @@ vc4_emit_state(struct pipe_context *pctx)
         if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) {
                 float *vpscale = vc4->viewport.scale;
                 float *vptranslate = vc4->viewport.translate;
-                float vp_minx = -fabs(vpscale[0]) + vptranslate[0];
-                float vp_maxx = fabs(vpscale[0]) + vptranslate[0];
-                float vp_miny = -fabs(vpscale[1]) + vptranslate[1];
-                float vp_maxy = fabs(vpscale[1]) + vptranslate[1];
+                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
+                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
+                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
+                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
                 uint32_t minx = MAX2(vc4->scissor.minx, vp_minx);
                 uint32_t miny = MAX2(vc4->scissor.miny, vp_miny);
                 uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index df440f627ec..f0bfe24c68a 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -558,7 +558,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src)
         struct qreg scaled_x =
                 qir_FMUL(c,
                          src,
-                         qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
+                         qir_uniform_f(c, 1.0 / (M_PI * 2.0)));
 
         struct qreg x = qir_FADD(c,
                                  ntq_ffract(c, scaled_x),
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 4a1d4c3a4d6..e0ce4aee779 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx,
         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
          * BCM21553).
          */
-        so->point_size = MAX2(cso->point_size, .125);
+        so->point_size = MAX2(cso->point_size, .125f);
 
         if (cso->front_ccw)
                 so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES;

From 1835ce6e35e6a186c2ba1bdf39b73783a2cb2ad5 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 14 Jul 2015 12:21:23 -0700
Subject: [PATCH 0334/1208] vc4: Move uniforms handling to a separate file.

The rest of vc4_program.c is about compiling, while this is about
uniform emit at draw time.
---
 src/gallium/drivers/vc4/Makefile.sources |   1 +
 src/gallium/drivers/vc4/vc4_program.c    | 314 ---------------------
 src/gallium/drivers/vc4/vc4_uniforms.c   | 340 +++++++++++++++++++++++
 3 files changed, 341 insertions(+), 314 deletions(-)
 create mode 100644 src/gallium/drivers/vc4/vc4_uniforms.c

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 1eb029e67e7..c24b01d4fbb 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -49,4 +49,5 @@ C_SOURCES := \
 	vc4_state.c \
 	vc4_tiling.c \
 	vc4_tiling.h \
+	vc4_uniforms.c \
 	$()
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f0bfe24c68a..ce99989a79d 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -28,8 +28,6 @@
 #include "util/u_hash.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_pack_color.h"
-#include "util/format_srgb.h"
 #include "util/ralloc.h"
 #include "util/hash_table.h"
 #include "tgsi/tgsi_dump.h"
@@ -2510,318 +2508,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
         free(so);
 }
 
-static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
-{
-        switch (p_wrap) {
-        case PIPE_TEX_WRAP_REPEAT:
-                return 0;
-        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-                return 1;
-        case PIPE_TEX_WRAP_MIRROR_REPEAT:
-                return 2;
-        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-                return 3;
-        case PIPE_TEX_WRAP_CLAMP:
-                return (using_nearest ? 1 : 3);
-        default:
-                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
-                assert(!"not reached");
-                return 0;
-        }
-}
-
-static void
-write_texture_p0(struct vc4_context *vc4,
-                 struct vc4_cl_out **uniforms,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t unit)
-{
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-
-        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo,
-                 VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
-                 VC4_SET_FIELD(texture->u.tex.last_level -
-                               texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
-                 VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE,
-                               VC4_TEX_P0_CMMODE) |
-                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE));
-}
-
-static void
-write_texture_p1(struct vc4_context *vc4,
-                 struct vc4_cl_out **uniforms,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t unit)
-{
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-        struct pipe_sampler_state *sampler = texstate->samplers[unit];
-        static const uint8_t minfilter_map[6] = {
-                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
-                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
-                VC4_TEX_P1_MINFILT_NEAREST,
-                VC4_TEX_P1_MINFILT_LINEAR,
-        };
-        static const uint32_t magfilter_map[] = {
-                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
-                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
-        };
-
-        bool either_nearest =
-                (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
-                 sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
-
-        cl_aligned_u32(uniforms,
-               VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
-               VC4_SET_FIELD(texture->texture->height0 & 2047,
-                             VC4_TEX_P1_HEIGHT) |
-               VC4_SET_FIELD(texture->texture->width0 & 2047,
-                             VC4_TEX_P1_WIDTH) |
-               VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter],
-                             VC4_TEX_P1_MAGFILT) |
-               VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 +
-                                           sampler->min_img_filter],
-                             VC4_TEX_P1_MINFILT) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest),
-                             VC4_TEX_P1_WRAP_S) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest),
-                             VC4_TEX_P1_WRAP_T));
-}
-
-static void
-write_texture_p2(struct vc4_context *vc4,
-                 struct vc4_cl_out **uniforms,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t data)
-{
-        uint32_t unit = data & 0xffff;
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-
-        cl_aligned_u32(uniforms,
-               VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
-                             VC4_TEX_P2_PTYPE) |
-               VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
-               VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
-}
-
-
-#define SWIZ(x,y,z,w) {          \
-        UTIL_FORMAT_SWIZZLE_##x, \
-        UTIL_FORMAT_SWIZZLE_##y, \
-        UTIL_FORMAT_SWIZZLE_##z, \
-        UTIL_FORMAT_SWIZZLE_##w  \
-}
-
-static void
-write_texture_border_color(struct vc4_context *vc4,
-                           struct vc4_cl_out **uniforms,
-                           struct vc4_texture_stateobj *texstate,
-                           uint32_t unit)
-{
-        struct pipe_sampler_state *sampler = texstate->samplers[unit];
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-        union util_color uc;
-
-        const struct util_format_description *tex_format_desc =
-                util_format_description(texture->format);
-
-        float border_color[4];
-        for (int i = 0; i < 4; i++)
-                border_color[i] = sampler->border_color.f[i];
-        if (util_format_is_srgb(texture->format)) {
-                for (int i = 0; i < 3; i++)
-                        border_color[i] =
-                                util_format_linear_to_srgb_float(border_color[i]);
-        }
-
-        /* Turn the border color into the layout of channels that it would
-         * have when stored as texture contents.
-         */
-        float storage_color[4];
-        util_format_unswizzle_4f(storage_color,
-                                 border_color,
-                                 tex_format_desc->swizzle);
-
-        /* Now, pack so that when the vc4_format-sampled texture contents are
-         * replaced with our border color, the vc4_get_format_swizzle()
-         * swizzling will get the right channels.
-         */
-        if (util_format_is_depth_or_stencil(texture->format)) {
-                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
-                                       sampler->border_color.f[0]) << 8;
-        } else {
-                switch (rsc->vc4_format) {
-                default:
-                case VC4_TEXTURE_TYPE_RGBA8888:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_RGBA4444:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_RGB565:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_ALPHA:
-                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
-                        break;
-                case VC4_TEXTURE_TYPE_LUMALPHA:
-                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
-                                    (float_to_ubyte(storage_color[0]) << 0));
-                        break;
-                }
-        }
-
-        cl_aligned_u32(uniforms, uc.ui[0]);
-}
-
-static uint32_t
-get_texrect_scale(struct vc4_texture_stateobj *texstate,
-                  enum quniform_contents contents,
-                  uint32_t data)
-{
-        struct pipe_sampler_view *texture = texstate->textures[data];
-        uint32_t dim;
-
-        if (contents == QUNIFORM_TEXRECT_SCALE_X)
-                dim = texture->texture->width0;
-        else
-                dim = texture->texture->height0;
-
-        return fui(1.0f / dim);
-}
-
-static struct vc4_bo *
-vc4_upload_ubo(struct vc4_context *vc4,
-               struct vc4_compiled_shader *shader,
-               const uint32_t *gallium_uniforms)
-{
-        if (!shader->ubo_size)
-                return NULL;
-
-        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
-        uint32_t *data = vc4_bo_map(ubo);
-        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
-                memcpy(data + shader->ubo_ranges[i].dst_offset,
-                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
-                       shader->ubo_ranges[i].size);
-        }
-
-        return ubo;
-}
-
-void
-vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
-                   struct vc4_constbuf_stateobj *cb,
-                   struct vc4_texture_stateobj *texstate)
-{
-        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
-        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
-        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
-
-        cl_ensure_space(&vc4->uniforms, (uinfo->count +
-                                         uinfo->num_texture_samples) * 4);
-
-        struct vc4_cl_out *uniforms =
-                cl_start_shader_reloc(&vc4->uniforms,
-                                      uinfo->num_texture_samples);
-
-        for (int i = 0; i < uinfo->count; i++) {
-
-                switch (uinfo->contents[i]) {
-                case QUNIFORM_CONSTANT:
-                        cl_aligned_u32(&uniforms, uinfo->data[i]);
-                        break;
-                case QUNIFORM_UNIFORM:
-                        cl_aligned_u32(&uniforms,
-                                       gallium_uniforms[uinfo->data[i]]);
-                        break;
-                case QUNIFORM_VIEWPORT_X_SCALE:
-                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
-                        break;
-                case QUNIFORM_VIEWPORT_Y_SCALE:
-                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
-                        break;
-
-                case QUNIFORM_VIEWPORT_Z_OFFSET:
-                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
-                        break;
-                case QUNIFORM_VIEWPORT_Z_SCALE:
-                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
-                        break;
-
-                case QUNIFORM_USER_CLIP_PLANE:
-                        cl_aligned_f(&uniforms,
-                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P0:
-                        write_texture_p0(vc4, &uniforms, texstate,
-                                         uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P1:
-                        write_texture_p1(vc4, &uniforms, texstate,
-                                         uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P2:
-                        write_texture_p2(vc4, &uniforms, texstate,
-                                         uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_UBO_ADDR:
-                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
-                        break;
-
-                case QUNIFORM_TEXTURE_BORDER_COLOR:
-                        write_texture_border_color(vc4, &uniforms,
-                                                   texstate, uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXRECT_SCALE_X:
-                case QUNIFORM_TEXRECT_SCALE_Y:
-                        cl_aligned_u32(&uniforms,
-                                       get_texrect_scale(texstate,
-                                                         uinfo->contents[i],
-                                                         uinfo->data[i]));
-                        break;
-
-                case QUNIFORM_BLEND_CONST_COLOR:
-                        cl_aligned_f(&uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
-                        break;
-
-                case QUNIFORM_STENCIL:
-                        cl_aligned_u32(&uniforms,
-                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
-                                       (uinfo->data[i] <= 1 ?
-                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
-                                        0));
-                        break;
-
-                case QUNIFORM_ALPHA_REF:
-                        cl_aligned_f(&uniforms,
-                                     vc4->zsa->base.alpha.ref_value);
-                        break;
-                }
-#if 0
-                uint32_t written_val = *((uint32_t *)uniforms - 1);
-                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
-                        shader, i, written_val, uif(written_val));
-#endif
-        }
-
-        cl_end(&vc4->uniforms, uniforms);
-}
-
 static void
 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
new file mode 100644
index 00000000000..e8fca3d0cbd
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_pack_color.h"
+#include "util/format_srgb.h"
+
+#include "vc4_context.h"
+#include "vc4_qir.h"
+
+static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
+{
+        switch (p_wrap) {
+        case PIPE_TEX_WRAP_REPEAT:
+                return 0;
+        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+                return 1;
+        case PIPE_TEX_WRAP_MIRROR_REPEAT:
+                return 2;
+        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+                return 3;
+        case PIPE_TEX_WRAP_CLAMP:
+                return (using_nearest ? 1 : 3);
+        default:
+                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
+                assert(!"not reached");
+                return 0;
+        }
+}
+
+static void
+write_texture_p0(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t unit)
+{
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+
+        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo,
+                 VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
+                 VC4_SET_FIELD(texture->u.tex.last_level -
+                               texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
+                 VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE,
+                               VC4_TEX_P0_CMMODE) |
+                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE));
+}
+
+static void
+write_texture_p1(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t unit)
+{
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+        struct pipe_sampler_state *sampler = texstate->samplers[unit];
+        static const uint8_t minfilter_map[6] = {
+                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
+                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
+                VC4_TEX_P1_MINFILT_NEAREST,
+                VC4_TEX_P1_MINFILT_LINEAR,
+        };
+        static const uint32_t magfilter_map[] = {
+                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
+                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
+        };
+
+        bool either_nearest =
+                (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
+                 sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
+
+        cl_aligned_u32(uniforms,
+               VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
+               VC4_SET_FIELD(texture->texture->height0 & 2047,
+                             VC4_TEX_P1_HEIGHT) |
+               VC4_SET_FIELD(texture->texture->width0 & 2047,
+                             VC4_TEX_P1_WIDTH) |
+               VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter],
+                             VC4_TEX_P1_MAGFILT) |
+               VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 +
+                                           sampler->min_img_filter],
+                             VC4_TEX_P1_MINFILT) |
+               VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest),
+                             VC4_TEX_P1_WRAP_S) |
+               VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest),
+                             VC4_TEX_P1_WRAP_T));
+}
+
+static void
+write_texture_p2(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t data)
+{
+        uint32_t unit = data & 0xffff;
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+
+        cl_aligned_u32(uniforms,
+               VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
+                             VC4_TEX_P2_PTYPE) |
+               VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
+               VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
+}
+
+
+#define SWIZ(x,y,z,w) {          \
+        UTIL_FORMAT_SWIZZLE_##x, \
+        UTIL_FORMAT_SWIZZLE_##y, \
+        UTIL_FORMAT_SWIZZLE_##z, \
+        UTIL_FORMAT_SWIZZLE_##w  \
+}
+
+static void
+write_texture_border_color(struct vc4_context *vc4,
+                           struct vc4_cl_out **uniforms,
+                           struct vc4_texture_stateobj *texstate,
+                           uint32_t unit)
+{
+        struct pipe_sampler_state *sampler = texstate->samplers[unit];
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+        union util_color uc;
+
+        const struct util_format_description *tex_format_desc =
+                util_format_description(texture->format);
+
+        float border_color[4];
+        for (int i = 0; i < 4; i++)
+                border_color[i] = sampler->border_color.f[i];
+        if (util_format_is_srgb(texture->format)) {
+                for (int i = 0; i < 3; i++)
+                        border_color[i] =
+                                util_format_linear_to_srgb_float(border_color[i]);
+        }
+
+        /* Turn the border color into the layout of channels that it would
+         * have when stored as texture contents.
+         */
+        float storage_color[4];
+        util_format_unswizzle_4f(storage_color,
+                                 border_color,
+                                 tex_format_desc->swizzle);
+
+        /* Now, pack so that when the vc4_format-sampled texture contents are
+         * replaced with our border color, the vc4_get_format_swizzle()
+         * swizzling will get the right channels.
+         */
+        if (util_format_is_depth_or_stencil(texture->format)) {
+                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
+                                       sampler->border_color.f[0]) << 8;
+        } else {
+                switch (rsc->vc4_format) {
+                default:
+                case VC4_TEXTURE_TYPE_RGBA8888:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_RGBA4444:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_RGB565:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_ALPHA:
+                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
+                        break;
+                case VC4_TEXTURE_TYPE_LUMALPHA:
+                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
+                                    (float_to_ubyte(storage_color[0]) << 0));
+                        break;
+                }
+        }
+
+        cl_aligned_u32(uniforms, uc.ui[0]);
+}
+
+static uint32_t
+get_texrect_scale(struct vc4_texture_stateobj *texstate,
+                  enum quniform_contents contents,
+                  uint32_t data)
+{
+        struct pipe_sampler_view *texture = texstate->textures[data];
+        uint32_t dim;
+
+        if (contents == QUNIFORM_TEXRECT_SCALE_X)
+                dim = texture->texture->width0;
+        else
+                dim = texture->texture->height0;
+
+        return fui(1.0f / dim);
+}
+
+static struct vc4_bo *
+vc4_upload_ubo(struct vc4_context *vc4,
+               struct vc4_compiled_shader *shader,
+               const uint32_t *gallium_uniforms)
+{
+        if (!shader->ubo_size)
+                return NULL;
+
+        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
+        uint32_t *data = vc4_bo_map(ubo);
+        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
+                memcpy(data + shader->ubo_ranges[i].dst_offset,
+                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
+                       shader->ubo_ranges[i].size);
+        }
+
+        return ubo;
+}
+
+void
+vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
+                   struct vc4_constbuf_stateobj *cb,
+                   struct vc4_texture_stateobj *texstate)
+{
+        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
+        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
+        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
+
+        cl_ensure_space(&vc4->uniforms, (uinfo->count +
+                                         uinfo->num_texture_samples) * 4);
+
+        struct vc4_cl_out *uniforms =
+                cl_start_shader_reloc(&vc4->uniforms,
+                                      uinfo->num_texture_samples);
+
+        for (int i = 0; i < uinfo->count; i++) {
+
+                switch (uinfo->contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        cl_aligned_u32(&uniforms, uinfo->data[i]);
+                        break;
+                case QUNIFORM_UNIFORM:
+                        cl_aligned_u32(&uniforms,
+                                       gallium_uniforms[uinfo->data[i]]);
+                        break;
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
+                        break;
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
+                        break;
+
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
+                        break;
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        cl_aligned_f(&uniforms,
+                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P0:
+                        write_texture_p0(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                        write_texture_p1(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P2:
+                        write_texture_p2(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_UBO_ADDR:
+                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
+                        break;
+
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                        write_texture_border_color(vc4, &uniforms,
+                                                   texstate, uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                        cl_aligned_u32(&uniforms,
+                                       get_texrect_scale(texstate,
+                                                         uinfo->contents[i],
+                                                         uinfo->data[i]));
+                        break;
+
+                case QUNIFORM_BLEND_CONST_COLOR:
+                        cl_aligned_f(&uniforms,
+                                     CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
+                        break;
+
+                case QUNIFORM_STENCIL:
+                        cl_aligned_u32(&uniforms,
+                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
+                                       (uinfo->data[i] <= 1 ?
+                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
+                                        0));
+                        break;
+
+                case QUNIFORM_ALPHA_REF:
+                        cl_aligned_f(&uniforms,
+                                     vc4->zsa->base.alpha.ref_value);
+                        break;
+                }
+#if 0
+                uint32_t written_val = *((uint32_t *)uniforms - 1);
+                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
+                        shader, i, written_val, uif(written_val));
+#endif
+        }
+
+        cl_end(&vc4->uniforms, uniforms);
+}

From 0f4d2b0a2dd3fa39426f2789bf2a8fc939adf001 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 14 Jul 2015 12:18:40 -0700
Subject: [PATCH 0335/1208] vc4: Cache texture p0/p1 setup for the sampler
 view.

In exchange for a bit of space and computation in CSO setup, we cut
vc4_uniform.c (draw time) code size by 4.8%.
---
 src/gallium/drivers/vc4/vc4_context.h  | 12 ++++++++++
 src/gallium/drivers/vc4/vc4_state.c    | 33 ++++++++++++++++++--------
 src/gallium/drivers/vc4/vc4_uniforms.c | 24 ++++++-------------
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index d5d6be16f6e..be7df1bd347 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -69,6 +69,12 @@
 #define VC4_DIRTY_UNCOMPILED_FS (1 << 22)
 #define VC4_DIRTY_COMPILED_FS   (1 << 24)
 
+struct vc4_sampler_view {
+        struct pipe_sampler_view base;
+        uint32_t texture_p0;
+        uint32_t texture_p1;
+};
+
 struct vc4_texture_stateobj {
         struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
         unsigned num_textures;
@@ -326,6 +332,12 @@ vc4_context(struct pipe_context *pcontext)
         return (struct vc4_context *)pcontext;
 }
 
+static inline struct vc4_sampler_view *
+vc4_sampler_view(struct pipe_sampler_view *psview)
+{
+        return (struct vc4_sampler_view *)psview;
+}
+
 struct pipe_context *vc4_context_create(struct pipe_screen *pscreen,
                                         void *priv);
 void vc4_draw_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index e0ce4aee779..46852b6628c 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -499,13 +499,13 @@ static struct pipe_sampler_view *
 vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                         const struct pipe_sampler_view *cso)
 {
-        struct pipe_sampler_view *so = malloc(sizeof(*so));
+        struct vc4_sampler_view *so = malloc(sizeof(*so));
         struct vc4_resource *rsc = vc4_resource(prsc);
 
         if (!so)
                 return NULL;
 
-        *so = *cso;
+        so->base = *cso;
 
         pipe_reference(NULL, &prsc->reference);
 
@@ -516,18 +516,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
          * Also, Raspberry Pi doesn't support sampling from raster textures,
          * so we also have to copy to a temporary then.
          */
-        if (so->u.tex.first_level ||
+        if (cso->u.tex.first_level ||
             rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) {
                 struct vc4_resource *shadow_parent = vc4_resource(prsc);
                 struct pipe_resource tmpl = shadow_parent->base.b;
                 struct vc4_resource *clone;
 
                 tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
-                tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level);
-                tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level);
-                tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level;
+                tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level);
+                tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level);
+                tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level;
 
                 prsc = vc4_resource_create(pctx->screen, &tmpl);
+                rsc = vc4_resource(prsc);
                 clone = vc4_resource(prsc);
                 clone->shadow_parent = &shadow_parent->base.b;
                 /* Flag it as needing update of the contents from the parent. */
@@ -535,11 +536,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
 
                 assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
         }
-        so->texture = prsc;
-        so->reference.count = 1;
-        so->context = pctx;
+        so->base.texture = prsc;
+        so->base.reference.count = 1;
+        so->base.context = pctx;
 
-        return so;
+        so->texture_p0 =
+                (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
+                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) |
+                 VC4_SET_FIELD(cso->u.tex.last_level -
+                               cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
+                 VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE,
+                               VC4_TEX_P0_CMMODE));
+        so->texture_p1 =
+                (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
+                 VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) |
+                 VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH));
+
+        return &so->base;
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index e8fca3d0cbd..e8e9d6b0119 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -53,16 +53,11 @@ write_texture_p0(struct vc4_context *vc4,
                  struct vc4_texture_stateobj *texstate,
                  uint32_t unit)
 {
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
+        struct vc4_sampler_view *sview =
+                vc4_sampler_view(texstate->textures[unit]);
+        struct vc4_resource *rsc = vc4_resource(sview->base.texture);
 
-        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo,
-                 VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
-                 VC4_SET_FIELD(texture->u.tex.last_level -
-                               texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
-                 VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE,
-                               VC4_TEX_P0_CMMODE) |
-                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE));
+        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0);
 }
 
 static void
@@ -71,8 +66,8 @@ write_texture_p1(struct vc4_context *vc4,
                  struct vc4_texture_stateobj *texstate,
                  uint32_t unit)
 {
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
+        struct vc4_sampler_view *sview =
+                vc4_sampler_view(texstate->textures[unit]);
         struct pipe_sampler_state *sampler = texstate->samplers[unit];
         static const uint8_t minfilter_map[6] = {
                 VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
@@ -91,12 +86,7 @@ write_texture_p1(struct vc4_context *vc4,
                 (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
                  sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
 
-        cl_aligned_u32(uniforms,
-               VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
-               VC4_SET_FIELD(texture->texture->height0 & 2047,
-                             VC4_TEX_P1_HEIGHT) |
-               VC4_SET_FIELD(texture->texture->width0 & 2047,
-                             VC4_TEX_P1_WIDTH) |
+        cl_aligned_u32(uniforms, sview->texture_p1 |
                VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter],
                              VC4_TEX_P1_MAGFILT) |
                VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 +

From 7124feba1b879deb88dbf2baf600ed42309d9839 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 14 Jul 2015 12:32:04 -0700
Subject: [PATCH 0336/1208] vc4: Cache the texture p1 for the sampler.

Cuts another 12% of vc4_uniforms.o, in exchange for computing it at
CSO creation time.
---
 src/gallium/drivers/vc4/vc4_context.h  | 11 ++++++
 src/gallium/drivers/vc4/vc4_state.c    | 55 +++++++++++++++++++++++++-
 src/gallium/drivers/vc4/vc4_uniforms.c | 50 ++---------------------
 3 files changed, 68 insertions(+), 48 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index be7df1bd347..7faf5223630 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -75,6 +75,11 @@ struct vc4_sampler_view {
         uint32_t texture_p1;
 };
 
+struct vc4_sampler_state {
+        struct pipe_sampler_state base;
+        uint32_t texture_p1;
+};
+
 struct vc4_texture_stateobj {
         struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
         unsigned num_textures;
@@ -338,6 +343,12 @@ vc4_sampler_view(struct pipe_sampler_view *psview)
         return (struct vc4_sampler_view *)psview;
 }
 
+static inline struct vc4_sampler_state *
+vc4_sampler_state(struct pipe_sampler_state *psampler)
+{
+        return (struct vc4_sampler_state *)psampler;
+}
+
 struct pipe_context *vc4_context_create(struct pipe_screen *pscreen,
                                         void *priv);
 void vc4_draw_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 46852b6628c..8a759c2ca4c 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader)
         }
 }
 
+static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
+{
+        switch (p_wrap) {
+        case PIPE_TEX_WRAP_REPEAT:
+                return 0;
+        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+                return 1;
+        case PIPE_TEX_WRAP_MIRROR_REPEAT:
+                return 2;
+        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+                return 3;
+        case PIPE_TEX_WRAP_CLAMP:
+                return (using_nearest ? 1 : 3);
+        default:
+                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
+                assert(!"not reached");
+                return 0;
+        }
+}
+
 static void *
 vc4_create_sampler_state(struct pipe_context *pctx,
                          const struct pipe_sampler_state *cso)
 {
-        return vc4_generic_cso_state_create(cso, sizeof(*cso));
+        static const uint8_t minfilter_map[6] = {
+                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
+                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
+                VC4_TEX_P1_MINFILT_NEAREST,
+                VC4_TEX_P1_MINFILT_LINEAR,
+        };
+        static const uint32_t magfilter_map[] = {
+                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
+                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
+        };
+        bool either_nearest =
+                (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
+                 cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
+        struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state);
+
+        if (!so)
+                return NULL;
+
+        memcpy(so, cso, sizeof(*cso));
+
+        so->texture_p1 =
+                (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter],
+                               VC4_TEX_P1_MAGFILT) |
+                 VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 +
+                                             cso->min_img_filter],
+                               VC4_TEX_P1_MINFILT) |
+                 VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest),
+                               VC4_TEX_P1_WRAP_S) |
+                 VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest),
+                               VC4_TEX_P1_WRAP_T));
+
+        return so;
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index e8e9d6b0119..5613d6b28c0 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -27,26 +27,6 @@
 #include "vc4_context.h"
 #include "vc4_qir.h"
 
-static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
-{
-        switch (p_wrap) {
-        case PIPE_TEX_WRAP_REPEAT:
-                return 0;
-        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-                return 1;
-        case PIPE_TEX_WRAP_MIRROR_REPEAT:
-                return 2;
-        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-                return 3;
-        case PIPE_TEX_WRAP_CLAMP:
-                return (using_nearest ? 1 : 3);
-        default:
-                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
-                assert(!"not reached");
-                return 0;
-        }
-}
-
 static void
 write_texture_p0(struct vc4_context *vc4,
                  struct vc4_cl_out **uniforms,
@@ -68,34 +48,10 @@ write_texture_p1(struct vc4_context *vc4,
 {
         struct vc4_sampler_view *sview =
                 vc4_sampler_view(texstate->textures[unit]);
-        struct pipe_sampler_state *sampler = texstate->samplers[unit];
-        static const uint8_t minfilter_map[6] = {
-                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
-                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
-                VC4_TEX_P1_MINFILT_NEAREST,
-                VC4_TEX_P1_MINFILT_LINEAR,
-        };
-        static const uint32_t magfilter_map[] = {
-                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
-                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
-        };
+        struct vc4_sampler_state *sampler =
+                vc4_sampler_state(texstate->samplers[unit]);
 
-        bool either_nearest =
-                (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
-                 sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
-
-        cl_aligned_u32(uniforms, sview->texture_p1 |
-               VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter],
-                             VC4_TEX_P1_MAGFILT) |
-               VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 +
-                                           sampler->min_img_filter],
-                             VC4_TEX_P1_MINFILT) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest),
-                             VC4_TEX_P1_WRAP_S) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest),
-                             VC4_TEX_P1_WRAP_T));
+        cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1);
 }
 
 static void

From 141e1eb29fe80ad341e718147a1277cc3b1b9c11 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 15 Jul 2015 06:15:06 -0600
Subject: [PATCH 0337/1208] osmesa: fix OSMesaPixelsStore typo

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91337
Cc: 10.6 <mesa-stable@lists.freedesktop.org>

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/state_trackers/osmesa/osmesa.c | 2 +-
 src/mesa/drivers/osmesa/osmesa.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c
index 2d5d096d8ed..96b83585429 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -886,7 +886,7 @@ static struct name_function functions[] = {
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
-   { "OSMesaPixelsStore", (OSMESAproc) OSMesaPixelStore },
+   { "OSMesaPixelStore", (OSMESAproc) OSMesaPixelStore },
    { "OSMesaGetIntegerv", (OSMESAproc) OSMesaGetIntegerv },
    { "OSMesaGetDepthBuffer", (OSMESAproc) OSMesaGetDepthBuffer },
    { "OSMesaGetColorBuffer", (OSMESAproc) OSMesaGetColorBuffer },
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 022523eb00b..5c7dcac3841 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -1124,7 +1124,7 @@ static struct name_function functions[] = {
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
-   { "OSMesaPixelsStore", (OSMESAproc) OSMesaPixelStore },
+   { "OSMesaPixelStore", (OSMESAproc) OSMesaPixelStore },
    { "OSMesaGetIntegerv", (OSMESAproc) OSMesaGetIntegerv },
    { "OSMesaGetDepthBuffer", (OSMESAproc) OSMesaGetDepthBuffer },
    { "OSMesaGetColorBuffer", (OSMESAproc) OSMesaGetColorBuffer },

From fbf3aebf1f33fbec559c5b69bdf3b5dec6031612 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 8 Jul 2015 18:56:52 -0700
Subject: [PATCH 0338/1208] i965: Move BEGIN_BATCH() into same control flow as
 ADVANCE_BATCH().

BEGIN_BATCH() and ADVANCE_BATCH() will contain "do {" and "} while (0)"
respectively to allow declaring local variables used by intervening
OUT_BATCH macros. As such, BEGIN_BATCH() and ADVANCE_BATCH() need to be
in the same control flow.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_draw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 69ad4d444da..ec13473798c 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -261,17 +261,17 @@ static void brw_emit_prim(struct brw_context *brw,
       indirect_flag = 0;
    }
 
+   BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);
+
    if (brw->gen >= 7) {
       if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
          predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE;
       else
          predicate_enable = 0;
 
-      BEGIN_BATCH(7);
       OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
       OUT_BATCH(hw_prim | vertex_access_type);
    } else {
-      BEGIN_BATCH(6);
       OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
                 hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
                 vertex_access_type);

From 09348c12fceba59c22219fe3272260eb8ea6051e Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 8 Jul 2015 18:59:51 -0700
Subject: [PATCH 0339/1208] i965: Split batch emission from relocation
 functions.

So that everything writing to the batch between BEGIN_BATCH() and
ADVANCE_BATCH() goes through OUT_BATCH.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 30 +++++++---------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 34 ++++++++++---------
 2 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index d93ee6eed8a..f82958f0f6c 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -393,11 +393,11 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
 /*  This is the only way buffers get added to the validate list.
  */
-bool
-intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                             drm_intel_bo *buffer,
-                             uint32_t read_domains, uint32_t write_domain,
-			     uint32_t delta)
+uint32_t
+intel_batchbuffer_reloc(struct brw_context *brw,
+                        drm_intel_bo *buffer,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t delta)
 {
    int ret;
 
@@ -411,16 +411,14 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
     */
-   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
-
-   return true;
+   return buffer->offset64 + delta;
 }
 
-bool
-intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                               drm_intel_bo *buffer,
-                               uint32_t read_domains, uint32_t write_domain,
-			       uint32_t delta)
+uint64_t
+intel_batchbuffer_reloc64(struct brw_context *brw,
+                          drm_intel_bo *buffer,
+                          uint32_t read_domains, uint32_t write_domain,
+                          uint32_t delta)
 {
    int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                      buffer, delta,
@@ -432,11 +430,7 @@ intel_batchbuffer_emit_reloc64(struct brw_context *brw,
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
     */
-   uint64_t offset = buffer->offset64 + delta;
-   intel_batchbuffer_emit_dword(brw, offset);
-   intel_batchbuffer_emit_dword(brw, offset >> 32);
-
-   return true;
+   return buffer->offset64 + delta;
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index e58eae4115d..c0456f3c1f6 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -57,16 +57,16 @@ void intel_batchbuffer_data(struct brw_context *brw,
                             const void *data, GLuint bytes,
                             enum brw_gpu_ring ring);
 
-bool intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                                       drm_intel_bo *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                                    drm_intel_bo *buffer,
-                                    uint32_t read_domains,
-                                    uint32_t write_domain,
-                                    uint32_t offset);
+uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
+                                 drm_intel_bo *buffer,
+                                 uint32_t read_domains,
+                                 uint32_t write_domain,
+                                 uint32_t offset);
+uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
+                                   drm_intel_bo *buffer,
+                                   uint32_t read_domains,
+                                   uint32_t write_domain,
+                                   uint32_t offset);
 static inline uint32_t float_as_int(float f)
 {
    union {
@@ -164,14 +164,16 @@ intel_batchbuffer_advance(struct brw_context *brw)
 #define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
 #define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
 #define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   intel_batchbuffer_emit_reloc(brw, buf,			\
-				read_domains, write_domain, delta);	\
-} while (0)
+#define OUT_RELOC(buf, read_domains, write_domain, delta)                  \
+   OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
+                                     delta))
 
 /* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
-   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);	\
+#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {        \
+   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
+                                                write_domain, delta);   \
+   OUT_BATCH(reloc64);                                                  \
+   OUT_BATCH(reloc64 >> 32);                                            \
 } while (0)
 
 #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);

From 131573df7aea0b10e97d9d5db0d26d89f8dfef54 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sat, 11 Jul 2015 14:36:25 -0700
Subject: [PATCH 0340/1208] i965: Add and use USED_BATCH macro.

The next patch will replace the .used field with an on-demand
calculation of batchbuffer usage.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_blorp.cpp       |  4 ++--
 .../dri/i965/brw_performance_monitor.c        |  6 +++---
 src/mesa/drivers/dri/i965/brw_state_batch.c   |  4 ++--
 src/mesa/drivers/dri/i965/brw_urb.c           |  4 ++--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 +++++++++----------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h |  9 ++++++---
 6 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 2ccfae1d77f..eac1f005496 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -226,7 +226,7 @@ retry:
    intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
    intel_batchbuffer_save_state(brw);
    drm_intel_bo *saved_bo = brw->batch.bo;
-   uint32_t saved_used = brw->batch.used;
+   uint32_t saved_used = USED_BATCH(brw->batch);
    uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
 
    switch (brw->gen) {
@@ -245,7 +245,7 @@ retry:
     * reserved enough space that a wrap will never happen.
     */
    assert(brw->batch.bo == saved_bo);
-   assert((brw->batch.used - saved_used) * 4 +
+   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
           (saved_state_batch_offset - brw->batch.state_batch_offset) <
           estimated_max_batch_usage);
    /* Shut up compiler warnings on release build */
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 0a123754257..7e90e8a8fa1 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -710,7 +710,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    /* Make sure the commands to take a snapshot fits in a single batch. */
    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                                    RENDER_RING);
-   int batch_used = brw->batch.used;
+   int batch_used = USED_BATCH(brw->batch);
 
    /* Reports apparently don't always get written unless we flush first. */
    brw_emit_mi_flush(brw);
@@ -754,7 +754,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    brw_emit_mi_flush(brw);
 
    (void) batch_used;
-   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+   assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 }
 
 /**
@@ -1386,7 +1386,7 @@ void
 brw_perf_monitor_new_batch(struct brw_context *brw)
 {
    assert(brw->batch.ring == RENDER_RING);
-   assert(brw->gen < 6 || brw->batch.used == 0);
+   assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
 
    if (brw->perfmon.oa_users == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index a405a80ef6e..d79e0ea00c7 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -87,7 +87,7 @@ brw_annotate_aub(struct brw_context *brw)
    drm_intel_aub_annotation annotations[annotation_count];
    int a = 0;
    make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
-                   4*brw->batch.used);
+                   4 * USED_BATCH(brw->batch));
    for (int i = brw->state_batch_count; i-- > 0; ) {
       uint32_t type = brw->state_batch_list[i].type;
       uint32_t start_offset = brw->state_batch_list[i].offset;
@@ -136,7 +136,7 @@ __brw_state_batch(struct brw_context *brw,
     * space, then flush and try again.
     */
    if (batch->state_batch_offset < size ||
-       offset < 4*batch->used + batch->reserved_space) {
+       offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
       intel_batchbuffer_flush(brw);
       offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 6fcf1b0cb1d..8fc06ba7cd9 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -249,8 +249,8 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((brw->batch.used & 15) > 12) {
-      int pad = 16 - (brw->batch.used & 15);
+   if ((USED_BATCH(brw->batch) & 15) > 12) {
+      int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
 	 brw->batch.map[brw->batch.used++] = MI_NOOP;
       while (--pad);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f82958f0f6c..628a7b774b2 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -94,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
    brw->batch.used = brw->batch.saved.used;
-   if (brw->batch.used == 0)
+   if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
 
@@ -122,7 +122,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->bo->virtual,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    } else {
       fprintf(stderr,
 	      "WARNING: failed to map batchbuffer (%s), "
@@ -131,7 +131,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->map,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    }
 
    drm_intel_decode_set_output_file(decode, stderr);
@@ -289,7 +289,7 @@ do_flush_locked(struct brw_context *brw)
    if (brw->has_llc) {
       drm_intel_bo_unmap(batch->bo);
    } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
+      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
 	 ret = drm_intel_bo_subdata(batch->bo,
 				    batch->state_batch_offset,
@@ -314,11 +314,11 @@ do_flush_locked(struct brw_context *brw)
             brw_annotate_aub(brw);
 
 	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
-					flags);
+            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
+                                        NULL, 0, 0, flags);
 	 } else {
 	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-						4 * batch->used, flags);
+                                                4 * USED_BATCH(*batch), flags);
 	 }
       }
 
@@ -342,7 +342,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 {
    int ret;
 
-   if (brw->batch.used == 0)
+   if (USED_BATCH(brw->batch) == 0)
       return 0;
 
    if (brw->throttle_batch[0] == NULL) {
@@ -351,7 +351,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * brw->batch.used;
+      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
       int total_bytes = bytes_for_commands + bytes_for_state;
       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
@@ -367,7 +367,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
    /* Mark the end of the buffer. */
    intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
-   if (brw->batch.used & 1) {
+   if (USED_BATCH(brw->batch) & 1) {
       /* Round batchbuffer usage to 2 DWORDs. */
       intel_batchbuffer_emit_dword(brw, MI_NOOP);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index c0456f3c1f6..12d20d363c3 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -67,6 +67,9 @@ uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
                                    uint32_t read_domains,
                                    uint32_t write_domain,
                                    uint32_t offset);
+
+#define USED_BATCH(batch) ((batch).used)
+
 static inline uint32_t float_as_int(float f)
 {
    union {
@@ -87,7 +90,7 @@ static inline unsigned
 intel_batchbuffer_space(struct brw_context *brw)
 {
    return (brw->batch.state_batch_offset - brw->batch.reserved_space)
-      - brw->batch.used*4;
+      - USED_BATCH(brw->batch) * 4;
 }
 
 
@@ -139,7 +142,7 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
    intel_batchbuffer_require_space(brw, n * 4, ring);
 
 #ifdef DEBUG
-   brw->batch.emit = brw->batch.used;
+   brw->batch.emit = USED_BATCH(brw->batch);
    brw->batch.total = n;
 #endif
 }
@@ -149,7 +152,7 @@ intel_batchbuffer_advance(struct brw_context *brw)
 {
 #ifdef DEBUG
    struct intel_batchbuffer *batch = &brw->batch;
-   unsigned int _n = batch->used - batch->emit;
+   unsigned int _n = USED_BATCH(*batch) - batch->emit;
    assert(batch->total != 0);
    if (_n != batch->total) {
       fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",

From f11c6f09cf36909ff399353b20195a31cf0f1907 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 8 Jul 2015 19:00:48 -0700
Subject: [PATCH 0341/1208] i965: Optimize batchbuffer macros.

Previously OUT_BATCH was just a macro around an inline function which
does

   brw->batch.map[brw->batch.used++] = dword;

When making consecutive calls to intel_batchbuffer_emit_dword() the
compiler isn't able to recognize that we're writing consecutive memory
locations or that it doesn't need to write batch.used back to memory
each time.

We can avoid both of these problems by making a local pointer to the
next location in the batch in BEGIN_BATCH().

Cuts 18k from the .text size.

   text     data      bss      dec      hex  filename
4946956   195152    26192  5168300   4edcac  i965_dri.so before
4928956   195152    26192  5150300   4e965c  i965_dri.so after

This series (including commit c0433948) improves performance of Synmark
OglBatch7 by 8.01389% +/- 0.63922% (n=83) on Ivybridge.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  5 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c   | 12 ++--
 src/mesa/drivers/dri/i965/brw_urb.c           |  2 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 19 ++++---
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 57 +++++++++++++------
 src/mesa/drivers/dri/i965/intel_blit.c        | 19 ++++---
 6 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 44d1aeaf54a..34a49b2abdc 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -873,7 +873,8 @@ struct intel_batchbuffer {
 #ifdef DEBUG
    uint16_t emit, total;
 #endif
-   uint16_t used, reserved_space;
+   uint16_t reserved_space;
+   uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
@@ -883,7 +884,7 @@ struct intel_batchbuffer {
    bool needs_sol_reset;
 
    struct {
-      uint16_t used;
+      uint32_t *map_next;
       int reloc_count;
    } saved;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index e0b55c6b620..c95f0c37f89 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
 /**
  * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
  */
-static void
+static uint32_t *
 emit_vertex_buffer_state(struct brw_context *brw,
                          unsigned buffer_nr,
                          drm_intel_bo *bo,
                          unsigned bo_ending_address,
                          unsigned bo_offset,
                          unsigned stride,
-                         unsigned step_rate)
+                         unsigned step_rate,
+                         uint32_t *__map)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t dw0;
@@ -643,7 +644,10 @@ emit_vertex_buffer_state(struct brw_context *brw,
       OUT_BATCH(0);
    }
    OUT_BATCH(step_rate);
+
+   return __map;
 }
+#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
 
 static void brw_emit_vertices(struct brw_context *brw)
 {
@@ -704,14 +708,14 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
       for (i = 0; i < brw->vb.nr_buffers; i++) {
 	 struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
+         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
                                   buffer->offset, buffer->stride,
                                   buffer->step_rate);
 
       }
 
       if (brw->vs.prog_data->uses_vertexid) {
-         emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
+         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
                                   brw->draw.draw_params_bo,
                                   brw->draw.draw_params_bo->size - 1,
                                   brw->draw.draw_params_offset,
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 8fc06ba7cd9..6078c3810d4 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -252,7 +252,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
    if ((USED_BATCH(brw->batch) & 15) > 12) {
       int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
-	 brw->batch.map[brw->batch.used++] = MI_NOOP;
+         *brw->batch.map_next++ = MI_NOOP;
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 628a7b774b2..088ffd276b4 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -48,6 +48,7 @@ intel_batchbuffer_init(struct brw_context *brw)
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
+      brw->batch.map_next = brw->batch.cpu_map;
    }
 }
 
@@ -68,10 +69,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
       drm_intel_bo_map(brw->batch.bo, true);
       brw->batch.map = brw->batch.bo->virtual;
    }
+   brw->batch.map_next = brw->batch.map;
 
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
 
    /* We don't know what ring the new batch will be sent to until we see the
@@ -83,7 +84,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
 void
 intel_batchbuffer_save_state(struct brw_context *brw)
 {
-   brw->batch.saved.used = brw->batch.used;
+   brw->batch.saved.map_next = brw->batch.map_next;
    brw->batch.saved.reloc_count =
       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
 }
@@ -93,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
-   brw->batch.used = brw->batch.saved.used;
+   brw->batch.map_next = brw->batch.saved.map_next;
    if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
@@ -395,13 +396,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
  */
 uint32_t
 intel_batchbuffer_reloc(struct brw_context *brw,
-                        drm_intel_bo *buffer,
+                        drm_intel_bo *buffer, uint32_t offset,
                         uint32_t read_domains, uint32_t write_domain,
                         uint32_t delta)
 {
    int ret;
 
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
 				 buffer, delta,
 				 read_domains, write_domain);
    assert(ret == 0);
@@ -416,11 +417,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
 
 uint64_t
 intel_batchbuffer_reloc64(struct brw_context *brw,
-                          drm_intel_bo *buffer,
+                          drm_intel_bo *buffer, uint32_t offset,
                           uint32_t read_domains, uint32_t write_domain,
                           uint32_t delta)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                      buffer, delta,
                                      read_domains, write_domain);
    assert(ret == 0);
@@ -440,8 +441,8 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
+   memcpy(brw->batch.map_next, data, bytes);
+   brw->batch.map_next += bytes >> 2;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 12d20d363c3..84add927c9a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -59,16 +59,18 @@ void intel_batchbuffer_data(struct brw_context *brw,
 
 uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
                                  drm_intel_bo *buffer,
+                                 uint32_t offset,
                                  uint32_t read_domains,
                                  uint32_t write_domain,
-                                 uint32_t offset);
+                                 uint32_t delta);
 uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
                                    drm_intel_bo *buffer,
+                                   uint32_t offset,
                                    uint32_t read_domains,
                                    uint32_t write_domain,
-                                   uint32_t offset);
+                                   uint32_t delta);
 
-#define USED_BATCH(batch) ((batch).used)
+#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
 
 static inline uint32_t float_as_int(float f)
 {
@@ -100,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
-   brw->batch.map[brw->batch.used++] = dword;
+   *brw->batch.map_next++ = dword;
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
@@ -163,23 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
 #endif
 }
 
-#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
-#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
-#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
-#define OUT_RELOC(buf, read_domains, write_domain, delta)                  \
-   OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
-                                     delta))
+#define BEGIN_BATCH(n) do {                            \
+   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
+   uint32_t *__map = brw->batch.map_next;              \
+   brw->batch.map_next += (n)
 
-/* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {        \
-   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
-                                                write_domain, delta);   \
-   OUT_BATCH(reloc64);                                                  \
-   OUT_BATCH(reloc64 >> 32);                                            \
+#define BEGIN_BATCH_BLT(n) do {                        \
+   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
+   uint32_t *__map = brw->batch.map_next;              \
+   brw->batch.map_next += (n)
+
+#define OUT_BATCH(d) *__map++ = (d)
+#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
+   uint32_t __offset = (__map - brw->batch.map) * 4;           \
+   OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset,     \
+                                     (read_domains),           \
+                                     (write_domain),           \
+                                     (delta)));                \
 } while (0)
 
-#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
+/* Handle 48-bit address relocations for Gen8+ */
+#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {      \
+   uint32_t __offset = (__map - brw->batch.map) * 4;                  \
+   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
+                                                (read_domains),       \
+                                                (write_domain),       \
+                                                (delta));             \
+   OUT_BATCH(reloc64);                                                \
+   OUT_BATCH(reloc64 >> 32);                                          \
+} while (0)
+
+#define ADVANCE_BATCH()                  \
+   assert(__map == brw->batch.map_next); \
+   intel_batchbuffer_advance(brw);       \
+} while (0)
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index bc390535c86..4fc3fa803cb 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -176,9 +176,10 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
  * tiling state would leak into other unsuspecting applications (like the X
  * server).
  */
-static void
+static uint32_t *
 set_blitter_tiling(struct brw_context *brw,
-                   bool dst_y_tiled, bool src_y_tiled)
+                   bool dst_y_tiled, bool src_y_tiled,
+                   uint32_t *__map)
 {
    assert(brw->gen >= 6);
 
@@ -193,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
    OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
              (dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
              (src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
+   return __map;
 }
+#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
 
-#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do {         \
+#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled)              \
       BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0));     \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, dst_y_tiled, src_y_tiled);             \
-   } while (0)
+         SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
 
-#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do {              \
+#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled)                   \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, false, false);                         \
-      ADVANCE_BATCH();                                                  \
-   } while (0)
+         SET_BLITTER_TILING(brw, false, false);                         \
+      ADVANCE_BATCH()
 
 static int
 blt_pitch(struct intel_mipmap_tree *mt)

From 642f289824dc9a07e8209c905badef31b4841ae1 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 7 Jul 2015 12:23:33 -0700
Subject: [PATCH 0342/1208] i965: Fix 32 bit build warnings in
 intel_get_yf_ys_bo_size()

Along with fixing the type of pitch parameter, patch also changes
the types of few local variables and function return type.

Warnings fixed are:
intel_mipmap_tree.c:671:7: warning: passing argument 3 of
'intel_get_yf_ys_bo_size' from incompatible pointer type

intel_mipmap_tree.c:563:1: note: expected 'uint64_t *' but
argument is of type 'long unsigned int *'

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index fb896a92263..15296518941 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -559,14 +559,14 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
 }
 
 /* This function computes Yf/Ys tiled bo size, alignment and pitch. */
-static uint64_t
+static unsigned long
 intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt, unsigned *alignment,
-                        uint64_t *pitch)
+                        unsigned long *pitch)
 {
    const uint32_t bpp = mt->cpp * 8;
    const uint32_t aspect_ratio = (bpp == 16 || bpp == 64) ? 2 : 1;
    uint32_t tile_width, tile_height;
-   uint64_t stride, size, aligned_y;
+   unsigned long stride, size, aligned_y;
 
    assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
 

From 26c1361ac386bd5b108d79289a3f82d15b01d014 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 16 Jul 2015 03:06:47 +0200
Subject: [PATCH 0343/1208] r200: fix fbo rendering by disabling optimized
 texture format chooser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is rather unfortunate that we don't know if a texture is going to be used
as a rt later, and we lack the means to do something about a format chosen
which we can't render to directly, so disable this and always chose renderable
format for rgba8 textures.
This addresses an issue raised on (old) bug,
https://bugs.freedesktop.org/show_bug.cgi?id=51658 with gnome-shell, don't
know if that's still applicable but it might fix other things as well.

Acked-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/radeon/radeon_texture.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index 1a178988b68..d05c870128c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -224,7 +224,19 @@ static mesa_format radeonChoose8888TexFormat(radeonContextPtr rmesa,
 	const GLuint ui = 1;
 	const GLubyte littleEndian = *((const GLubyte *)&ui);
 
-	if (fbo)
+
+	/* Unfortunately, regardless the fbo flag, we might still be asked to
+	 * attach a texture to a fbo later, which then won't succeed if we chose
+	 * one which isn't renderable. And unlike more exotic formats, apps aren't
+	 * really prepared for the incomplete framebuffer this results in (they'd
+	 * have to retry with same internalFormat even, just different
+	 * srcFormat/srcType, which can't really be expected anyway).
+	 * Ideally, we'd defer format selection until later (if the texture is
+	 * used as a rt it's likely there's never data uploaded to it before attached
+	 * to a fbo), but this isn't really possible, so for now just always use
+	 * a renderable format.
+	 */
+	if (1 || fbo)
 		return _radeon_texformat_argb8888;
 
 	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||

From 882476fea3ba4fdd05d21582eeb968f84523fb9a Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 11 Jul 2015 20:03:27 +0200
Subject: [PATCH 0344/1208] radeon/r200: mark state atoms as dirty after blits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Blit submits lots of packets which are usually handled by state atoms, so
these must be dirtied.
Not sure if this fixes anything, but it was a concern raised by bug 51658
(with this all issues there seen as actual bugs should be fixed, with the
exception of the patch to upload non-used texenv state atoms which I just
don't understand).

Acked-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/r200/r200_blit.c     | 16 ++++++++++++++++
 src/mesa/drivers/dri/radeon/radeon_blit.c |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/mesa/drivers/dri/r200/r200_blit.c b/src/mesa/drivers/dri/r200/r200_blit.c
index 3adc69423cd..0e6afa0d481 100644
--- a/src/mesa/drivers/dri/r200/r200_blit.c
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -547,5 +547,21 @@ unsigned r200_blit(struct gl_context *ctx,
 
     radeonFlush(ctx);
 
+    /* We submitted those packets outside our state atom mechanism. Thus
+     * make sure the atoms are resubmitted the next time. */
+    r200->hw.cst.dirty = GL_TRUE;
+    r200->hw.ctx.dirty = GL_TRUE;
+    r200->hw.vap.dirty = GL_TRUE;
+    r200->hw.msk.dirty = GL_TRUE;
+    r200->hw.pix[0].dirty = GL_TRUE;
+    r200->hw.pix[1].dirty = GL_TRUE;
+    r200->hw.pix[2].dirty = GL_TRUE;
+    r200->hw.pix[3].dirty = GL_TRUE;
+    r200->hw.sci.dirty = GL_TRUE;
+    r200->hw.set.dirty = GL_TRUE;
+    r200->hw.tex[0].dirty = GL_TRUE;
+    r200->hw.vte.dirty = GL_TRUE;
+    r200->hw.vtx.dirty = GL_TRUE;
+
     return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c b/src/mesa/drivers/dri/radeon/radeon_blit.c
index 0de17514e05..028e8b04bf6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_blit.c
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -425,5 +425,13 @@ unsigned r100_blit(struct gl_context *ctx,
 
     radeonFlush(ctx);
 
+    /* We submitted those packets outside our state atom mechanism. Thus
+     * make sure they are all resubmitted the next time. */
+    r100->hw.ctx.dirty = GL_TRUE;
+    r100->hw.msk.dirty = GL_TRUE;
+    r100->hw.set.dirty = GL_TRUE;
+    r100->hw.tex[0].dirty = GL_TRUE;
+    r100->hw.txr[0].dirty = GL_TRUE;
+
     return GL_TRUE;
 }

From d21320f6258b2e1780a15c1ca718963d8a15ca18 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 16 Jul 2015 03:18:20 +0200
Subject: [PATCH 0345/1208] radeon: fix some potential big endian issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The formats chosen (both by texture format choser, fbo storage allocation)
are different for big endian not just for rgba8 but also lower bit width
formats (why I don't actually know). Even the function to test for renderable
formats used different formats, however the actual colorbuffer setup did not.
And the blitter did not take that into account neither.
Untested (what could possibly go wrong...).

Acked-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/radeon/radeon_blit.c     | 84 +++++++++----------
 .../drivers/dri/radeon/radeon_state_init.c    |  3 +
 src/mesa/drivers/dri/radeon/radeon_tex.h      | 35 ++++++++
 src/mesa/drivers/dri/radeon/radeon_texstate.c | 44 ----------
 4 files changed, 76 insertions(+), 90 deletions(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c b/src/mesa/drivers/dri/radeon/radeon_blit.c
index 028e8b04bf6..0b0f06f0edb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_blit.c
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_blit.h"
+#include "radeon_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
@@ -40,19 +41,36 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r100_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-    case MESA_FORMAT_B5G6R5_UNORM:
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_L_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
+    /* XXX others?  */
+    if (_mesa_little_endian()) {
+	switch (mesa_format) {
+	case MESA_FORMAT_B8G8R8A8_UNORM:
+	case MESA_FORMAT_B8G8R8X8_UNORM:
+	case MESA_FORMAT_B5G6R5_UNORM:
+	case MESA_FORMAT_B4G4R4A4_UNORM:
+	case MESA_FORMAT_B5G5R5A1_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
 	    break;
-    default:
+	default:
 	    return 0;
+	}
+    }
+    else {
+	switch (mesa_format) {
+	case MESA_FORMAT_A8R8G8B8_UNORM:
+	case MESA_FORMAT_X8R8G8B8_UNORM:
+	case MESA_FORMAT_R5G6B5_UNORM:
+	case MESA_FORMAT_A4R4G4B4_UNORM:
+	case MESA_FORMAT_A1R5G5B5_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	    break;
+	default:
+	    return 0;
+	}
     }
 
     /* Rendering to small buffer doesn't work.
@@ -106,40 +124,8 @@ static void inline emit_tx_setup(struct r100_context *r100,
     assert(height <= 2048);
     assert(offset % 32 == 0);
 
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-            txformat |= RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-            break;
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB8888;
-	    break;
-    case MESA_FORMAT_B5G6R5_UNORM:
-	    txformat |= RADEON_TXFORMAT_RGB565;
-	    break;
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-	    txformat |= RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_L_UNORM8:
-            txformat |= RADEON_TXFORMAT_I8;
-            break;
-    case MESA_FORMAT_L8A8_UNORM:
-            txformat |= RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-            break;
-    default:
-	    break;
-    }
-    
+    txformat |= tx_table[mesa_format].format;
+
     if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
        offset |= RADEON_TXO_MACRO_TILE;
     if (bo->flags & RADEON_BO_FLAGS_MICRO_TILE)
@@ -184,19 +170,25 @@ static inline void emit_cb_setup(struct r100_context *r100,
     uint32_t dst_format = 0;
     BATCH_LOCALS(&r100->radeon);
 
-    /* XXX others?  BE/LE? */
+    /* XXX others? */
     switch (mesa_format) {
+    /* The first of each pair is for little, the second for big endian. */
     case MESA_FORMAT_B8G8R8A8_UNORM:
+    case MESA_FORMAT_A8R8G8B8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
 	    break;
     case MESA_FORMAT_B5G6R5_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_RGB565;
 	    break;
     case MESA_FORMAT_B4G4R4A4_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
 	    break;
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
 	    break;
     case MESA_FORMAT_A_UNORM8:
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index c800edfc7be..5e2f41fdb4a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -336,12 +336,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
    else switch (rrb->base.Base.Format) {
    case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
    case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
    case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
    default:
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index fa57c08987d..f8ec432755a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -51,4 +51,39 @@ extern void radeonTexUpdateParameters(struct gl_context *ctx, GLuint unit);
 
 extern void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions );
 
+struct tx_table {
+   GLuint format, filter;
+};
+
+/* XXX verify this table against MESA_FORMAT_x values */
+static const struct tx_table tx_table[] =
+{
+   [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YVYU422, RADEON_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_VYUY422, RADEON_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_DXT23 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_DXT45 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
+
 #endif /* __RADEON_TEX_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index 45667efb65f..ec835f248eb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -53,53 +53,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 
 
-#define RADEON_TXFORMAT_A8        RADEON_TXFORMAT_I8
-#define RADEON_TXFORMAT_L8        RADEON_TXFORMAT_I8
-#define RADEON_TXFORMAT_AL88      RADEON_TXFORMAT_AI88
-#define RADEON_TXFORMAT_YCBCR     RADEON_TXFORMAT_YVYU422
-#define RADEON_TXFORMAT_YCBCR_REV RADEON_TXFORMAT_VYUY422
-#define RADEON_TXFORMAT_RGB_DXT1  RADEON_TXFORMAT_DXT1
-#define RADEON_TXFORMAT_RGBA_DXT1 RADEON_TXFORMAT_DXT1
-#define RADEON_TXFORMAT_RGBA_DXT3 RADEON_TXFORMAT_DXT23
-#define RADEON_TXFORMAT_RGBA_DXT5 RADEON_TXFORMAT_DXT45
-
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
 			     && (tx_table[f].format != 0xffffffff) )
 
-struct tx_table {
-   GLuint format, filter;
-};
-
-/* XXX verify this table against MESA_FORMAT_x values */
-static const struct tx_table tx_table[] =
-{
-   [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_A8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YCBCR, RADEON_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_YCBCR_REV, RADEON_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_RGBA_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_RGBA_DXT3 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_RGBA_DXT5 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
 /* ================================================================
  * Texture combine functions
  */

From 779cabfc7d022de8b7b9bc7fdac0caffa8646c51 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 16 Jul 2015 03:55:59 +0200
Subject: [PATCH 0346/1208] r200: fix some potential big endian issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The formats chosen (both by texture format choser, fbo storage allocation)
are different for big endian not just for rgba8 but also lower bit width
formats (why I don't actually know). Even the function to test for renderable
formats used different formats, however the actual colorbuffer setup did not.
And the blitter did not take that into account neither.
Untested (what could possibly go wrong...).
Same as for r100.

Acked-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/r200/r200_blit.c       | 125 +++++++++++---------
 src/mesa/drivers/dri/r200/r200_context.h    |   4 -
 src/mesa/drivers/dri/r200/r200_state_init.c |   5 +-
 src/mesa/drivers/dri/r200/r200_tex.h        |  64 ++++++++++
 src/mesa/drivers/dri/r200/r200_texstate.c   |  71 -----------
 5 files changed, 140 insertions(+), 129 deletions(-)

diff --git a/src/mesa/drivers/dri/r200/r200_blit.c b/src/mesa/drivers/dri/r200/r200_blit.c
index 0e6afa0d481..d68a53e67f7 100644
--- a/src/mesa/drivers/dri/r200/r200_blit.c
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "r200_context.h"
 #include "r200_blit.h"
+#include "r200_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
@@ -40,22 +41,42 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r200_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-    case MESA_FORMAT_B5G6R5_UNORM:
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_L_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-    /* swizzled */
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-    case MESA_FORMAT_R8G8B8A8_UNORM:
+    /* XXX others? */
+    if (_mesa_little_endian()) {
+	switch (mesa_format) {
+	case MESA_FORMAT_B8G8R8A8_UNORM:
+	case MESA_FORMAT_B8G8R8X8_UNORM:
+	case MESA_FORMAT_B5G6R5_UNORM:
+	case MESA_FORMAT_B4G4R4A4_UNORM:
+	case MESA_FORMAT_B5G5R5A1_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	/* swizzled - probably can't happen with the disabled Choose8888TexFormat code */
+	case MESA_FORMAT_A8B8G8R8_UNORM:
+	case MESA_FORMAT_R8G8B8A8_UNORM:
 	    break;
-    default:
+	default:
 	    return 0;
+	}
+    }
+    else {
+	switch (mesa_format) {
+	case MESA_FORMAT_A8R8G8B8_UNORM:
+	case MESA_FORMAT_X8R8G8B8_UNORM:
+	case MESA_FORMAT_R5G6B5_UNORM:
+	case MESA_FORMAT_A4R4G4B4_UNORM:
+	case MESA_FORMAT_A1R5G5B5_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	/* swizzled  - probably can't happen with the disabled Choose8888TexFormat code */
+	case MESA_FORMAT_R8G8B8A8_UNORM:
+	case MESA_FORMAT_A8B8G8R8_UNORM:
+	   break;
+	default:
+	   return 0;
+	}
     }
 
     /* Rendering to small buffer doesn't work.
@@ -112,41 +133,11 @@ static void inline emit_tx_setup(struct r200_context *r200,
     assert(height <= 2048);
     assert(offset % 32 == 0);
 
-    /* XXX others?  BE/LE? */
-    switch (src_mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-	    txformat |= R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_R8G8B8A8_UNORM:
-	    txformat |= R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB8888;
-	    break;
-    case MESA_FORMAT_B5G6R5_UNORM:
-	    txformat |= R200_TXFORMAT_RGB565;
-	    break;
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-	    txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_L_UNORM8:
-	    txformat |= R200_TXFORMAT_I8;
-	    break;
-    case MESA_FORMAT_L8A8_UNORM:
-	    txformat |= R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    default:
-	    break;
+    if (_mesa_little_endian()) {
+	txformat |= tx_table_le[src_mesa_format].format;
+    }
+    else {
+	txformat |= tx_table_be[src_mesa_format].format;
     }
 
     if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
@@ -155,11 +146,19 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	offset |= R200_TXO_MICRO_TILE;
 
     switch (dst_mesa_format) {
+    /* le */
     case MESA_FORMAT_B8G8R8A8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
     case MESA_FORMAT_B5G6R5_UNORM:
     case MESA_FORMAT_B4G4R4A4_UNORM:
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    /* be */
+    case MESA_FORMAT_A8R8G8B8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
+    /* little and big */
     case MESA_FORMAT_A_UNORM8:
     case MESA_FORMAT_L_UNORM8:
     case MESA_FORMAT_I_UNORM8:
@@ -183,6 +182,9 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	    END_BATCH();
 	    break;
     case MESA_FORMAT_A8B8G8R8_UNORM:
+    case MESA_FORMAT_R8G8B8A8_UNORM:
+       if ((dst_mesa_format == MESA_FORMAT_A8B8G8R8_UNORM && _mesa_little_endian()) ||
+	   (dst_mesa_format == MESA_FORMAT_R8G8B8A8_UNORM && !_mesa_little_endian())) {
 	    BEGIN_BATCH(10);
 	    OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
 					      RADEON_TEX_BLEND_0_ENABLE));
@@ -190,6 +192,8 @@ static void inline emit_tx_setup(struct r200_context *r200,
 						  R200_TXC_ARG_B_ZERO |
 						  R200_TXC_ARG_C_R0_COLOR |
 						  R200_TXC_OP_MADD));
+	    /* XXX I don't think this can work. This is output rotation, and alpha contains
+	     * red, not alpha (we'd write gbrr). */
 	    OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 |
 						   R200_TXC_OUTPUT_ROTATE_GBA |
 						   R200_TXC_OUTPUT_REG_R0));
@@ -201,8 +205,16 @@ static void inline emit_tx_setup(struct r200_context *r200,
 						   (R200_TXA_REPL_RED << R200_TXA_REPL_ARG_C_SHIFT) |
 						   R200_TXA_OUTPUT_REG_R0));
 	    END_BATCH();
-	    break;
-    case MESA_FORMAT_R8G8B8A8_UNORM:
+       }
+       else {
+	    /* XXX pretty sure could do this with just 2 instead of 4 instructions.
+	     * Like so:
+	     * 1st: use RGA output rotation, rgb arg replicate b, a arg r, write mask rb.
+	     * That's just one instruction in fact but I'm not entirely sure it works
+	     * if some of those incoming r0 components are never written (due to mask)
+	     * in the shader itself to r0.
+	     * In any case this case (and the one above) may not be reachable with
+	     * disabled Choose8888TexFormat code. */
 	    BEGIN_BATCH(34);
 	    OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
 					      RADEON_TEX_BLEND_0_ENABLE |
@@ -272,7 +284,8 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	    OUT_BATCH_REGVAL(R200_PP_TXABLEND2_3, (R200_TXA_CLAMP_0_1 |
 						   R200_TXA_OUTPUT_REG_R0));
 	    END_BATCH();
-	    break;
+	}
+	break;
     }
 
     BEGIN_BATCH(18);
@@ -306,21 +319,27 @@ static inline void emit_cb_setup(struct r200_context *r200,
     uint32_t dst_format = 0;
     BATCH_LOCALS(&r200->radeon);
 
-    /* XXX others?  BE/LE? */
     switch (mesa_format) {
+    /* The first of each pair is for little, the second for big endian */
     case MESA_FORMAT_B8G8R8A8_UNORM:
+    case MESA_FORMAT_A8R8G8B8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
+    /* These two are valid both for little and big endian (swizzled) */
     case MESA_FORMAT_A8B8G8R8_UNORM:
     case MESA_FORMAT_R8G8B8A8_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
 	    break;
     case MESA_FORMAT_B5G6R5_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_RGB565;
 	    break;
     case MESA_FORMAT_B4G4R4A4_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
 	    break;
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
 	    break;
     case MESA_FORMAT_A_UNORM8:
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index eb498f7406b..e8784aeeed5 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -109,7 +109,6 @@ struct r200_texture_state {
 #define CTX_RB3D_COLOROFFSET  11
 #define CTX_CMD_2             12 /* why */
 #define CTX_RB3D_COLORPITCH   13 /* why */
-#define CTX_STATE_SIZE_OLDDRM 14
 #define CTX_CMD_3             14
 #define CTX_RB3D_BLENDCOLOR   15
 #define CTX_RB3D_ABLENDCNTL   16
@@ -167,9 +166,6 @@ struct r200_texture_state {
 #define TEX_PP_TXSIZE               4  /*2c0c*/
 #define TEX_PP_TXPITCH              5  /*2c10*/
 #define TEX_PP_BORDER_COLOR         6  /*2c14*/
-#define TEX_CMD_1_OLDDRM            7
-#define TEX_PP_TXOFFSET_OLDDRM      8  /*2d00 */
-#define TEX_STATE_SIZE_OLDDRM       9
 #define TEX_PP_CUBIC_FACES          7
 #define TEX_PP_TXMULTI_CTL          8
 #define TEX_CMD_1_NEWDRM            9
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index d9d1a0ed227..ad64f788b9f 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -254,7 +254,7 @@ CHECK( never, GL_FALSE, 0 )
 CHECK( tex_any, ctx->Texture._MaxEnabledTexImageUnit != -1, 0 )
 CHECK( tf, (ctx->Texture._MaxEnabledTexImageUnit != -1 && !ctx->ATIFragmentShader._Enabled), 0 );
 CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled, 0 )
-   CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
+CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
 CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 )
 CHECK( afs, ctx->ATIFragmentShader._Enabled, 0 )
 CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 3 + 3*5 - CUBE_STATE_SIZE )
@@ -453,12 +453,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
    else switch (rrb->base.Base.Format) {
    case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
    case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
    case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
    default:
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index d7e91d1a0c8..a8c31b741ed 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -52,4 +52,68 @@ extern void r200TexUpdateParameters(struct gl_context *ctx, GLuint unit);
 
 extern void set_re_cntl_d3d( struct gl_context *ctx, int unit, GLboolean use_d3d );
 
+struct tx_table {
+   GLuint format, filter;
+};
+
+/* Note the tables (have to) contain invalid entries (if they are only valid
+ * for either be/le) */
+static const struct tx_table tx_table_be[] =
+{
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
+static const struct tx_table tx_table_le[] =
+{
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
+
+
 #endif /* __R200_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index ab84d1752ba..441ac730d4c 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -49,80 +49,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tex.h"
 #include "r200_tcl.h"
 
-
-#define R200_TXFORMAT_A8        R200_TXFORMAT_I8
-#define R200_TXFORMAT_L8        R200_TXFORMAT_I8
-#define R200_TXFORMAT_AL88      R200_TXFORMAT_AI88
-#define R200_TXFORMAT_YCBCR     R200_TXFORMAT_YVYU422
-#define R200_TXFORMAT_YCBCR_REV R200_TXFORMAT_VYUY422
-#define R200_TXFORMAT_RGB_DXT1  R200_TXFORMAT_DXT1
-#define R200_TXFORMAT_RGBA_DXT1 R200_TXFORMAT_DXT1
-#define R200_TXFORMAT_RGBA_DXT3 R200_TXFORMAT_DXT23
-#define R200_TXFORMAT_RGBA_DXT5 R200_TXFORMAT_DXT45
-
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
                              && (tx_table_be[f].format != 0xffffffff) )
 
-struct tx_table {
-   GLuint format, filter;
-};
-
-static const struct tx_table tx_table_be[] =
-{
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
-static const struct tx_table tx_table_le[] =
-{
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
 /* ================================================================
  * Texture combine functions
  */

From 7b9ebf879b6f35038996805a641667f00d93c4b7 Mon Sep 17 00:00:00 2001
From: Renaud Gaubert <renaud@lse.epita.fr>
Date: Sat, 11 Jul 2015 19:38:10 +0200
Subject: [PATCH 0347/1208] glsl: avoid compiler's segfault when processing
 operators with void arguments

This is done by returning an rvalue of type void in the
ast_function_expression::hir function instead of a void expression.

This produces (in the case of the ternary) an hir with a call
to the void returning function and an assignment of a void variable
which will be optimized out (the assignment) during the optimization
pass.

This fix results in having a valid subexpression in the many
different cases where the subexpressions are functions whose
return values are void.

Thus preventing to dereference NULL in the following cases:
  * binary operator
  * unary operators
  * ternary operator
  * comparison operators (except equal and nequal operator)

Equal and nequal had to be handled as a special case because
instead of segfaulting on a forbidden syntax it was now accepting
expressions with a void return value on either (or both) side of
the expression.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=85252

Signed-off-by: Renaud Gaubert <renaud@lse.epita.fr>
Reviewed-by: Gabriel Laskar <gabriel@lse.epita.fr>
Reviewed-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
---
 src/glsl/ast_function.cpp | 9 ++++++++-
 src/glsl/ast_to_hir.cpp   | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 92e26bf2416..6749e998e5a 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -1785,7 +1785,14 @@ ast_function_expression::hir(exec_list *instructions,
 	 /* an error has already been emitted */
 	 value = ir_rvalue::error_value(ctx);
       } else {
-	 value = generate_call(instructions, sig, &actual_parameters, state);
+         value = generate_call(instructions, sig, &actual_parameters, state);
+         if (!value) {
+            ir_variable *const tmp = new(ctx) ir_variable(glsl_type::void_type,
+                                                          "void_var",
+                                                          ir_var_temporary);
+            instructions->push_tail(tmp);
+            value = new(ctx) ir_dereference_variable(tmp);
+         }
       }
 
       return value;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index ca30dbc499f..b5c4ed9c667 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -1270,7 +1270,14 @@ ast_expression::do_hir(exec_list *instructions,
        *    applied to one operand that can make them match, in which
        *    case this conversion is done."
        */
-      if ((!apply_implicit_conversion(op[0]->type, op[1], state)
+
+      if (op[0]->type == glsl_type::void_type || op[1]->type == glsl_type::void_type) {
+         _mesa_glsl_error(& loc, state, "`%s':  wrong operand types: "
+                         "no operation `%1$s' exists that takes a left-hand "
+                         "operand of type 'void' or a right operand of type "
+                         "'void'", (this->oper == ast_equal) ? "==" : "!=");
+         error_emitted = true;
+      } else if ((!apply_implicit_conversion(op[0]->type, op[1], state)
            && !apply_implicit_conversion(op[1]->type, op[0], state))
           || (op[0]->type != op[1]->type)) {
          _mesa_glsl_error(& loc, state, "operands of `%s' must have the same "

From 7e0180d57d330bd8d3047e841086712376b2a1cc Mon Sep 17 00:00:00 2001
From: EdB <edb+mesa@sigluy.net>
Date: Tue, 7 Jul 2015 17:58:56 +0200
Subject: [PATCH 0348/1208] clover: little OpenCL status code logging clean

s/build_error/compile_error in order to match the stored OpenCL status code.
Make program::build catch and log every OpenCL error.
Make tgsi error triggering uniform with the llvm one.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../state_trackers/clover/core/compiler.hpp   |  3 +-
 .../state_trackers/clover/core/error.hpp      |  4 +--
 .../state_trackers/clover/core/program.cpp    |  4 +--
 .../state_trackers/clover/llvm/invocation.cpp | 18 ++++++------
 .../state_trackers/clover/tgsi/compiler.cpp   | 28 +++++++++++--------
 5 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp b/src/gallium/state_trackers/clover/core/compiler.hpp
index c68aa39db85..207641785ca 100644
--- a/src/gallium/state_trackers/clover/core/compiler.hpp
+++ b/src/gallium/state_trackers/clover/core/compiler.hpp
@@ -37,7 +37,8 @@ namespace clover {
                                const std::string &opts,
                                std::string &r_log);
 
-   module compile_program_tgsi(const std::string &source);
+   module compile_program_tgsi(const std::string &source,
+                               std::string &r_log);
 }
 
 #endif
diff --git a/src/gallium/state_trackers/clover/core/error.hpp b/src/gallium/state_trackers/clover/core/error.hpp
index 780b973383a..59a5af4c799 100644
--- a/src/gallium/state_trackers/clover/core/error.hpp
+++ b/src/gallium/state_trackers/clover/core/error.hpp
@@ -65,9 +65,9 @@ namespace clover {
       cl_int code;
    };
 
-   class build_error : public error {
+   class compile_error : public error {
    public:
-      build_error(const std::string &what = "") :
+      compile_error(const std::string &what = "") :
          error(CL_COMPILE_PROGRAM_FAILURE, what) {
       }
    };
diff --git a/src/gallium/state_trackers/clover/core/program.cpp b/src/gallium/state_trackers/clover/core/program.cpp
index 0d6cc402db7..6eebd9c5cda 100644
--- a/src/gallium/state_trackers/clover/core/program.cpp
+++ b/src/gallium/state_trackers/clover/core/program.cpp
@@ -56,14 +56,14 @@ program::build(const ref_vector<device> &devs, const char *opts,
 
          try {
             auto module = (dev.ir_format() == PIPE_SHADER_IR_TGSI ?
-                           compile_program_tgsi(_source) :
+                           compile_program_tgsi(_source, log) :
                            compile_program_llvm(_source, headers,
                                                 dev.ir_format(),
                                                 dev.ir_target(), build_opts(dev),
                                                 log));
             _binaries.insert({ &dev, module });
             _logs.insert({ &dev, log });
-         } catch (const build_error &) {
+         } catch (const error &) {
             _logs.insert({ &dev, log });
             throw;
          }
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 9b91fee9032..967284d8094 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -108,7 +108,7 @@ namespace {
          name, llvm::MemoryBuffer::getMemBuffer(source));
 
       if (!c.ExecuteAction(act))
-         throw build_error(log);
+         throw compile_error(log);
    }
 
    module
@@ -256,7 +256,7 @@ namespace {
       r_log = log;
 
       if (!ExecSuccess)
-         throw build_error();
+         throw compile_error();
 
       // Get address spaces map to be able to find kernel argument address space
       memcpy(address_spaces, c.getTarget().getAddressSpaceMap(),
@@ -485,7 +485,7 @@ namespace {
       LLVMDisposeMessage(err_message);
 
       if (err) {
-         throw build_error();
+         throw compile_error();
       }
    }
 
@@ -505,7 +505,7 @@ namespace {
       if (LLVMGetTargetFromTriple(triple.c_str(), &target, &error_message)) {
          r_log = std::string(error_message);
          LLVMDisposeMessage(error_message);
-         throw build_error();
+         throw compile_error();
       }
 
       LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
@@ -514,7 +514,7 @@ namespace {
 
       if (!tm) {
          r_log = "Could not create TargetMachine: " + triple;
-         throw build_error();
+         throw compile_error();
       }
 
       if (dump_asm) {
@@ -567,7 +567,7 @@ namespace {
             const char *name;
             if (gelf_getshdr(section, &symtab_header) != &symtab_header) {
                r_log = "Failed to read ELF section header.";
-               throw build_error();
+               throw compile_error();
             }
             name = elf_strptr(elf, section_str_index, symtab_header.sh_name);
            if (!strcmp(name, ".symtab")) {
@@ -577,9 +577,9 @@ namespace {
          }
          if (!symtab) {
             r_log = "Unable to find symbol table.";
-            throw build_error();
+            throw compile_error();
          }
-      } catch (build_error &e) {
+      } catch (compile_error &e) {
          elf_end(elf);
          throw e;
       }
@@ -650,7 +650,7 @@ namespace {
          stream.flush();
          *(std::string*)data = message;
 
-         throw build_error();
+         throw compile_error();
       }
    }
 
diff --git a/src/gallium/state_trackers/clover/tgsi/compiler.cpp b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
index b70104e7604..54cb747e6fb 100644
--- a/src/gallium/state_trackers/clover/tgsi/compiler.cpp
+++ b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
@@ -32,7 +32,7 @@ using namespace clover;
 
 namespace {
    void
-   read_header(const std::string &header, module &m) {
+   read_header(const std::string &header, module &m, std::string &r_log) {
       std::istringstream ls(header);
       std::string line;
 
@@ -45,8 +45,10 @@ namespace {
          if (!(ts >> name))
             continue;
 
-         if (!(ts >> offset))
-            throw build_error("invalid kernel start address");
+         if (!(ts >> offset)) {
+            r_log = "invalid kernel start address";
+            throw compile_error();
+         }
 
          while (ts >> tok) {
             if (tok == "scalar")
@@ -67,8 +69,10 @@ namespace {
                args.push_back({ module::argument::image3d_wr, 4 });
             else if (tok == "sampler")
                args.push_back({ module::argument::sampler, 0 });
-            else
-               throw build_error("invalid kernel argument");
+            else {
+               r_log = "invalid kernel argument";
+               throw compile_error();
+            }
          }
 
          m.syms.push_back({ name, 0, offset, args });
@@ -76,11 +80,13 @@ namespace {
    }
 
    void
-   read_body(const char *source, module &m) {
+   read_body(const char *source, module &m, std::string &r_log) {
       tgsi_token prog[1024];
 
-      if (!tgsi_text_translate(source, prog, Elements(prog)))
-         throw build_error("translate failed");
+      if (!tgsi_text_translate(source, prog, Elements(prog))) {
+         r_log = "translate failed";
+         throw compile_error();
+      }
 
       unsigned sz = tgsi_num_tokens(prog) * sizeof(tgsi_token);
       std::vector<char> data( (char *)prog, (char *)prog + sz );
@@ -89,13 +95,13 @@ namespace {
 }
 
 module
-clover::compile_program_tgsi(const std::string &source) {
+clover::compile_program_tgsi(const std::string &source, std::string &r_log) {
    const size_t body_pos = source.find("COMP\n");
    const char *body = &source[body_pos];
    module m;
 
-   read_header({ source.begin(), source.begin() + body_pos }, m);
-   read_body(body, m);
+   read_header({ source.begin(), source.begin() + body_pos }, m, r_log);
+   read_body(body, m, r_log);
 
    return m;
 }

From af768922cafa3eb3e78a2fdfee90380a74c79460 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 1 Jul 2015 16:32:24 +0300
Subject: [PATCH 0349/1208] i965/gen9: Use custom MOCS entries set up by the
 kernel.

Instead of relying on hardware defaults the i915 kernel driver is
going program custom MOCS tables system-wide on Gen9 hardware.  The
"WT" entry previously used for renderbuffers had a number of problems:
It disabled caching on eLLC, it used a reserved L3 cacheability
setting, and it used to override the PTE controls making renderbuffers
always WT on LLC regardless of the kernel's setting.  Instead use an
entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
L3CC=WB.

The "WB" entry previously used for anything other than renderbuffers
has moved to a different index in the new MOCS tables but it should
have the same caching semantics as the old entry.

Even though the corresponding kernel change ("drm/i915: Added
Programming of the MOCS") is in a way an ABI break it doesn't seem
necessary to check that the kernel is recent enough because the change
should only affect Gen9 which is still unreleased hardware.

v2: Update MOCS values for the new Android-incompatible tables
    introduced in v7 of the kernel patch.

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Reference: http://lists.freedesktop.org/archives/intel-gfx/2015-July/071080.html
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_defines.h        | 11 ++++++-----
 src/mesa/drivers/dri/i965/gen8_surface_state.c |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index b1a1c11b3ae..5bf53e375e8 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2492,12 +2492,13 @@ enum brw_wm_barycentric_interp_mode {
 #define BDW_MOCS_WT  0x58
 #define BDW_MOCS_PTE 0x18
 
-/* Skylake: MOCS is now an index into an array of 64 different configurable
- * cache settings.  We still use only either write-back or write-through; and
- * rely on the documented default values.
+/* Skylake: MOCS is now an index into an array of 62 different caching
+ * configurations programmed by the kernel.
  */
-#define SKL_MOCS_WB (0b001001 << 1)
-#define SKL_MOCS_WT (0b000101 << 1)
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (2 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (1 << 1)
 
 #define MEDIA_VFE_STATE                         0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index bd3eb00a439..6c4d3e197a5 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
       irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1));
    GLenum gl_target =
       rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
-   /* FINISHME: Use PTE MOCS on Skylake. */
-   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE;
+   const uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_PTE : BDW_MOCS_PTE;
 
    intel_miptree_used_for_rendering(mt);
 

From 26222932c013da3688e39dc831179659cc65c39a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 12 Jun 2015 14:24:17 +0200
Subject: [PATCH 0350/1208] gallium: add PIPE_CAP_MAX_SHADER_PATCH_VARYINGS

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/docs/source/screen.rst               | 4 ++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 15 files changed, 18 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 74636207d06..246428deb87 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -254,6 +254,10 @@ The integer capabilities:
   and size must be page-aligned.
 * ``PIPE_CAP_DEVICE_RESET_STATUS_QUERY``:
   Whether pipe_context::get_device_reset_status is implemented.
+* ``PIPE_CAP_MAX_SHADER_PATCH_VARYINGS``:
+  How many per-patch outputs and inputs are supported between tessellation
+  control and tessellation evaluation shaders, not counting in TESSINNER and
+  TESSOUTER. The minimum allowed value for OpenGL is 30.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 7b1b4e357c4..b28d315cc12 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -219,6 +219,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index dc0004366b1..6083687962f 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -243,6 +243,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index eb6c0466eb9..338643e65bb 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -465,6 +465,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 466c5d0cef8..1c6c82ea115 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -292,6 +292,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 4b3ed7dc157..b63de694fa7 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -163,6 +163,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 6583a353578..c32ad01eec3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -210,6 +210,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 3f52c85bc74..518968a75f9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -195,6 +195,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index e8accefe1a8..6292c2c4a03 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -191,6 +191,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 143e98ea0af..e8459288106 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -335,6 +335,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index a9dce2cdd32..2b6a6ff5522 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -297,6 +297,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 		return 0;
 
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index a688d319bb8..ed8e5457f27 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -242,6 +242,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index e2305bec359..ac5abe8f89d 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -309,6 +309,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_UMA:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index f63bead0fbb..c069454c2d1 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -176,6 +176,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b0cd23dbeff..2dc8798f3c7 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -605,6 +605,7 @@ enum pipe_cap
    PIPE_CAP_MULTISAMPLE_Z_RESOLVE,
    PIPE_CAP_RESOURCE_FROM_USER_MEMORY,
    PIPE_CAP_DEVICE_RESET_STATUS_QUERY,
+   PIPE_CAP_MAX_SHADER_PATCH_VARYINGS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From f9f79d29ce75c681c46bdbac5aa3f19ee1adb93b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 13:51:16 +0200
Subject: [PATCH 0351/1208] gallium: add BIND flags for R/W buffers and images

PIPE_CAPs and TGSI support will be added later. The TGSI support should be
straightforward. We only need to split TGSI_FILE_RESOURCE into TGSI_FILE_IMAGE
and TGSI_FILE_BUFFER, though duplicating all opcodes shouldn't be necessary.

The idea is:
* ARB_shader_image_load_store should use set_shader_images.
* ARB_shader_storage_buffer_object should use set_shader_buffers(slots 0..M-1)
  if M shader storage buffers are supported.
* ARB_shader_atomic_counters should use set_shader_buffers(slots M..N)
  if N-M+1 atomic counter buffers are supported.

PIPE_CAPs can describe various constraints for early DX11 hardware.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/auxiliary/util/u_debug.c            |  3 ++-
 src/gallium/docs/source/screen.rst              |  6 ++++--
 src/gallium/drivers/ilo/ilo_resource.c          |  2 +-
 src/gallium/drivers/nouveau/nouveau_buffer.c    |  3 ++-
 src/gallium/drivers/nouveau/nouveau_screen.c    |  3 ++-
 src/gallium/drivers/nouveau/nv50/nv50_formats.c |  2 +-
 src/gallium/include/pipe/p_defines.h            | 13 +++++++------
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 2d2d049b205..cf6eca77a39 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -758,7 +758,8 @@ debug_print_bind_flags(const char *msg, unsigned usage)
       DEBUG_NAMED_VALUE(PIPE_BIND_CURSOR),
       DEBUG_NAMED_VALUE(PIPE_BIND_CUSTOM),
       DEBUG_NAMED_VALUE(PIPE_BIND_GLOBAL),
-      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_RESOURCE),
+      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_BUFFER),
+      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_IMAGE),
       DEBUG_NAMED_VALUE(PIPE_BIND_COMPUTE_RESOURCE),
       DEBUG_NAMED_VALUE(PIPE_BIND_COMMAND_ARGS_BUFFER),
       DEBUG_NAMED_VALUE(PIPE_BIND_SCANOUT),
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 246428deb87..dbdccc7fcef 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -430,8 +430,10 @@ resources might be created and handled quite differently.
   process.
 * ``PIPE_BIND_GLOBAL``: A buffer that can be mapped into the global
   address space of a compute program.
-* ``PIPE_BIND_SHADER_RESOURCE``: A buffer or texture that can be
-  bound to the graphics pipeline as a shader resource.
+* ``PIPE_BIND_SHADER_BUFFER``: A buffer without a format that can be bound
+  to a shader and can be used with load, store, and atomic instructions.
+* ``PIPE_BIND_SHADER_IMAGE``: A buffer or texture with a format that can be
+  bound to a shader and can be used with load, store, and atomic instructions.
 * ``PIPE_BIND_COMPUTE_RESOURCE``: A buffer or texture that can be
   bound to the compute program as a shader resource.
 * ``PIPE_BIND_COMMAND_ARGS_BUFFER``: A buffer that may be sourced by the
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index 3797c8ec24f..9026ba9a983 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -210,7 +210,7 @@ resource_get_image_info(const struct pipe_resource *templ,
    info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW);
    info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET);
    info->bind_surface_dp_typed = (templ->bind &
-         (PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE));
+         (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE));
    info->bind_zs = (templ->bind & PIPE_BIND_DEPTH_STENCIL);
    info->bind_scanout = (templ->bind & PIPE_BIND_SCANOUT);
    info->bind_cursor = (templ->bind & PIPE_BIND_CURSOR);
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 09cdbb53ecb..23619467231 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -44,7 +44,8 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
 
    if (buf->base.bind & (PIPE_BIND_CONSTANT_BUFFER |
                          PIPE_BIND_COMPUTE_RESOURCE |
-                         PIPE_BIND_SHADER_RESOURCE))
+                         PIPE_BIND_SHADER_BUFFER |
+                         PIPE_BIND_SHADER_IMAGE))
       size = align(size, 0x100);
 
    if (domain == NOUVEAU_BO_VRAM) {
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index e5b3c159c84..0f6313c768c 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -209,7 +209,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 		PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
 		PIPE_BIND_CURSOR |
 		PIPE_BIND_SAMPLER_VIEW |
-		PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE |
+		PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
+                PIPE_BIND_COMPUTE_RESOURCE |
 		PIPE_BIND_GLOBAL;
 	screen->sysmem_bindings =
 		PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 0f86ba1de0d..49a93bf1d91 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -44,7 +44,7 @@
  */
 #define U_V   PIPE_BIND_VERTEX_BUFFER
 #define U_T   PIPE_BIND_SAMPLER_VIEW
-#define U_I   PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE
+#define U_I   PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE
 #define U_TR  PIPE_BIND_RENDER_TARGET | U_T
 #define U_IR  U_TR | U_I
 #define U_TB  PIPE_BIND_BLENDABLE | U_TR
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 2dc8798f3c7..68c536f1101 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -351,9 +351,10 @@ enum pipe_flush_flags
 #define PIPE_BIND_CURSOR               (1 << 11) /* mouse cursor */
 #define PIPE_BIND_CUSTOM               (1 << 12) /* state-tracker/winsys usages */
 #define PIPE_BIND_GLOBAL               (1 << 13) /* set_global_binding */
-#define PIPE_BIND_SHADER_RESOURCE      (1 << 14) /* set_shader_resources */
-#define PIPE_BIND_COMPUTE_RESOURCE     (1 << 15) /* set_compute_resources */
-#define PIPE_BIND_COMMAND_ARGS_BUFFER  (1 << 16) /* pipe_draw_info.indirect */
+#define PIPE_BIND_SHADER_BUFFER        (1 << 14) /* set_shader_buffers */
+#define PIPE_BIND_SHADER_IMAGE         (1 << 15) /* set_shader_images */
+#define PIPE_BIND_COMPUTE_RESOURCE     (1 << 16) /* set_compute_resources */
+#define PIPE_BIND_COMMAND_ARGS_BUFFER  (1 << 17) /* pipe_draw_info.indirect */
 
 /**
  * The first two flags above were previously part of the amorphous
@@ -374,9 +375,9 @@ enum pipe_flush_flags
  * The third flag has been added to be able to force textures to be created
  * in linear mode (no tiling).
  */
-#define PIPE_BIND_SCANOUT     (1 << 17) /*  */
-#define PIPE_BIND_SHARED      (1 << 18) /* get_texture_handle ??? */
-#define PIPE_BIND_LINEAR      (1 << 19)
+#define PIPE_BIND_SCANOUT     (1 << 18) /*  */
+#define PIPE_BIND_SHARED      (1 << 19) /* get_texture_handle ??? */
+#define PIPE_BIND_LINEAR      (1 << 20)
 
 
 /**

From b73bec0ecd43861337daf9663e242d2b44f36dbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 14:34:13 +0200
Subject: [PATCH 0352/1208] gallium: add new limits for shader buffers and
 images

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/ilo/ilo_state.h               | 2 +-
 src/gallium/include/pipe/p_state.h                | 3 ++-
 src/gallium/state_trackers/clover/core/device.cpp | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h
index 3e6fd8a2554..66c93007eb1 100644
--- a/src/gallium/drivers/ilo/ilo_state.h
+++ b/src/gallium/drivers/ilo/ilo_state.h
@@ -202,7 +202,7 @@ struct ilo_cbuf_state {
 };
 
 struct ilo_resource_state {
-   struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
+   struct pipe_surface *states[PIPE_MAX_SHADER_IMAGES];
    unsigned count;
 };
 
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index a18f12e8a87..1c529f7d078 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -61,7 +61,8 @@ extern "C" {
 #define PIPE_MAX_SHADER_INPUTS    80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_OUTPUTS   80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_SAMPLER_VIEWS 32
-#define PIPE_MAX_SHADER_RESOURCES 32
+#define PIPE_MAX_SHADER_BUFFERS   32
+#define PIPE_MAX_SHADER_IMAGES    32
 #define PIPE_MAX_TEXTURE_LEVELS   16
 #define PIPE_MAX_SO_BUFFERS        4
 #define PIPE_MAX_SO_OUTPUTS       64
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index c42d1d26004..6efff79c7f4 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -89,12 +89,12 @@ device::vendor_id() const {
 
 size_t
 device::max_images_read() const {
-   return PIPE_MAX_SHADER_RESOURCES;
+   return PIPE_MAX_SHADER_IMAGES;
 }
 
 size_t
 device::max_images_write() const {
-   return PIPE_MAX_SHADER_RESOURCES;
+   return PIPE_MAX_SHADER_IMAGES;
 }
 
 cl_uint

From 05a12c53a308965aba1c00f0caf36d8e0f32e035 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 14:48:33 +0200
Subject: [PATCH 0353/1208] gallium: add interface for writable shader images

PIPE_CAPs will be added some other time.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/auxiliary/util/u_debug_describe.c |  9 +++++
 src/gallium/auxiliary/util/u_debug_describe.h |  2 ++
 src/gallium/auxiliary/util/u_dump.h           |  3 ++
 src/gallium/auxiliary/util/u_dump_state.c     | 27 ++++++++++++++
 src/gallium/auxiliary/util/u_inlines.h        | 10 ++++++
 src/gallium/docs/source/context.rst           | 16 ++++-----
 src/gallium/drivers/ilo/ilo_state.c           | 10 +++---
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 10 +++---
 src/gallium/include/pipe/p_context.h          | 35 +++++++++++++------
 src/gallium/include/pipe/p_state.h            | 25 +++++++++++++
 10 files changed, 120 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c
index df73ed83ef6..f428d22d205 100644
--- a/src/gallium/auxiliary/util/u_debug_describe.c
+++ b/src/gallium/auxiliary/util/u_debug_describe.c
@@ -80,6 +80,15 @@ debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr)
    util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format));
 }
 
+void
+debug_describe_image_view(char* buf, const struct pipe_image_view *ptr)
+{
+   char res[128];
+   debug_describe_resource(res, ptr->resource);
+   util_sprintf(buf, "pipe_image_view<%s,%s>", res,
+                util_format_short_name(ptr->format));
+}
+
 void
 debug_describe_so_target(char* buf,
                          const struct pipe_stream_output_target *ptr)
diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h
index 4f7882b0b37..2172ecb4395 100644
--- a/src/gallium/auxiliary/util/u_debug_describe.h
+++ b/src/gallium/auxiliary/util/u_debug_describe.h
@@ -35,12 +35,14 @@ struct pipe_reference;
 struct pipe_resource;
 struct pipe_surface;
 struct pipe_sampler_view;
+struct pipe_image_view;
 
 /* a 256-byte buffer is necessary and sufficient */
 void debug_describe_reference(char* buf, const struct pipe_reference*ptr);
 void debug_describe_resource(char* buf, const struct pipe_resource *ptr);
 void debug_describe_surface(char* buf, const struct pipe_surface *ptr);
 void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr);
+void debug_describe_image_view(char* buf, const struct pipe_image_view *ptr);
 void debug_describe_so_target(char* buf,
                               const struct pipe_stream_output_target *ptr);
 
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index 58e7dfd8244..3ddf518fa2b 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -153,6 +153,9 @@ void
 util_dump_surface(FILE *stream,
                   const struct pipe_surface *state);
 
+void
+util_dump_image_view(FILE *stream, const struct pipe_image_view *state);
+
 void
 util_dump_transfer(FILE *stream,
                    const struct pipe_transfer *state);
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 7f620b50cf0..88027cbbc79 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -671,6 +671,33 @@ util_dump_surface(FILE *stream, const struct pipe_surface *state)
 }
 
 
+void
+util_dump_image_view(FILE *stream, const struct pipe_image_view *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_image_view");
+
+   util_dump_member(stream, ptr, state, resource);
+   util_dump_member(stream, format, state, format);
+
+   if (state->resource->target == PIPE_BUFFER) {
+      util_dump_member(stream, uint, state, u.buf.first_element);
+      util_dump_member(stream, uint, state, u.buf.last_element);
+   }
+   else {
+      util_dump_member(stream, uint, state, u.tex.first_layer);
+      util_dump_member(stream, uint, state, u.tex.last_layer);
+      util_dump_member(stream, uint, state, u.tex.level);
+   }
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_transfer(FILE *stream, const struct pipe_transfer *state)
 {
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 95401621ec3..661a949a4b1 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -173,6 +173,16 @@ pipe_sampler_view_release(struct pipe_context *ctx,
    *ptr = NULL;
 }
 
+static INLINE void
+pipe_image_view_reference(struct pipe_image_view **ptr, struct pipe_image_view *view)
+{
+   struct pipe_image_view *old_view = *ptr;
+
+   if (pipe_reference_described(&(*ptr)->reference, &view->reference,
+                                (debug_reference_descriptor)debug_describe_image_view))
+      old_view->context->image_view_destroy(old_view->context, old_view);
+   *ptr = view;
+}
 
 static INLINE void
 pipe_so_target_reference(struct pipe_stream_output_target **ptr,
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 0908ee7e058..a7d08d2c7f9 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -131,14 +131,14 @@ from a shader without an associated sampler.  This means that they
 have no support for floating point coordinates, address wrap modes or
 filtering.
 
-Shader resources are specified for all the shader stages at once using
-the ``set_shader_resources`` method.  When binding texture resources,
-the ``level``, ``first_layer`` and ``last_layer`` pipe_surface fields
-specify the mipmap level and the range of layers the texture will be
-constrained to.  In the case of buffers, ``first_element`` and
-``last_element`` specify the range within the buffer that will be used
-by the shader resource.  Writes to a shader resource are only allowed
-when the ``writable`` flag is set.
+There are 2 types of shader resources: buffers and images.
+
+Buffers are specified using the ``set_shader_buffers`` method.
+
+Images are specified using the ``set_shader_images`` method. When binding
+images, the ``level``, ``first_layer`` and ``last_layer`` pipe_image_view
+fields specify the mipmap level and the range of layers the image will be
+constrained to.
 
 Surfaces
 ^^^^^^^^
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index c13fa9c2e18..d89765a9d23 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -1849,10 +1849,11 @@ ilo_set_sampler_views(struct pipe_context *pipe, unsigned shader,
 }
 
 static void
-ilo_set_shader_resources(struct pipe_context *pipe,
-                         unsigned start, unsigned count,
-                         struct pipe_surface **surfaces)
+ilo_set_shader_images(struct pipe_context *pipe, unsigned shader,
+                      unsigned start, unsigned count,
+                      struct pipe_image_view **views)
 {
+#if 0
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
    struct ilo_resource_state *dst = &vec->resource;
    unsigned i;
@@ -1881,6 +1882,7 @@ ilo_set_shader_resources(struct pipe_context *pipe,
    }
 
    vec->dirty |= ILO_DIRTY_RESOURCE;
+#endif
 }
 
 static void
@@ -2345,7 +2347,7 @@ ilo_init_state_functions(struct ilo_context *ilo)
    ilo->base.set_scissor_states = ilo_set_scissor_states;
    ilo->base.set_viewport_states = ilo_set_viewport_states;
    ilo->base.set_sampler_views = ilo_set_sampler_views;
-   ilo->base.set_shader_resources = ilo_set_shader_resources;
+   ilo->base.set_shader_images = ilo_set_shader_images;
    ilo->base.set_vertex_buffers = ilo_set_vertex_buffers;
    ilo->base.set_index_buffer = ilo_set_index_buffer;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 6b7a211e71b..337559c3be8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -1125,13 +1125,15 @@ nvc0_set_compute_resources(struct pipe_context *pipe,
 }
 
 static void
-nvc0_set_shader_resources(struct pipe_context *pipe,
+nvc0_set_shader_images(struct pipe_context *pipe,
                           unsigned start, unsigned nr,
-                          struct pipe_surface **resources)
+                          struct pipe_image_view **views)
 {
-   nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources);
+#if 0
+   nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views);
 
    nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES;
+#endif
 }
 
 static INLINE void
@@ -1253,7 +1255,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
 
    pipe->set_global_binding = nvc0_set_global_bindings;
    pipe->set_compute_resources = nvc0_set_compute_resources;
-   pipe->set_shader_resources = nvc0_set_shader_resources;
+   pipe->set_shader_images = nvc0_set_shader_images;
 
    nvc0->sample_mask = ~0;
    nvc0->min_samples = 1;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index d2c2e4c8d14..76873956cc0 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -48,6 +48,7 @@ struct pipe_depth_stencil_alpha_state;
 struct pipe_draw_info;
 struct pipe_fence_handle;
 struct pipe_framebuffer_state;
+struct pipe_image_view;
 struct pipe_index_buffer;
 struct pipe_query;
 struct pipe_poly_stipple;
@@ -236,20 +237,21 @@ struct pipe_context {
                           const float default_inner_level[2]);
 
    /**
-    * Bind an array of shader resources that will be used by the
-    * graphics pipeline.  Any resources that were previously bound to
-    * the specified range will be unbound after this call.
+    * Bind an array of images that will be used by a shader.
+    * Any images that were previously bound to the specified range
+    * will be unbound.
     *
-    * \param start      first resource to bind.
-    * \param count      number of consecutive resources to bind.
-    * \param resources  array of pointers to the resources to bind, it
+    * \param shader     selects shader stage
+    * \param start_slot first image slot to bind.
+    * \param count      number of consecutive images to bind.
+    * \param buffers    array of pointers to the images to bind, it
     *                   should contain at least \a count elements
-    *                   unless it's NULL, in which case no new
-    *                   resources will be bound.
+    *                   unless it's NULL, in which case no images will
+    *                   be bound.
     */
-   void (*set_shader_resources)(struct pipe_context *,
-                                unsigned start, unsigned count,
-                                struct pipe_surface **resources);
+   void (*set_shader_images)(struct pipe_context *, unsigned shader,
+                             unsigned start_slot, unsigned count,
+                             struct pipe_image_view **images);
 
    void (*set_vertex_buffers)( struct pipe_context *,
                                unsigned start_slot,
@@ -397,6 +399,17 @@ struct pipe_context {
    void (*surface_destroy)(struct pipe_context *ctx,
                            struct pipe_surface *);
 
+   /**
+    * Create an image view into a buffer or texture to be used with load,
+    * store, and atomic instructions by a shader stage.
+    */
+   struct pipe_image_view * (*create_image_view)(struct pipe_context *ctx,
+                                                 struct pipe_resource *texture,
+                                                 const struct pipe_image_view *templat);
+
+   void (*image_view_destroy)(struct pipe_context *ctx,
+                              struct pipe_image_view *view);
+
    /**
     * Map a resource.
     *
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 1c529f7d078..78dc5785332 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -388,6 +388,31 @@ struct pipe_sampler_view
 };
 
 
+/**
+ * A view into a writable buffer or texture that can be bound to a shader
+ * stage.
+ */
+struct pipe_image_view
+{
+   struct pipe_reference reference;
+   struct pipe_resource *resource; /**< resource into which this is a view  */
+   struct pipe_context *context; /**< context this view belongs to */
+   enum pipe_format format;      /**< typed PIPE_FORMAT_x */
+
+   union {
+      struct {
+         unsigned first_layer:16;     /**< first layer to use for array textures */
+         unsigned last_layer:16;      /**< last layer to use for array textures */
+         unsigned level:8;            /**< mipmap level to use */
+      } tex;
+      struct {
+         unsigned first_element;
+         unsigned last_element;
+      } buf;
+   } u;
+};
+
+
 /**
  * Subregion of 1D/2D/3D image resource.
  */

From 8fba933ca2dd3c3487281135a9063b6ca9bed359 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 15:00:22 +0200
Subject: [PATCH 0354/1208] gallium: add interface for writable shader buffers

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/include/pipe/p_context.h | 18 ++++++++++++++++++
 src/gallium/include/pipe/p_state.h   | 10 ++++++++++
 2 files changed, 28 insertions(+)

diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 76873956cc0..f89dae98a2f 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -58,6 +58,7 @@ struct pipe_resource;
 struct pipe_sampler_state;
 struct pipe_sampler_view;
 struct pipe_scissor_state;
+struct pipe_shader_buffer;
 struct pipe_shader_state;
 struct pipe_stencil_ref;
 struct pipe_stream_output_target;
@@ -236,6 +237,23 @@ struct pipe_context {
                           const float default_outer_level[4],
                           const float default_inner_level[2]);
 
+   /**
+    * Bind an array of shader buffers that will be used by a shader.
+    * Any buffers that were previously bound to the specified range
+    * will be unbound.
+    *
+    * \param shader     selects shader stage
+    * \param start_slot first buffer slot to bind.
+    * \param count      number of consecutive buffers to bind.
+    * \param buffers    array of pointers to the buffers to bind, it
+    *                   should contain at least \a count elements
+    *                   unless it's NULL, in which case no buffers will
+    *                   be bound.
+    */
+   void (*set_shader_buffers)(struct pipe_context *, unsigned shader,
+                              unsigned start_slot, unsigned count,
+                              struct pipe_shader_buffer *buffers);
+
    /**
     * Bind an array of images that will be used by a shader.
     * Any images that were previously bound to the specified range
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 78dc5785332..a233610ead6 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -493,6 +493,16 @@ struct pipe_constant_buffer
 };
 
 
+/**
+ * An untyped shader buffer supporting loads, stores, and atomics.
+ */
+struct pipe_shader_buffer {
+   struct pipe_resource *buffer; /**< the actual buffer */
+   unsigned buffer_offset; /**< offset to start of data in buffer, in bytes */
+   unsigned buffer_size;   /**< how much data can be read in shader */
+};
+
+
 /**
  * A stream output target. The structure specifies the range vertices can
  * be written to.

From 3ee2daf23dc91b8dfc017b5c89c10ab1376ba4df Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 10 Jul 2015 19:18:39 +0300
Subject: [PATCH 0355/1208] i965: Implement b2f and b2i using negation.

Booleans are represented as 0/-1 on modern hardware which means we can
just negate them to convert them into a numeric type.  Negation has
the benefit that it can be implemented using a source modifier which
can easily be propagated into some other instruction.  shader-db
results on HSW:

total instructions in shared programs: 6349082 -> 6346693 (-0.04%)
instructions in affected programs:     40948 -> 38559 (-5.83%)
helped:                                123
HURT:                                  1
GAINED:                                1
LOST:                                  0

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp       | 4 +---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 7 +------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 10903a11c31..198703281e6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -968,10 +968,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_b2i:
-      bld.AND(result, op[0], fs_reg(1));
-      break;
    case nir_op_b2f:
-      bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u));
+      bld.MOV(result, negate(op[0]));
       break;
 
    case nir_op_f2b:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 67f2b5c29ff..8a352d33e2f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1733,16 +1733,11 @@ vec4_visitor::visit(ir_expression *ir)
       emit(MOV(result_dst, op[0]));
       break;
    case ir_unop_b2i:
-      emit(AND(result_dst, op[0], src_reg(1)));
-      break;
    case ir_unop_b2f:
       if (devinfo->gen <= 5) {
          resolve_bool_comparison(ir->operands[0], &op[0]);
       }
-      op[0].type = BRW_REGISTER_TYPE_D;
-      result_dst.type = BRW_REGISTER_TYPE_D;
-      emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
-      result_dst.type = BRW_REGISTER_TYPE_F;
+      emit(MOV(result_dst, negate(op[0])));
       break;
    case ir_unop_f2b:
       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));

From b00cd6e4a0f9a84d514f428428be348900236e2e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 9 Jul 2015 21:42:28 +0300
Subject: [PATCH 0356/1208] i965: Implement nir_op_uadd_carry and _usub_borrow
 without accumulator.

This gets rid of two no16() fall-backs and should allow better
scheduling of the generated IR.  There are no uses of usubBorrow() or
uaddCarry() in shader-db so no changes are expected.  However the
"arb_gpu_shader5/execution/built-in-functions/fs-usubBorrow" and
"arb_gpu_shader5/execution/built-in-functions/fs-uaddCarry" piglit
tests go from 40 to 28 instructions.  The reason is that the plain ADD
instruction can easily be CSE'ed with the original addition, and the
b2i negation can easily be propagated into the source modifier of
another instruction, so effectively both operations are performed with
just one instruction.

v2: Rely on carry_to_arith() and borrow_to_arith() to lower these
    (Ilia Mirkin).

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp      | 26 +++----------------
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  4 ++-
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 17 ++++--------
 3 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 198703281e6..3099dc447ec 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -835,29 +835,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
       break;
 
-   case nir_op_uadd_carry: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
+   case nir_op_uadd_carry:
+      unreachable("Should have been lowered by carry_to_arith().");
 
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
-      bld.MOV(result, fs_reg(acc));
-      break;
-   }
-
-   case nir_op_usub_borrow: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
-      bld.MOV(result, fs_reg(acc));
-      break;
-   }
+   case nir_op_usub_borrow:
+      unreachable("Should have been lowered by borrow_to_arith().");
 
    case nir_op_umod:
       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 3e3d78b9ad7..d66baf34b38 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -259,7 +259,9 @@ process_glsl_ir(struct brw_context *brw,
                       EXP_TO_EXP2 |
                       LOG_TO_LOG2 |
                       bitfield_insert |
-                      LDEXP_TO_ARITH);
+                      LDEXP_TO_ARITH |
+                      CARRY_TO_ARITH |
+                      BORROW_TO_ARITH);
 
    /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
     * if-statements need to be flattened.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 8a352d33e2f..f351bf4075b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1601,20 +1601,13 @@ vec4_visitor::visit(ir_expression *ir)
       assert(ir->type->is_integer());
       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
       break;
-   case ir_binop_carry: {
-      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
 
-      emit(ADDC(dst_null_ud(), op[0], op[1]));
-      emit(MOV(result_dst, src_reg(acc)));
-      break;
-   }
-   case ir_binop_borrow: {
-      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+   case ir_binop_carry:
+      unreachable("Should have been lowered by carry_to_arith().");
+
+   case ir_binop_borrow:
+      unreachable("Should have been lowered by borrow_to_arith().");
 
-      emit(SUBB(dst_null_ud(), op[0], op[1]));
-      emit(MOV(result_dst, src_reg(acc)));
-      break;
-   }
    case ir_binop_mod:
       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
       assert(ir->type->is_integer());

From 4bddd82bf3dae44c2b75cef34e9e85e15d63df7f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 14 Jul 2015 15:43:44 +0300
Subject: [PATCH 0357/1208] i965/fs: Factor out universally broken calculation
 of the register component size.

This in principle simple calculation was being open-coded in a number
of places (in a series I haven't yet sent for review there will be a
couple more), all of them were subtly broken in one way or another:
None of them were handling the HW_REG case correctly as pointed out by
Connor, and fs_inst::regs_read() was handling the stride=0 case rather
naively.  This patch solves both problems and factors out the
calculation as a new fs_reg method.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp          | 21 ++++++++++++-------
 src/mesa/drivers/dri/i965/brw_fs.h            |  6 +++---
 .../drivers/dri/i965/brw_fs_generator.cpp     |  2 +-
 src/mesa/drivers/dri/i965/brw_ir_fs.h         |  6 ++++++
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 189da1d553e..ff0675d146f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -78,8 +78,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case HW_REG:
    case MRF:
    case ATTR:
-      this->regs_written =
-         DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
+      this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
+                                        REG_SIZE);
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -443,6 +443,15 @@ fs_reg::is_contiguous() const
    return stride == 1;
 }
 
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+   const unsigned stride = (file != HW_REG ? this->stride :
+                            fixed_hw_reg.hstride == 0 ? 0 :
+                            1 << (fixed_hw_reg.hstride - 1));
+   return MAX2(width * stride, 1) * type_sz(type);
+}
+
 int
 fs_visitor::type_size(const struct glsl_type *type)
 {
@@ -702,12 +711,8 @@ fs_inst::regs_read(int arg) const
       return 1;
    case GRF:
    case HW_REG:
-      if (src[arg].stride == 0) {
-         return 1;
-      } else {
-         int size = components * this->exec_size * type_sz(src[arg].type);
-         return DIV_ROUND_UP(size * src[arg].stride, 32);
-      }
+      return DIV_ROUND_UP(components * src[arg].component_size(exec_size),
+                          REG_SIZE);
    case MRF:
       unreachable("MRF registers are not allowed as sources");
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 88a50ae0913..c00566667e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -70,14 +70,14 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
       break;
    case GRF:
    case MRF:
+   case HW_REG:
    case ATTR:
       return byte_offset(reg,
-                         delta * MAX2(bld.dispatch_width() * reg.stride, 1) *
-                         type_sz(reg.type));
+                         delta * reg.component_size(bld.dispatch_width()));
    case UNIFORM:
       reg.reg_offset += delta;
       break;
-   default:
+   case IMM:
       assert(delta == 0);
    }
    return reg;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index c986d91d988..bae72167972 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1601,7 +1601,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          /* If the instruction writes to more than one register, it needs to
           * be a "compressed" instruction on Gen <= 5.
           */
-         if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32)
+         if (inst->dst.component_size(inst->exec_size) > REG_SIZE)
             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
          else
             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index b48244ae4ca..693357f2796 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -48,6 +48,12 @@ public:
    bool equals(const fs_reg &r) const;
    bool is_contiguous() const;
 
+   /**
+    * Return the size in bytes of a single logical component of the
+    * register assuming the given execution width.
+    */
+   unsigned component_size(unsigned width) const;
+
    /** Smear a channel of the reg to all channels. */
    fs_reg &set_smear(unsigned subreg);
 

From 51e8d549e110f86cb7107cf712843aebd956fb9a Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Tue, 14 Jul 2015 09:56:09 -0700
Subject: [PATCH 0358/1208] i965: Push miptree tiling request into flags

With the last few patches a way was provided to influence lower layer miptree
layout and allocation decisions via flags (replacing bools). For simplicity, I
chose not to touch the tiling requests because the change was slightly less
mechanical than replacing the bools.

The goal is to organize the code so we can continue to add new parameters and
tiling types while minimizing risk to the existing code, and not having to
constantly add new function parameters.

v2: Rebased on Anuj's recent Yf/Ys changes
Fix non-msrt MCS allocation (was only happening in gen8 case before)

v3: small fix in assertion requested by Chad

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com> (v2)
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com> (v2)
Reviewed-by: Chad Versace <chad.versace@intel.com> (v2)
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c    | 21 +++++----
 src/mesa/drivers/dri/i965/intel_fbo.c         |  6 ++-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 46 ++++++++++---------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 15 +++---
 src/mesa/drivers/dri/i965/intel_tex.c         |  2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c   |  3 +-
 .../drivers/dri/i965/intel_tex_validate.c     |  5 +-
 7 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 389834f012a..a12b4af579e 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -614,8 +614,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
  */
 static uint32_t
 brw_miptree_choose_tiling(struct brw_context *brw,
-                          enum intel_miptree_tiling_mode requested,
-                          const struct intel_mipmap_tree *mt)
+                          const struct intel_mipmap_tree *mt,
+                          uint32_t layout_flags)
 {
    if (mt->format == MESA_FORMAT_S_UINT8) {
       /* The stencil buffer is W tiled. However, we request from the kernel a
@@ -624,15 +624,18 @@ brw_miptree_choose_tiling(struct brw_context *brw,
       return I915_TILING_NONE;
    }
 
+   /* Do not support changing the tiling for miptrees with pre-allocated BOs. */
+   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
+
    /* Some usages may want only one type of tiling, like depth miptrees (Y
     * tiled), or temporary BOs for uploading data once (linear).
     */
-   switch (requested) {
-   case INTEL_MIPTREE_TILING_ANY:
+   switch (layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) {
+   case MIPTREE_LAYOUT_ALLOC_ANY_TILED:
       break;
-   case INTEL_MIPTREE_TILING_Y:
+   case MIPTREE_LAYOUT_ALLOC_YTILED:
       return I915_TILING_Y;
-   case INTEL_MIPTREE_TILING_NONE:
+   case MIPTREE_LAYOUT_ALLOC_LINEAR:
       return I915_TILING_NONE;
    }
 
@@ -835,7 +838,6 @@ intel_miptree_can_use_tr_mode(const struct intel_mipmap_tree *mt)
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags)
 {
    const unsigned bpp = mt->cpp * 8;
@@ -852,8 +854,7 @@ brw_miptree_layout(struct brw_context *brw,
       !(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
       !mt->compressed &&
       _mesa_is_format_color_format(mt->format) &&
-      (requested == INTEL_MIPTREE_TILING_Y ||
-       requested == INTEL_MIPTREE_TILING_ANY) &&
+      (layout_flags & MIPTREE_LAYOUT_ALLOC_YTILED) &&
       (bpp && is_power_of_two(bpp)) &&
       /* FIXME: To avoid piglit regressions keep the Yf/Ys tiling
        * disabled at the moment.
@@ -897,7 +898,7 @@ brw_miptree_layout(struct brw_context *brw,
       if (layout_flags & MIPTREE_LAYOUT_FOR_BO)
          break;
 
-      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
+      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
       if (is_tr_mode_yf_ys_allowed) {
          if (intel_miptree_can_use_tr_mode(mt))
             break;
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 05e3f8b7ae2..26f895bf904 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1022,6 +1022,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
    struct intel_mipmap_tree *new_mt;
    int width, height, depth;
 
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+
    intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth);
 
    new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target,
@@ -1030,8 +1033,7 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
                                  irb->mt->num_samples,
-                                 INTEL_MIPTREE_TILING_ANY,
-                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                 layout_flags);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 15296518941..58675a1d719 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -272,7 +272,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint height0,
                             GLuint depth0,
                             GLuint num_samples,
-                            enum intel_miptree_tiling_mode requested,
                             uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
@@ -454,8 +453,10 @@ intel_miptree_create_layout(struct brw_context *brw,
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
       uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-      if (brw->gen == 6)
-         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
+      if (brw->gen == 6) {
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD |
+                          MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+      }
 
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
@@ -466,7 +467,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_height0,
                                             mt->logical_depth0,
                                             num_samples,
-                                            INTEL_MIPTREE_TILING_ANY,
                                             stencil_flags);
 
       if (!mt->stencil_mt) {
@@ -510,7 +510,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   brw_miptree_layout(brw, mt, requested, layout_flags);
+   brw_miptree_layout(brw, mt, layout_flags);
 
    if (mt->disable_aux_buffers)
       assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
@@ -616,7 +616,6 @@ intel_miptree_create(struct brw_context *brw,
                      GLuint height0,
                      GLuint depth0,
                      GLuint num_samples,
-                     enum intel_miptree_tiling_mode requested_tiling,
                      uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
@@ -634,7 +633,7 @@ intel_miptree_create(struct brw_context *brw,
    mt = intel_miptree_create_layout(brw, target, format,
                                     first_level, last_level, width0,
                                     height0, depth0, num_samples,
-                                    requested_tiling, layout_flags);
+                                    layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -757,17 +756,16 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
-   /* 'requested' parameter of intel_miptree_create_layout() is relevant
-    * only for non bo miptree. Tiling for bo is already computed above.
-    * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is
-    * just a place holder and will not make any change to the miptree
-    * tiling format.
+   /* The BO already has a tiling format and we shouldn't confuse the lower
+    * layers by making it try to find a tiling format again.
     */
+   assert(layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED == 0);
+   assert(layout_flags & MIPTREE_LAYOUT_ALLOC_LINEAR == 0);
+
    layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
                                     width, height, depth, 0,
-                                    INTEL_MIPTREE_TILING_ANY,
                                     layout_flags);
    if (!mt)
       return NULL;
@@ -875,11 +873,13 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    uint32_t depth = 1;
    bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
+   const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                 MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
                              width, height, depth, num_samples,
-                             INTEL_MIPTREE_TILING_ANY,
-                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                             layout_flags);
    if (!mt)
       goto fail;
 
@@ -1384,6 +1384,8 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
     *
     *     "The MCS surface must be stored as Tile Y."
     */
+   const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                              MIPTREE_LAYOUT_ALLOC_YTILED;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1393,8 +1395,7 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_height0,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
-                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                     mcs_flags);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1442,9 +1443,11 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-   if (brw->gen >= 8)
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_ALLOC_YTILED;
+   if (brw->gen >= 8) {
       layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   }
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1454,7 +1457,6 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_height,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
                                      layout_flags);
 
    return mt->mcs_mt;
@@ -1707,6 +1709,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
    if (!buf)
       return NULL;
 
+   layout_flags |= MIPTREE_LAYOUT_ALLOC_ANY_TILED;
    buf->mt = intel_miptree_create(brw,
                                   mt->target,
                                   mt->format,
@@ -1716,7 +1719,6 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_height0,
                                   mt->logical_depth0,
                                   mt->num_samples,
-                                  INTEL_MIPTREE_TILING_ANY,
                                   layout_flags);
    if (!buf->mt) {
       free(buf);
@@ -2147,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, INTEL_MIPTREE_TILING_NONE, 0);
+                                  0, 0);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index bde6daa4e2d..89fdccb1730 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -516,12 +516,6 @@ struct intel_mipmap_tree
    GLuint refcount;
 };
 
-enum intel_miptree_tiling_mode {
-   INTEL_MIPTREE_TILING_ANY,
-   INTEL_MIPTREE_TILING_Y,
-   INTEL_MIPTREE_TILING_NONE,
-};
-
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
@@ -541,8 +535,15 @@ enum {
    MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
    MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
+
+   MIPTREE_LAYOUT_ALLOC_YTILED             = 1 << 5,
+   MIPTREE_LAYOUT_ALLOC_XTILED             = 1 << 6,
+   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 7,
 };
 
+#define MIPTREE_LAYOUT_ALLOC_ANY_TILED (MIPTREE_LAYOUT_ALLOC_YTILED | \
+                                        MIPTREE_LAYOUT_ALLOC_XTILED)
+
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,
@@ -552,7 +553,6 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint num_samples,
-                                               enum intel_miptree_tiling_mode,
                                                uint32_t flags);
 
 struct intel_mipmap_tree *
@@ -771,7 +771,6 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index b0181ad1d75..8fa5e3cd55a 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               0, levels - 1,
                                               width, height, depth,
                                               num_samples,
-                                              INTEL_MIPTREE_TILING_ANY, 0);
+                                              MIPTREE_LAYOUT_ALLOC_ANY_TILED);
 
       if (intel_texobj->mt == NULL) {
          return false;
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index e077d5e4743..226aaeb4d54 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -80,8 +80,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       height,
 			       depth,
                                intelImage->base.Base.NumSamples,
-                               INTEL_MIPTREE_TILING_ANY,
-                               layout_flags);
+                               layout_flags | MIPTREE_LAYOUT_ALLOC_ANY_TILED);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 4991c2997ef..6ebf381e626 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -136,6 +136,8 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                  _mesa_get_format_name(firstImage->base.Base.TexFormat),
                  width, height, depth, validate_last_level + 1);
 
+      const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                    MIPTREE_LAYOUT_ALLOC_ANY_TILED;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,
@@ -145,8 +147,7 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           height,
                                           depth,
                                           0 /* num_samples */,
-                                          INTEL_MIPTREE_TILING_ANY,
-                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                          layout_flags);
       if (!intelObj->mt)
          return false;
    }

From ef42352ff4e1feeea7338db73f540038c6755472 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Thu, 16 Jul 2015 16:52:08 -0700
Subject: [PATCH 0359/1208] Revert "i965: Push miptree tiling request into
 flags"

This reverts commit 51e8d549e110f86cb7107cf712843aebd956fb9a.
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c    | 21 ++++-----
 src/mesa/drivers/dri/i965/intel_fbo.c         |  6 +--
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 46 +++++++++----------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 15 +++---
 src/mesa/drivers/dri/i965/intel_tex.c         |  2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c   |  3 +-
 .../drivers/dri/i965/intel_tex_validate.c     |  5 +-
 7 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index a12b4af579e..389834f012a 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -614,8 +614,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
  */
 static uint32_t
 brw_miptree_choose_tiling(struct brw_context *brw,
-                          const struct intel_mipmap_tree *mt,
-                          uint32_t layout_flags)
+                          enum intel_miptree_tiling_mode requested,
+                          const struct intel_mipmap_tree *mt)
 {
    if (mt->format == MESA_FORMAT_S_UINT8) {
       /* The stencil buffer is W tiled. However, we request from the kernel a
@@ -624,18 +624,15 @@ brw_miptree_choose_tiling(struct brw_context *brw,
       return I915_TILING_NONE;
    }
 
-   /* Do not support changing the tiling for miptrees with pre-allocated BOs. */
-   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
-
    /* Some usages may want only one type of tiling, like depth miptrees (Y
     * tiled), or temporary BOs for uploading data once (linear).
     */
-   switch (layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) {
-   case MIPTREE_LAYOUT_ALLOC_ANY_TILED:
+   switch (requested) {
+   case INTEL_MIPTREE_TILING_ANY:
       break;
-   case MIPTREE_LAYOUT_ALLOC_YTILED:
+   case INTEL_MIPTREE_TILING_Y:
       return I915_TILING_Y;
-   case MIPTREE_LAYOUT_ALLOC_LINEAR:
+   case INTEL_MIPTREE_TILING_NONE:
       return I915_TILING_NONE;
    }
 
@@ -838,6 +835,7 @@ intel_miptree_can_use_tr_mode(const struct intel_mipmap_tree *mt)
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags)
 {
    const unsigned bpp = mt->cpp * 8;
@@ -854,7 +852,8 @@ brw_miptree_layout(struct brw_context *brw,
       !(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
       !mt->compressed &&
       _mesa_is_format_color_format(mt->format) &&
-      (layout_flags & MIPTREE_LAYOUT_ALLOC_YTILED) &&
+      (requested == INTEL_MIPTREE_TILING_Y ||
+       requested == INTEL_MIPTREE_TILING_ANY) &&
       (bpp && is_power_of_two(bpp)) &&
       /* FIXME: To avoid piglit regressions keep the Yf/Ys tiling
        * disabled at the moment.
@@ -898,7 +897,7 @@ brw_miptree_layout(struct brw_context *brw,
       if (layout_flags & MIPTREE_LAYOUT_FOR_BO)
          break;
 
-      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
+      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
       if (is_tr_mode_yf_ys_allowed) {
          if (intel_miptree_can_use_tr_mode(mt))
             break;
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 26f895bf904..05e3f8b7ae2 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1022,9 +1022,6 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
    struct intel_mipmap_tree *new_mt;
    int width, height, depth;
 
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                           MIPTREE_LAYOUT_ALLOC_ANY_TILED;
-
    intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth);
 
    new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target,
@@ -1033,7 +1030,8 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
                                  irb->mt->num_samples,
-                                 layout_flags);
+                                 INTEL_MIPTREE_TILING_ANY,
+                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 58675a1d719..15296518941 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -272,6 +272,7 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint height0,
                             GLuint depth0,
                             GLuint num_samples,
+                            enum intel_miptree_tiling_mode requested,
                             uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
@@ -453,10 +454,8 @@ intel_miptree_create_layout(struct brw_context *brw,
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
       uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-      if (brw->gen == 6) {
-         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD |
-                          MIPTREE_LAYOUT_ALLOC_ANY_TILED;
-      }
+      if (brw->gen == 6)
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
 
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
@@ -467,6 +466,7 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_height0,
                                             mt->logical_depth0,
                                             num_samples,
+                                            INTEL_MIPTREE_TILING_ANY,
                                             stencil_flags);
 
       if (!mt->stencil_mt) {
@@ -510,7 +510,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   brw_miptree_layout(brw, mt, layout_flags);
+   brw_miptree_layout(brw, mt, requested, layout_flags);
 
    if (mt->disable_aux_buffers)
       assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
@@ -616,6 +616,7 @@ intel_miptree_create(struct brw_context *brw,
                      GLuint height0,
                      GLuint depth0,
                      GLuint num_samples,
+                     enum intel_miptree_tiling_mode requested_tiling,
                      uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
@@ -633,7 +634,7 @@ intel_miptree_create(struct brw_context *brw,
    mt = intel_miptree_create_layout(brw, target, format,
                                     first_level, last_level, width0,
                                     height0, depth0, num_samples,
-                                    layout_flags);
+                                    requested_tiling, layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -756,16 +757,17 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
-   /* The BO already has a tiling format and we shouldn't confuse the lower
-    * layers by making it try to find a tiling format again.
+   /* 'requested' parameter of intel_miptree_create_layout() is relevant
+    * only for non bo miptree. Tiling for bo is already computed above.
+    * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is
+    * just a place holder and will not make any change to the miptree
+    * tiling format.
     */
-   assert(layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED == 0);
-   assert(layout_flags & MIPTREE_LAYOUT_ALLOC_LINEAR == 0);
-
    layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
                                     width, height, depth, 0,
+                                    INTEL_MIPTREE_TILING_ANY,
                                     layout_flags);
    if (!mt)
       return NULL;
@@ -873,13 +875,11 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    uint32_t depth = 1;
    bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
-   const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                 MIPTREE_LAYOUT_ALLOC_ANY_TILED;
-
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
                              width, height, depth, num_samples,
-                             layout_flags);
+                             INTEL_MIPTREE_TILING_ANY,
+                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
    if (!mt)
       goto fail;
 
@@ -1384,8 +1384,6 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
     *
     *     "The MCS surface must be stored as Tile Y."
     */
-   const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                              MIPTREE_LAYOUT_ALLOC_YTILED;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1395,7 +1393,8 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_height0,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     mcs_flags);
+                                     INTEL_MIPTREE_TILING_Y,
+                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1443,11 +1442,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                           MIPTREE_LAYOUT_ALLOC_YTILED;
-   if (brw->gen >= 8) {
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+   if (brw->gen >= 8)
       layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
-   }
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1457,6 +1454,7 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_height,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
+                                     INTEL_MIPTREE_TILING_Y,
                                      layout_flags);
 
    return mt->mcs_mt;
@@ -1709,7 +1707,6 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
    if (!buf)
       return NULL;
 
-   layout_flags |= MIPTREE_LAYOUT_ALLOC_ANY_TILED;
    buf->mt = intel_miptree_create(brw,
                                   mt->target,
                                   mt->format,
@@ -1719,6 +1716,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_height0,
                                   mt->logical_depth0,
                                   mt->num_samples,
+                                  INTEL_MIPTREE_TILING_ANY,
                                   layout_flags);
    if (!buf->mt) {
       free(buf);
@@ -2149,7 +2147,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, 0);
+                                  0, INTEL_MIPTREE_TILING_NONE, 0);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 89fdccb1730..bde6daa4e2d 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -516,6 +516,12 @@ struct intel_mipmap_tree
    GLuint refcount;
 };
 
+enum intel_miptree_tiling_mode {
+   INTEL_MIPTREE_TILING_ANY,
+   INTEL_MIPTREE_TILING_Y,
+   INTEL_MIPTREE_TILING_NONE,
+};
+
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
@@ -535,15 +541,8 @@ enum {
    MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
    MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
-
-   MIPTREE_LAYOUT_ALLOC_YTILED             = 1 << 5,
-   MIPTREE_LAYOUT_ALLOC_XTILED             = 1 << 6,
-   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 7,
 };
 
-#define MIPTREE_LAYOUT_ALLOC_ANY_TILED (MIPTREE_LAYOUT_ALLOC_YTILED | \
-                                        MIPTREE_LAYOUT_ALLOC_XTILED)
-
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,
@@ -553,6 +552,7 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint num_samples,
+                                               enum intel_miptree_tiling_mode,
                                                uint32_t flags);
 
 struct intel_mipmap_tree *
@@ -771,6 +771,7 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 8fa5e3cd55a..b0181ad1d75 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               0, levels - 1,
                                               width, height, depth,
                                               num_samples,
-                                              MIPTREE_LAYOUT_ALLOC_ANY_TILED);
+                                              INTEL_MIPTREE_TILING_ANY, 0);
 
       if (intel_texobj->mt == NULL) {
          return false;
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 226aaeb4d54..e077d5e4743 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -80,7 +80,8 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       height,
 			       depth,
                                intelImage->base.Base.NumSamples,
-                               layout_flags | MIPTREE_LAYOUT_ALLOC_ANY_TILED);
+                               INTEL_MIPTREE_TILING_ANY,
+                               layout_flags);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 6ebf381e626..4991c2997ef 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -136,8 +136,6 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                  _mesa_get_format_name(firstImage->base.Base.TexFormat),
                  width, height, depth, validate_last_level + 1);
 
-      const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                    MIPTREE_LAYOUT_ALLOC_ANY_TILED;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,
@@ -147,7 +145,8 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           height,
                                           depth,
                                           0 /* num_samples */,
-                                          layout_flags);
+                                          INTEL_MIPTREE_TILING_ANY,
+                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
       if (!intelObj->mt)
          return false;
    }

From 3a31876600cb5c4d90c998ecb5635c602eeb2bd1 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Tue, 14 Jul 2015 09:56:09 -0700
Subject: [PATCH 0360/1208] i965: Push miptree tiling request into flags

With the last few patches a way was provided to influence lower layer miptree
layout and allocation decisions via flags (replacing bools). For simplicity, I
chose not to touch the tiling requests because the change was slightly less
mechanical than replacing the bools.

The goal is to organize the code so we can continue to add new parameters and
tiling types while minimizing risk to the existing code, and not having to
constantly add new function parameters.

v2: Rebased on Anuj's recent Yf/Ys changes
Fix non-msrt MCS allocation (was only happening in gen8 case before)

v3: small fix in assertion requested by Chad

v4: Use parens to get the order right from v3.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c    | 21 +++++----
 src/mesa/drivers/dri/i965/intel_fbo.c         |  6 ++-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 46 ++++++++++---------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 15 +++---
 src/mesa/drivers/dri/i965/intel_tex.c         |  2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c   |  3 +-
 .../drivers/dri/i965/intel_tex_validate.c     |  5 +-
 7 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 389834f012a..a12b4af579e 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -614,8 +614,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
  */
 static uint32_t
 brw_miptree_choose_tiling(struct brw_context *brw,
-                          enum intel_miptree_tiling_mode requested,
-                          const struct intel_mipmap_tree *mt)
+                          const struct intel_mipmap_tree *mt,
+                          uint32_t layout_flags)
 {
    if (mt->format == MESA_FORMAT_S_UINT8) {
       /* The stencil buffer is W tiled. However, we request from the kernel a
@@ -624,15 +624,18 @@ brw_miptree_choose_tiling(struct brw_context *brw,
       return I915_TILING_NONE;
    }
 
+   /* Do not support changing the tiling for miptrees with pre-allocated BOs. */
+   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
+
    /* Some usages may want only one type of tiling, like depth miptrees (Y
     * tiled), or temporary BOs for uploading data once (linear).
     */
-   switch (requested) {
-   case INTEL_MIPTREE_TILING_ANY:
+   switch (layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) {
+   case MIPTREE_LAYOUT_ALLOC_ANY_TILED:
       break;
-   case INTEL_MIPTREE_TILING_Y:
+   case MIPTREE_LAYOUT_ALLOC_YTILED:
       return I915_TILING_Y;
-   case INTEL_MIPTREE_TILING_NONE:
+   case MIPTREE_LAYOUT_ALLOC_LINEAR:
       return I915_TILING_NONE;
    }
 
@@ -835,7 +838,6 @@ intel_miptree_can_use_tr_mode(const struct intel_mipmap_tree *mt)
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags)
 {
    const unsigned bpp = mt->cpp * 8;
@@ -852,8 +854,7 @@ brw_miptree_layout(struct brw_context *brw,
       !(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
       !mt->compressed &&
       _mesa_is_format_color_format(mt->format) &&
-      (requested == INTEL_MIPTREE_TILING_Y ||
-       requested == INTEL_MIPTREE_TILING_ANY) &&
+      (layout_flags & MIPTREE_LAYOUT_ALLOC_YTILED) &&
       (bpp && is_power_of_two(bpp)) &&
       /* FIXME: To avoid piglit regressions keep the Yf/Ys tiling
        * disabled at the moment.
@@ -897,7 +898,7 @@ brw_miptree_layout(struct brw_context *brw,
       if (layout_flags & MIPTREE_LAYOUT_FOR_BO)
          break;
 
-      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
+      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
       if (is_tr_mode_yf_ys_allowed) {
          if (intel_miptree_can_use_tr_mode(mt))
             break;
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 05e3f8b7ae2..26f895bf904 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1022,6 +1022,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
    struct intel_mipmap_tree *new_mt;
    int width, height, depth;
 
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+
    intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth);
 
    new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target,
@@ -1030,8 +1033,7 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
                                  irb->mt->num_samples,
-                                 INTEL_MIPTREE_TILING_ANY,
-                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                 layout_flags);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 15296518941..3f22a87a742 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -272,7 +272,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint height0,
                             GLuint depth0,
                             GLuint num_samples,
-                            enum intel_miptree_tiling_mode requested,
                             uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
@@ -454,8 +453,10 @@ intel_miptree_create_layout(struct brw_context *brw,
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
       uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-      if (brw->gen == 6)
-         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
+      if (brw->gen == 6) {
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD |
+                          MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+      }
 
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
@@ -466,7 +467,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_height0,
                                             mt->logical_depth0,
                                             num_samples,
-                                            INTEL_MIPTREE_TILING_ANY,
                                             stencil_flags);
 
       if (!mt->stencil_mt) {
@@ -510,7 +510,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   brw_miptree_layout(brw, mt, requested, layout_flags);
+   brw_miptree_layout(brw, mt, layout_flags);
 
    if (mt->disable_aux_buffers)
       assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
@@ -616,7 +616,6 @@ intel_miptree_create(struct brw_context *brw,
                      GLuint height0,
                      GLuint depth0,
                      GLuint num_samples,
-                     enum intel_miptree_tiling_mode requested_tiling,
                      uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
@@ -634,7 +633,7 @@ intel_miptree_create(struct brw_context *brw,
    mt = intel_miptree_create_layout(brw, target, format,
                                     first_level, last_level, width0,
                                     height0, depth0, num_samples,
-                                    requested_tiling, layout_flags);
+                                    layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -757,17 +756,16 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
-   /* 'requested' parameter of intel_miptree_create_layout() is relevant
-    * only for non bo miptree. Tiling for bo is already computed above.
-    * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is
-    * just a place holder and will not make any change to the miptree
-    * tiling format.
+   /* The BO already has a tiling format and we shouldn't confuse the lower
+    * layers by making it try to find a tiling format again.
     */
+   assert((layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_ALLOC_LINEAR) == 0);
+
    layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
                                     width, height, depth, 0,
-                                    INTEL_MIPTREE_TILING_ANY,
                                     layout_flags);
    if (!mt)
       return NULL;
@@ -875,11 +873,13 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    uint32_t depth = 1;
    bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
+   const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                 MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
                              width, height, depth, num_samples,
-                             INTEL_MIPTREE_TILING_ANY,
-                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                             layout_flags);
    if (!mt)
       goto fail;
 
@@ -1384,6 +1384,8 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
     *
     *     "The MCS surface must be stored as Tile Y."
     */
+   const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                              MIPTREE_LAYOUT_ALLOC_YTILED;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1393,8 +1395,7 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_height0,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
-                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                     mcs_flags);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1442,9 +1443,11 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-   if (brw->gen >= 8)
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_ALLOC_YTILED;
+   if (brw->gen >= 8) {
       layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   }
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1454,7 +1457,6 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_height,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
                                      layout_flags);
 
    return mt->mcs_mt;
@@ -1707,6 +1709,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
    if (!buf)
       return NULL;
 
+   layout_flags |= MIPTREE_LAYOUT_ALLOC_ANY_TILED;
    buf->mt = intel_miptree_create(brw,
                                   mt->target,
                                   mt->format,
@@ -1716,7 +1719,6 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_height0,
                                   mt->logical_depth0,
                                   mt->num_samples,
-                                  INTEL_MIPTREE_TILING_ANY,
                                   layout_flags);
    if (!buf->mt) {
       free(buf);
@@ -2147,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, INTEL_MIPTREE_TILING_NONE, 0);
+                                  0, 0);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index bde6daa4e2d..89fdccb1730 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -516,12 +516,6 @@ struct intel_mipmap_tree
    GLuint refcount;
 };
 
-enum intel_miptree_tiling_mode {
-   INTEL_MIPTREE_TILING_ANY,
-   INTEL_MIPTREE_TILING_Y,
-   INTEL_MIPTREE_TILING_NONE,
-};
-
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
@@ -541,8 +535,15 @@ enum {
    MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
    MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
+
+   MIPTREE_LAYOUT_ALLOC_YTILED             = 1 << 5,
+   MIPTREE_LAYOUT_ALLOC_XTILED             = 1 << 6,
+   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 7,
 };
 
+#define MIPTREE_LAYOUT_ALLOC_ANY_TILED (MIPTREE_LAYOUT_ALLOC_YTILED | \
+                                        MIPTREE_LAYOUT_ALLOC_XTILED)
+
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,
@@ -552,7 +553,6 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint num_samples,
-                                               enum intel_miptree_tiling_mode,
                                                uint32_t flags);
 
 struct intel_mipmap_tree *
@@ -771,7 +771,6 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index b0181ad1d75..8fa5e3cd55a 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               0, levels - 1,
                                               width, height, depth,
                                               num_samples,
-                                              INTEL_MIPTREE_TILING_ANY, 0);
+                                              MIPTREE_LAYOUT_ALLOC_ANY_TILED);
 
       if (intel_texobj->mt == NULL) {
          return false;
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index e077d5e4743..226aaeb4d54 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -80,8 +80,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       height,
 			       depth,
                                intelImage->base.Base.NumSamples,
-                               INTEL_MIPTREE_TILING_ANY,
-                               layout_flags);
+                               layout_flags | MIPTREE_LAYOUT_ALLOC_ANY_TILED);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 4991c2997ef..6ebf381e626 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -136,6 +136,8 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                  _mesa_get_format_name(firstImage->base.Base.TexFormat),
                  width, height, depth, validate_last_level + 1);
 
+      const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                    MIPTREE_LAYOUT_ALLOC_ANY_TILED;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,
@@ -145,8 +147,7 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           height,
                                           depth,
                                           0 /* num_samples */,
-                                          INTEL_MIPTREE_TILING_ANY,
-                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                          layout_flags);
       if (!intelObj->mt)
          return false;
    }

From 4b17f0d9f58637300b0748d1fb702a7e4d51979f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sat, 4 Jul 2015 19:15:16 -0700
Subject: [PATCH 0361/1208] program: Allow redundant OPTION ARB_fog_*
 directives.

A fragment program from "Pixel Piracy" contains redundant OPTION
directives:

!!ARBfp1.0
OPTION ARB_precision_hint_fastest;
OPTION ARB_fog_exp2;
OPTION ARB_precision_hint_fastest;
OPTION ARB_fog_exp2;
...

We already allow redundant ARB_precision_hint_fastest directives, but
disallow the redundant (yet consistent) ARB_fog_exp2 directives, failing
to compile the program.

The specification seems to contradict itself - the main text says that
only one fog application option may be specified, but then backpedals,
indicating the intent is to disallow /contradictory/ flags.  One of the
issues suggests that specifying contradictory ones is stupid, but
allowed, and only the last one should take effect.

Accepting multiple redundant (but consistent) directives seems harmless,
and like a reasonable interpretation of the specification.  It also
fixes a fragment program found in the wild.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/program/program_parse_extra.c | 50 +++++++++++++++++++-------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c
index 32b54afc57b..71f86d13ace 100644
--- a/src/mesa/program/program_parse_extra.c
+++ b/src/mesa/program/program_parse_extra.c
@@ -163,6 +163,8 @@ _mesa_ARBvp_parse_option(struct asm_parser_state *state, const char *option)
 int
 _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option)
 {
+   unsigned fog_option;
+
    /* All of the options currently supported start with "ARB_".  The code is
     * currently structured with nested if-statements because eventually options
     * that start with "NV_" will be supported.  This structure will result in
@@ -177,20 +179,42 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option)
       if (strncmp(option, "fog_", 4) == 0) {
 	 option += 4;
 
-	 if (state->option.Fog == OPTION_NONE) {
-	    if (strcmp(option, "exp") == 0) {
-	       state->option.Fog = OPTION_FOG_EXP;
-	       return 1;
-	    } else if (strcmp(option, "exp2") == 0) {
-	       state->option.Fog = OPTION_FOG_EXP2;
-	       return 1;
-	    } else if (strcmp(option, "linear") == 0) {
-	       state->option.Fog = OPTION_FOG_LINEAR;
-	       return 1;
-	    }
-	 }
+         if (strcmp(option, "exp") == 0) {
+            fog_option = OPTION_FOG_EXP;
+         } else if (strcmp(option, "exp2") == 0) {
+            fog_option = OPTION_FOG_EXP2;
+         } else if (strcmp(option, "linear") == 0) {
+            fog_option = OPTION_FOG_LINEAR;
+         } else {
+            /* invalid option */
+            return 0;
+         }
 
-	 return 0;
+         if (state->option.Fog == OPTION_NONE) {
+            state->option.Fog = fog_option;
+            return 1;
+         }
+
+         /* The ARB_fragment_program specification instructs us to handle
+          * redundant options in two seemingly contradictory ways:
+          *
+          * Section 3.11.4.5.1 says:
+          * "Only one fog application option may be specified by any given
+          *  fragment program.  A fragment program that specifies more than one
+          *  of the program options "ARB_fog_exp", "ARB_fog_exp2", and
+          *  "ARB_fog_linear", will fail to load."
+          *
+          * Issue 27 says:
+          * "The three mandatory options are ARB_fog_exp, ARB_fog_exp2, and
+          *  ARB_fog_linear.  As these options are mutually exclusive by
+          *  nature, specifying more than one is not useful.  If more than one
+          *  is specified, the last one encountered in the <optionSequence>
+          *  will be the one to actually modify the execution environment."
+          *
+          * We choose to allow programs to specify the same OPTION redundantly,
+          * but fail to load programs that specify contradictory options.
+          */
+         return state->option.Fog == fog_option ? 1 : 0;
       } else if (strncmp(option, "precision_hint_", 15) == 0) {
 	 option += 15;
 

From 7e337859ff98a0caf00fd201a5389933d42d0baa Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Thu, 16 Jul 2015 15:04:43 -0700
Subject: [PATCH 0362/1208] i965/cs: Return 1 for regs_read on
 CS_OPCODE_CS_TERMINATE

This prevents an assertion failure in brw_fs_live_variables.cpp,
fs_live_variables::setup_one_read: Assertion `var < num_vars' failed.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ff0675d146f..2e3eb05b796 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -698,6 +698,9 @@ fs_inst::regs_read(int arg) const
          return 1;
       break;
 
+   case CS_OPCODE_CS_TERMINATE:
+      return 1;
+
    default:
       if (is_tex() && arg == 0 && src[0].file == GRF)
          return mlen;

From 01cdbba341b47972a743e7f192d3554010d0da84 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Thu, 16 Jul 2015 15:07:05 -0700
Subject: [PATCH 0363/1208] i965/cs: Use dispatch width of 8 for cs terminate
 payload setup

This prevents an assertion failure in brw_fs_live_variables.cpp,
fs_live_variables::setup_one_write: Assertion `var < num_vars' failed.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 94d6a584424..d6a60a7cff7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1960,7 +1960,7 @@ fs_visitor::emit_cs_terminate()
     */
    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   bld.exec_all().MOV(payload, g0);
+   bld.group(8, 0).exec_all().MOV(payload, g0);
 
    /* Send a message to the thread spawner to terminate the thread. */
    fs_inst *inst = bld.exec_all()

From 8eea091747c9b12b21688b738145632b90d923cb Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Wed, 15 Jul 2015 12:00:47 -0700
Subject: [PATCH 0364/1208] nir: add nir_instr_is_first() and
 nir_instr_is_last() helpers

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Signed-off-by: Connor Abbott <connor.w.abbott@intel.com>
---
 src/glsl/nir/nir.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index e9a506c5971..0db1fc3076e 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -443,6 +443,18 @@ nir_instr_prev(nir_instr *instr)
       return exec_node_data(nir_instr, prev, node);
 }
 
+static inline bool
+nir_instr_is_first(nir_instr *instr)
+{
+   return exec_node_is_head_sentinel(exec_node_get_prev(&instr->node));
+}
+
+static inline bool
+nir_instr_is_last(nir_instr *instr)
+{
+   return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node));
+}
+
 typedef struct {
    /** for debugging only, can be NULL */
    const char* name;

From eaf799ddff9f2583d6dee5a0db36fa0a1162fde6 Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Wed, 15 Jul 2015 12:01:20 -0700
Subject: [PATCH 0365/1208] nir: add nir_foreach_instr_safe_reverse()

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Signed-off-by: Connor Abbott <connor.w.abbott@intel.com>
---
 src/glsl/nir/nir.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 0db1fc3076e..62cdbd4a5fb 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1233,6 +1233,8 @@ nir_block_last_instr(nir_block *block)
    foreach_list_typed_reverse(nir_instr, instr, node, &(block)->instr_list)
 #define nir_foreach_instr_safe(block, instr) \
    foreach_list_typed_safe(nir_instr, instr, node, &(block)->instr_list)
+#define nir_foreach_instr_safe_reverse(block, instr) \
+   foreach_list_typed_safe_reverse(nir_instr, instr, node, &(block)->instr_list)
 
 typedef struct nir_if {
    nir_cf_node cf_node;

From 9f344b908a95440d215f29c0b05b8ea8dba2839e Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Wed, 1 Jul 2015 09:58:47 -0700
Subject: [PATCH 0366/1208] i965/fs: fix regs_read() for LINTERP

The second source always stays within the same SIMD8 register.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Connor Abbott <connor.w.abbott@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2e3eb05b796..51ef32c1b6a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -685,7 +685,8 @@ fs_inst::regs_read(int arg) const
    case FS_OPCODE_LINTERP:
       if (arg == 0)
          return exec_size / 4;
-      break;
+      else
+         return 1;
 
    case FS_OPCODE_PIXEL_X:
    case FS_OPCODE_PIXEL_Y:

From c4a2217e79ac78c59cec3eb97542ceb819f92a44 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Mon, 2 Feb 2015 14:23:35 -0800
Subject: [PATCH 0367/1208] i965/fs: Mark last used ip for all regs read in the
 payload

If a source register in the push constant registers uses more than one
register, then we wouldn't update payload_last_use_ip for subsequent
registers.

Unlike most uniform data pushed into registers, the CS gl_LocalInvocationID
data varies per execution channel. Therefore for SIMD16 mode, we have vec16
data in the payload. In this case we then need to mark 2 registers in
payload_last_use_ip as last used by the instruction. There's a similar
situation for the z and w coordinates of gl_FragCoord for fragment shaders,
where it had only happened to work before because of some bogus interferences
which the next commit removes.

(Connor: added bit about gl_FragCoord to commit message)

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Connor Abbott <connor.w.abbott@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 8e5621d3cb5..620fc235c1b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -380,7 +380,10 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
             if (node_nr >= payload_node_count)
                continue;
 
-            payload_last_use_ip[node_nr] = use_ip;
+            for (int j = 0; j < inst->regs_read(i); j++) {
+               payload_last_use_ip[node_nr + j] = use_ip;
+               assert(node_nr + j < payload_node_count);
+            }
          }
       }
 

From 18e73bf7f8b12022e02db3230ee109657581900b Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Tue, 30 Jun 2015 13:38:20 -0700
Subject: [PATCH 0368/1208] i965/fs: remove special case in
 setup_payload_interference()

regs_read() will handle LINTERP for us since the previous commit. In
addition, we were being too conservative, since it will only read 2
registers on SIMD8.

instructions in affected programs:     9061 -> 8893 (-1.85%)
helped:                                10
HURT:                                  0
GAINED:                                0
LOST:                                  0

All of the changes were due to spills being eliminated, mostly in KSP
shaders.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Signed-off-by: Connor Abbott <connor.w.abbott@intel.com>
---
 .../drivers/dri/i965/brw_fs_reg_allocate.cpp  | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 620fc235c1b..1ee19e45a1b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -389,26 +389,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
 
       /* Special case instructions which have extra implied registers used. */
       switch (inst->opcode) {
-      case FS_OPCODE_LINTERP:
-         /* On gen6+ in SIMD16, there are 4 adjacent registers used by
-          * PLN's sourcing of the deltas, while we list only the first one
-          * in the arguments.  Pre-gen6, the deltas are computed in normal
-          * VGRFs.
-          */
-         if (devinfo->gen >= 6) {
-            int delta_x_arg = 0;
-            if (inst->src[delta_x_arg].file == HW_REG &&
-                inst->src[delta_x_arg].fixed_hw_reg.file ==
-                BRW_GENERAL_REGISTER_FILE) {
-               for (int i = 1; i < 4; ++i) {
-                  int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
-                  assert(node < payload_node_count);
-                  payload_last_use_ip[node] = use_ip;
-               }
-            }
-         }
-         break;
-
       case CS_OPCODE_CS_TERMINATE:
          payload_last_use_ip[0] = use_ip;
          break;

From bde4c8ec1fd69e312fe21e36c8ce07139916811a Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Tue, 30 Jun 2015 13:42:15 -0700
Subject: [PATCH 0369/1208] i965/fs: don't make unused payload registers
 interfere

Before, we were setting payload_last_use_ip for unused payload
registers to 0, which made them interfere with whatever the first
instruction wrote to due to the workaround for SIMD16 uniform arguments.
Just use -1 to mean "unused" instead, and then skip setting any
interferences for unused payload registers.

instructions in affected programs:     0 -> 0
helped:                                0
HURT:                                  0
GAINED:                                1
LOST:                                  0

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Signed-off-by: Connor Abbott <connor.w.abbott@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 1ee19e45a1b..f25f2ecce75 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -341,7 +341,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
    int loop_end_ip = 0;
 
    int payload_last_use_ip[payload_node_count];
-   memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
+   for (int i = 0; i < payload_node_count; i++)
+      payload_last_use_ip[i] = -1;
+
    int ip = 0;
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       switch (inst->opcode) {
@@ -411,6 +413,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
    }
 
    for (int i = 0; i < payload_node_count; i++) {
+      if (payload_last_use_ip[i] == -1)
+         continue;
+
       /* Mark the payload node as interfering with any virtual grf that is
        * live between the start of the program and our last use of the payload
        * node.

From be1f49bda90425b7fd009ac177b307e61da0f994 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 16:25:26 -0700
Subject: [PATCH 0370/1208] mesa: Detect and provide macros for function
 attributes pure and const.

These are really useful hints to the compiler in the absence of link-time
optimization, and I'm going to use them in VC4.

I've made the const attribute be ATTRIBUTE_CONST unlike other function
attributes, because we have other things in the tree #defining CONST for
their own unrelated purposes.

v2: Alphabetize.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1)
---
 configure.ac      |  2 ++
 src/util/macros.h | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/configure.ac b/configure.ac
index bdfd13417e0..bb6d4f618ff 100644
--- a/configure.ac
+++ b/configure.ac
@@ -205,10 +205,12 @@ AX_GCC_BUILTIN([__builtin_popcount])
 AX_GCC_BUILTIN([__builtin_popcountll])
 AX_GCC_BUILTIN([__builtin_unreachable])
 
+AX_GCC_FUNC_ATTRIBUTE([const])
 AX_GCC_FUNC_ATTRIBUTE([flatten])
 AX_GCC_FUNC_ATTRIBUTE([format])
 AX_GCC_FUNC_ATTRIBUTE([malloc])
 AX_GCC_FUNC_ATTRIBUTE([packed])
+AX_GCC_FUNC_ATTRIBUTE([pure])
 AX_GCC_FUNC_ATTRIBUTE([unused])
 AX_GCC_FUNC_ATTRIBUTE([warn_unused_result])
 
diff --git a/src/util/macros.h b/src/util/macros.h
index 66698e72701..5c5c92ec610 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -103,6 +103,17 @@ do {                       \
 #define assume(expr) assert(expr)
 #endif
 
+/* Attribute const is used for functions that have no effects other than their
+ * return value, and only rely on the argument values to compute the return
+ * value.  As a result, calls to it can be CSEed.  Note that using memory
+ * pointed to by the arguments is not allowed for const functions.
+ */
+#ifdef HAVE_FUNC_ATTRIBUTE_CONST
+#define ATTRIBUTE_CONST __attribute__((__const__))
+#else
+#define ATTRIBUTE_CONST
+#endif
+
 #ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
 #define FLATTEN __attribute__((__flatten__))
 #else
@@ -130,6 +141,15 @@ do {                       \
 #define PACKED
 #endif
 
+/* Attribute pure is used for functions that have no effects other than their
+ * return value.  As a result, calls to it can be dead code eliminated.
+ */
+#ifdef HAVE_FUNC_ATTRIBUTE_PURE
+#define PURE __attribute__((__pure__))
+#else
+#define PURE
+#endif
+
 #ifdef __cplusplus
 /**
  * Macro function that evaluates to true if T is a trivially

From 90dfabc3b5ce5b485a1bbcd7e815a72588f7153d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Jul 2015 16:30:27 -0700
Subject: [PATCH 0371/1208] vc4: Use the pure/const attributes on a bunch of
 our QPU functions.

On a release build, this makes the rest of vc4_qpu_validate.c go away
(the compiler didn't know that our qpu helper function calls had no
side effects).
---
 src/gallium/drivers/vc4/vc4_qpu.h    | 28 ++++++++++++++--------------
 src/gallium/drivers/vc4/vc4_tiling.h |  6 +++---
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index c9ab6344589..fbb90ba12a0 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); }
 static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); }
 static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); }
 
-uint64_t qpu_NOP(void);
-uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src);
-uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src);
+uint64_t qpu_NOP(void) ATTRIBUTE_CONST;
+uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST;
+uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST;
 uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
-                    struct qpu_reg src0, struct qpu_reg src1);
+                    struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST;
 uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
-                    struct qpu_reg src0, struct qpu_reg src1);
-uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
-uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
-uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
-uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
-uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond);
-uint32_t qpu_encode_small_immediate(uint32_t i);
+                    struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST;
+uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST;
+uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST;
+uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST;
+uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
+uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
+uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST;
 
-bool qpu_waddr_is_tlb(uint32_t waddr);
-bool qpu_inst_is_tlb(uint64_t inst);
-int qpu_num_sf_accesses(uint64_t inst);
+bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST;
+bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
+int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST;
 void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst);
 
 static inline uint64_t
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
index b5d10da3417..b90bba70200 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -24,9 +24,9 @@
 #ifndef VC4_TILING_H
 #define VC4_TILING_H
 
-uint32_t vc4_utile_width(int cpp);
-uint32_t vc4_utile_height(int cpp);
-bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp);
+uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST;
+uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST;
+bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
 void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
 void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
 void vc4_load_tiled_image(void *dst, uint32_t dst_stride,

From be7adc2ecad0d04037cb0c99754703dde86ee73a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 16 Jul 2015 14:30:28 -0700
Subject: [PATCH 0372/1208] vc4: Also consider uniform 0 in uniform lowering.

The hash table considers key 0 to be the empty key.
---
 src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 910c89dca79..f087c3b81b5 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -52,7 +52,7 @@ static void
 add_uniform(struct hash_table *ht, struct qreg reg)
 {
         struct hash_entry *entry;
-        void *key = (void *)(uintptr_t)reg.index;
+        void *key = (void *)(uintptr_t)(reg.index + 1);
 
         entry = _mesa_hash_table_search(ht, key);
         if (entry) {
@@ -66,7 +66,7 @@ static void
 remove_uniform(struct hash_table *ht, struct qreg reg)
 {
         struct hash_entry *entry;
-        void *key = (void *)(uintptr_t)reg.index;
+        void *key = (void *)(uintptr_t)(reg.index + 1);
 
         entry = _mesa_hash_table_search(ht, key);
         assert(entry);
@@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c)
                 struct hash_entry *entry;
                 hash_table_foreach(ht, entry) {
                         uint32_t count = (uintptr_t)entry->data;
-                        uint32_t index = (uintptr_t)entry->key;
+                        uint32_t index = (uintptr_t)entry->key - 1;
                         if (count > max_count) {
                                 max_count = count;
                                 max_index = index;

From 5341349dde6f5c70af188e48ef0082e6e7d5361f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 17 Jul 2015 11:22:40 -0700
Subject: [PATCH 0373/1208] vc4: Add debugging on texture relocation validation
 failures.

---
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index a0b67a7e50b..1d457b80d52 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -619,11 +619,11 @@ reloc_tex(struct vc4_exec_info *exec,
 		uint32_t remaining_size = tex->base.size - p0;
 		if (p0 > tex->base.size - 4) {
 			DRM_ERROR("UBO offset greater than UBO size\n");
-			return false;
+			goto fail;
 		}
 		if (p1 > remaining_size - 4) {
 			DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
-			return false;
+			goto fail;
 		}
 		*validated_p0 = tex->paddr + p0;
 		return true;
@@ -642,14 +642,14 @@ reloc_tex(struct vc4_exec_info *exec,
 		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
 			if (cube_map_stride) {
 				DRM_ERROR("Cube map stride set twice\n");
-				return false;
+				goto fail;
 			}
 
 			cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
 		}
 		if (!cube_map_stride) {
 			DRM_ERROR("Cube map stride not set\n");
-			return false;
+			goto fail;
 		}
 	}
 
@@ -683,7 +683,7 @@ reloc_tex(struct vc4_exec_info *exec,
 	case VC4_TEXTURE_TYPE_YUV422R:
 	default:
 		DRM_ERROR("Texture format %d unsupported\n", type);
-		return false;
+		goto fail;
 	}
 	utile_w = utile_width(cpp);
 	utile_h = utile_height(cpp);
@@ -699,7 +699,7 @@ reloc_tex(struct vc4_exec_info *exec,
 
 	if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
 				tiling_format, width, height, cpp)) {
-		return false;
+		goto fail;
 	}
 
 	/* The mipmap levels are stored before the base of the texture.  Make
@@ -740,7 +740,7 @@ reloc_tex(struct vc4_exec_info *exec,
 				  i, level_width, level_height,
 				  aligned_width, aligned_height,
 				  level_size, offset);
-			return false;
+			goto fail;
 		}
 
 		offset -= level_size;
@@ -749,6 +749,12 @@ reloc_tex(struct vc4_exec_info *exec,
 	*validated_p0 = tex->paddr + p0;
 
 	return true;
+ fail:
+	DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
+	DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1);
+	DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2);
+	DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3);
+	return false;
 }
 
 static int

From 27aa31fab40783356207ba5dabd839b430496e7b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 17 Jul 2015 11:52:09 -0700
Subject: [PATCH 0374/1208] vc4: Fix printing of shader-db debug when shader-db
 isn't turned on.

---
 src/gallium/drivers/vc4/vc4_program.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ce99989a79d..75c1be4f421 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2306,10 +2306,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
                 }
         }
         if (shader->ubo_size) {
-                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
-                        qir_get_stage_name(c->stage),
-                        c->program_id, c->variant_id,
-                        shader->ubo_size / 4);
+                if (vc4_debug & VC4_DEBUG_SHADERDB) {
+                        fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
+                                qir_get_stage_name(c->stage),
+                                c->program_id, c->variant_id,
+                                shader->ubo_size / 4);
+                }
         }
 
         qir_compile_destroy(c);

From e42cfe5d032e97e0444df39421a9f93f84452d68 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Fri, 17 Jul 2015 18:01:01 +0200
Subject: [PATCH 0375/1208] mesa: fix up some texture error checks

In particular, we were incorrectly accepting s3tc (and lots of others)
for CompressedTexSubImage3D (but not CompressedTexImage3D) calls with 3d
targets. At this time, the only allowed formats for these calls are the
bptc ones, since none of the specific extensions allow it (astc hdr would).
Also, fix up a bug in _mesa_target_can_be_compressed - 3d target needs to
be allowed for bptc formats.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/teximage.c   | 41 +++++++++++++++++++-------------------
 src/mesa/main/texstorage.c |  1 +
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 3d85615fa45..a70b9da7e91 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1793,8 +1793,6 @@ GLboolean
 _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
                                GLenum intFormat)
 {
-   (void) intFormat;  /* not used yet */
-
    switch (target) {
    case GL_TEXTURE_2D:
    case GL_PROXY_TEXTURE_2D:
@@ -1814,6 +1812,16 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
    case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_CUBE_MAP_ARRAY:
       return ctx->Extensions.ARB_texture_cube_map_array;
+   case GL_TEXTURE_3D:
+      switch (intFormat) {
+      case GL_COMPRESSED_RGBA_BPTC_UNORM:
+      case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+      case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+      case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+         return ctx->Extensions.ARB_texture_compression_bptc;
+      default:
+         return GL_FALSE;
+      }
    default:
       return GL_FALSE;
    }
@@ -4575,32 +4583,23 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
        *    one of the EAC, ETC2, or RGTC formats and either border is
        *    non-zero, or the effective target for the texture is not
        *    TEXTURE_2D_ARRAY."
+       * Instead of listing all these, just list those which are allowed,
+       * which is (at this time) only bptc. Otherwise we'd say s3tc (and more)
+       * are valid here, which they are not, but of course not mentioned by
+       * core spec.
        */
       if (target != GL_TEXTURE_2D_ARRAY) {
          bool invalidformat;
          switch (format) {
             /* These came from _mesa_is_compressed_format in glformats.c. */
-            /* EAC formats */
-            case GL_COMPRESSED_RGBA8_ETC2_EAC:
-            case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-            case GL_COMPRESSED_R11_EAC:
-            case GL_COMPRESSED_RG11_EAC:
-            case GL_COMPRESSED_SIGNED_R11_EAC:
-            case GL_COMPRESSED_SIGNED_RG11_EAC:
-            /* ETC2 formats */
-            case GL_COMPRESSED_RGB8_ETC2:
-            case GL_COMPRESSED_SRGB8_ETC2:
-            case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-            case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-            /* RGTC formats */
-            case GL_COMPRESSED_RED_RGTC1:
-            case GL_COMPRESSED_SIGNED_RED_RGTC1:
-            case GL_COMPRESSED_RG_RGTC2:
-            case GL_COMPRESSED_SIGNED_RG_RGTC2:
-               invalidformat = true;
+            case GL_COMPRESSED_RGBA_BPTC_UNORM:
+            case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+            case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+            case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+               invalidformat = false;
                break;
             default:
-               invalidformat = false;
+               invalidformat = true;
          }
          if (invalidformat) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 53cb2c091f8..aa8fa3e6343 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -309,6 +309,7 @@ tex_storage_error_check(struct gl_context *ctx,
                   GL_INVALID_ENUM : GL_INVALID_OPERATION,
                   "glTex%sStorage%dD(internalformat = %s)", suffix, dims,
                   _mesa_lookup_enum_by_nr(internalformat));
+      return GL_TRUE;
    }
 
    /* levels check */

From ff7896a398f55baefd00e695c8f45f2ffa57bceb Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 17 Jul 2015 10:01:48 -0700
Subject: [PATCH 0376/1208] vc4: Switch to using a separate ioctl for making
 shaders.

This gives the kernel a chance to validate and lock down the data,
without having to deal with mmap zapping.

With this, GLBenchmark stops on a texture relocations, because we'd
recycled a shader BO as another shader and failed to revalidate, since we
weren't clearing the cached validation state on mmap faults.
---
 src/gallium/drivers/vc4/vc4_bufmgr.c  | 56 +++++++++++++++++++++++----
 src/gallium/drivers/vc4/vc4_bufmgr.h  |  4 +-
 src/gallium/drivers/vc4/vc4_drm.h     | 25 ++++++++++++
 src/gallium/drivers/vc4/vc4_program.c |  5 +--
 4 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 499b5ce146e..f7b41f5816d 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Broadcom
+ * Copyright © 2014-2015 Broadcom
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
 }
 
 struct vc4_bo *
-vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size,
-                 const char *name)
+vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
 {
-        void *map;
         struct vc4_bo *bo;
+        int ret;
+
+        bo = CALLOC_STRUCT(vc4_bo);
+        if (!bo)
+                return NULL;
+
+        pipe_reference_init(&bo->reference, 1);
+        bo->screen = screen;
+        bo->size = align(size, 4096);
+        bo->name = "code";
+        bo->private = false; /* Make sure it doesn't go back to the cache. */
+
+        if (!using_vc4_simulator) {
+                struct drm_vc4_create_shader_bo create = {
+                        .size = size,
+                        .data = (uintptr_t)data,
+                };
+
+                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO,
+                               &create);
+                bo->handle = create.handle;
+        } else {
+                struct drm_mode_create_dumb create;
+                memset(&create, 0, sizeof(create));
+
+                create.width = 128;
+                create.bpp = 8;
+                create.height = (size + 127) / 128;
+
+                ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+                bo->handle = create.handle;
+                assert(create.size >= size);
+
+                vc4_bo_map(bo);
+                memcpy(bo->map, data, size);
+        }
+        if (ret != 0) {
+                fprintf(stderr, "create shader ioctl failure\n");
+                abort();
+        }
+
+        screen->bo_count++;
+        screen->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Allocated shader %dkb:\n", size / 1024);
+                vc4_bo_dump_stats(screen);
+        }
 
-        bo = vc4_bo_alloc(screen, size, name);
-        map = vc4_bo_map(bo);
-        memcpy(map, data, size);
         return bo;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index eb8409afb68..b77506e242a 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -58,8 +58,8 @@ struct vc4_bo {
 
 struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size,
                             const char *name);
-struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data,
-                                uint32_t size, const char *name);
+struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data,
+                                   uint32_t size);
 void vc4_bo_last_unreference(struct vc4_bo *bo);
 void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time);
 struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index 5f1ee4fa125..863ef8da8fb 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -31,12 +31,14 @@
 #define DRM_VC4_WAIT_BO                           0x02
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
+#define DRM_VC4_CREATE_SHADER_BO                  0x05
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
 #define DRM_IOCTL_VC4_WAIT_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo)
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+#define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
 struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -182,6 +184,29 @@ struct drm_vc4_create_bo {
 	uint32_t pad;
 };
 
+/**
+ * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4
+ * shader BOs.
+ *
+ * Since allowing a shader to be overwritten while it's also being
+ * executed from would allow privlege escalation, shaders must be
+ * created using this ioctl, and they can't be mmapped later.
+ */
+struct drm_vc4_create_shader_bo {
+	/* Size of the data argument. */
+	uint32_t size;
+	/* Flags, currently must be 0. */
+	uint32_t flags;
+
+	/* Pointer to the data. */
+	uint64_t data;
+
+	/** Returned GEM handle for the BO. */
+	uint32_t handle;
+	/* Pad, must be 0. */
+	uint32_t pad;
+};
+
 /**
  * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs.
  *
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 75c1be4f421..561da1074ce 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2277,9 +2277,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         }
 
         copy_uniform_state_to_shader(shader, c);
-        shader->bo = vc4_bo_alloc_mem(vc4->screen, c->qpu_insts,
-                                      c->qpu_inst_count * sizeof(uint64_t),
-                                      "code");
+        shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
+                                         c->qpu_inst_count * sizeof(uint64_t));
 
         /* Copy the compiler UBO range state to the compiled shader, dropping
          * out arrays that were never referenced by an indirect load.

From ccf9598ad7681f5c9c87e9ca8bf856fcb5198b45 Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Mon, 18 May 2015 14:32:17 +0300
Subject: [PATCH 0377/1208] i965: Define HW-binding table and resource streamer
 control opcodes

v2: Use macros for HW binding table edits (Topi)
v3: Add Broadwell support.
v4: Make hardware binding table bit definitions even more clearer (Ken)

Cc: kenneth@whitecape.org
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h | 30 +++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/intel_reg.h   |  3 +++
 2 files changed, 33 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 5bf53e375e8..e6fdc3dd125 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1640,6 +1640,36 @@ enum brw_message_target {
 #define _3DSTATE_BINDING_TABLE_POINTERS_GS	0x7829 /* GEN7+ */
 #define _3DSTATE_BINDING_TABLE_POINTERS_PS	0x782A /* GEN7+ */
 
+#define _3DSTATE_BINDING_TABLE_POOL_ALLOC       0x7919 /* GEN7.5+ */
+#define BRW_HW_BINDING_TABLE_ENABLE             (1 << 11)
+#define GEN7_HW_BT_POOL_MOCS_SHIFT              7
+#define GEN7_HW_BT_POOL_MOCS_MASK               INTEL_MASK(10, 7)
+#define GEN8_HW_BT_POOL_MOCS_SHIFT              0
+#define GEN8_HW_BT_POOL_MOCS_MASK               INTEL_MASK(6, 0)
+/* Only required in HSW */
+#define HSW_BT_POOL_ALLOC_MUST_BE_ONE           (3 << 5)
+
+#define _3DSTATE_BINDING_TABLE_EDIT_VS          0x7843 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_GS          0x7844 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_HS          0x7845 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_DS          0x7846 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_PS          0x7847 /* GEN7.5 */
+#define BRW_BINDING_TABLE_INDEX_SHIFT           16
+#define BRW_BINDING_TABLE_INDEX_MASK            INTEL_MASK(23, 16)
+
+#define BRW_BINDING_TABLE_EDIT_TARGET_ALL       3
+#define BRW_BINDING_TABLE_EDIT_TARGET_CORE1     2
+#define BRW_BINDING_TABLE_EDIT_TARGET_CORE0     1
+/* In HSW, when editing binding table entries to surface state offsets,
+ * the surface state offset is a 16-bit value aligned to 32 bytes. But
+ * Surface State Pointer in dword 2 is [15:0]. Right shift surf_offset
+ * by 5 bits so it won't disturb bit 16 (which is used as the binding
+ * table index entry), otherwise it would hang the GPU.
+ */
+#define HSW_SURFACE_STATE_EDIT(value)           (value >> 5)
+/* Same as Haswell, but surface state offsets now aligned to 64 bytes.*/
+#define GEN8_SURFACE_STATE_EDIT(value)          (value >> 6)
+
 #define _3DSTATE_SAMPLER_STATE_POINTERS		0x7802 /* GEN6+ */
 # define PS_SAMPLER_STATE_CHANGE				(1 << 12)
 # define GS_SAMPLER_STATE_CHANGE				(1 << 9)
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 4223e11e78c..b4283da9633 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -47,6 +47,9 @@
 /* Load a value from memory into a register.  Only available on Gen7+. */
 #define GEN7_MI_LOAD_REGISTER_MEM	(CMD_MI | (0x29 << 23))
 # define MI_LOAD_REGISTER_MEM_USE_GGTT		(1 << 22)
+/* Haswell RS control */
+#define MI_RS_CONTROL                   (CMD_MI | (0x6 << 23))
+#define MI_RS_STORE_DATA_IMM            (CMD_MI | (0x2b << 23))
 
 /* Manipulate the predicate bit based on some register values. Only on Gen7+ */
 #define GEN7_MI_PREDICATE		(CMD_MI | (0xC << 23))

From 090529af1828817344e0850ef27eebd1f096eb5f Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Tue, 2 Jul 2013 11:48:22 -0400
Subject: [PATCH 0378/1208] i965: Enable resource streamer for the batchbuffer

Check first if the hardware and kernel supports resource streamer. If this
is allowed, tell the kernel to enable the resource streamer enable bit on
MI_BATCHBUFFER_START by specifying I915_EXEC_RESOURCE_STREAMER
execbuffer flags.

v2: - Use new I915_PARAM_HAS_RESOURCE_STREAMER ioctl to check if kernel
      supports RS (Ken).
    - Add brw_device_info::has_resource_streamer and toggle it for
      Haswell, Broadwell, Cherryview, Skylake, and Broxton (Ken).
v3: - Update I915_PARAM_HAS_RESOURCE_STREAMER to match updated kernel.
v4: - Always inspect the getparam.value (Chris Wilson).
v5: - Fold redundant devinfo->has_resource_streamer check in context create
      into init screen.

Cc: kenneth@whitecape.org
Cc: chris@chris-wilson.co.uk
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.c       |  4 ++++
 src/mesa/drivers/dri/i965/brw_context.h       |  1 +
 src/mesa/drivers/dri/i965/brw_device_info.c   |  5 ++++-
 src/mesa/drivers/dri/i965/brw_device_info.h   |  1 +
 src/mesa/drivers/dri/i965/intel_batchbuffer.c |  8 +++++++-
 src/mesa/drivers/dri/i965/intel_screen.c      | 14 ++++++++++++++
 src/mesa/drivers/dri/i965/intel_screen.h      |  5 +++++
 7 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 8150b943bc8..05cb53b3711 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -871,6 +871,10 @@ brwCreateContext(gl_api api,
 
    brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
 
+   brw->use_resource_streamer = screen->has_resource_streamer &&
+      (brw_env_var_as_boolean("INTEL_USE_HW_BT", false) ||
+       brw_env_var_as_boolean("INTEL_USE_GATHER", false));
+
    ctx->VertexProgram._MaintainTnlProgram = true;
    ctx->FragmentProgram._MaintainTexEnvProgram = true;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 34a49b2abdc..a9f1f61b268 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1134,6 +1134,7 @@ struct brw_context
    bool has_pln;
    bool no_simd8;
    bool use_rep_send;
+   bool use_resource_streamer;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 342e56622b7..51a91b6c0d9 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -170,7 +170,8 @@ static const struct brw_device_info brw_device_info_byt = {
 #define HSW_FEATURES             \
    GEN7_FEATURES,                \
    .is_haswell = true,           \
-   .supports_simd16_3src = true
+   .supports_simd16_3src = true, \
+   .has_resource_streamer = true
 
 static const struct brw_device_info brw_device_info_hsw_gt1 = {
    HSW_FEATURES, .gt = 1,
@@ -229,6 +230,7 @@ static const struct brw_device_info brw_device_info_hsw_gt3 = {
 #define GEN8_FEATURES                               \
    .gen = 8,                                        \
    .has_hiz_and_separate_stencil = true,            \
+   .has_resource_streamer = true,                   \
    .must_use_separate_stencil = true,               \
    .has_llc = true,                                 \
    .has_pln = true,                                 \
@@ -301,6 +303,7 @@ static const struct brw_device_info brw_device_info_chv = {
 #define GEN9_FEATURES                               \
    .gen = 9,                                        \
    .has_hiz_and_separate_stencil = true,            \
+   .has_resource_streamer = true,                   \
    .must_use_separate_stencil = true,               \
    .has_llc = true,                                 \
    .has_pln = true,                                 \
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index 7b7a1fc046a..2a73e937d9f 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -46,6 +46,7 @@ struct brw_device_info
    bool has_compr4;
    bool has_surface_tile_offset;
    bool supports_simd16_3src;
+   bool has_resource_streamer;
 
    /**
     * Quirks:
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 088ffd276b4..d40e67133e2 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -279,6 +279,11 @@ throttle(struct brw_context *brw)
    }
 }
 
+/* Drop when RS headers get pulled to libdrm */
+#ifndef I915_EXEC_RESOURCE_STREAMER
+#define I915_EXEC_RESOURCE_STREAMER (1<<15)
+#endif
+
 /* TODO: Push this whole function into bufmgr.
  */
 static int
@@ -305,7 +310,8 @@ do_flush_locked(struct brw_context *brw)
       if (brw->gen >= 6 && batch->ring == BLT_RING) {
          flags = I915_EXEC_BLT;
       } else {
-         flags = I915_EXEC_RENDER;
+         flags = I915_EXEC_RENDER |
+            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
       }
       if (batch->needs_sol_reset)
 	 flags |= I915_EXEC_GEN7_SOL_RESET;
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index fd343eeb4e1..1470b059d9b 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1348,6 +1348,11 @@ brw_get_revision(int fd)
    return revision;
 }
 
+/* Drop when RS headers get pulled to libdrm */
+#ifndef I915_PARAM_HAS_RESOURCE_STREAMER
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
+#endif
+
 /**
  * This is the driver specific part of the createNewScreen entry point.
  * Called when using DRI2.
@@ -1440,6 +1445,15 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    intelScreen->compiler = brw_compiler_create(intelScreen,
                                                intelScreen->devinfo);
 
+   if (intelScreen->devinfo->has_resource_streamer) {
+      int val = -1;
+      getparam.param = I915_PARAM_HAS_RESOURCE_STREAMER;
+      getparam.value = &val;
+
+      drmIoctl(psp->fd, DRM_IOCTL_I915_GETPARAM, &getparam);
+      intelScreen->has_resource_streamer = val > 0;
+   }
+
    return (const __DRIconfig**) intel_screen_make_configs(psp);
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 941e0fcc752..a741c94e0ff 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -54,6 +54,11 @@ struct intel_screen
 
    bool hw_has_timestamp;
 
+   /**
+    * Does the kernel support resource streamer?
+    */
+   bool has_resource_streamer;
+
    /**
     * Does the kernel support context reset notifications?
     */

From 190756482e62cb57e2bc8c798181e5f0171726fb Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Wed, 15 Apr 2015 13:04:45 +0300
Subject: [PATCH 0379/1208] i965: Enable hardware-generated binding tables on
 render path.

This patch implements the binding table enable command which is also
used to allocate a binding table pool where where hardware-generated
binding table entries are flushed into. Each binding table offset in
the binding table pool is unique per each shader stage that are
enabled within a batch.

Also insert the required brw_tracked_state objects to enable
hw-generated binding tables in normal render path.

v2: - Use MOCS in binding table pool alloc for GEN8
    - Fix spurious offset when allocating binding table pool entry
      and start from zero instead.
v3: - Include GEN8 fix for spurious offset above.
v4: - Fixup wrong packet length in enable/disable hw-binding table
      for GEN8 (Ville).
    - Don't invoke HW-binding table disable command when we dont
      have resource streamer (Chris).
v5: - Reorder the state cache invalidate flush so it happens in-between
      enabling hw-generated binding tables and the previous sw-binding
      table GPU state (Chris).
v6: - Do the same fix in v5 for gen7_disable_hw_binding_tables().
    - Adhere to coding guidelines and make comments more informative.

Cc: kenneth@whitecape.org
Cc: syrjala@sci.fi
Cc: chris@chris-wilson.co.uk
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 .../drivers/dri/i965/brw_binding_tables.c     | 100 ++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_context.c       |   4 +
 src/mesa/drivers/dri/i965/brw_context.h       |   6 ++
 src/mesa/drivers/dri/i965/brw_state.h         |   6 ++
 src/mesa/drivers/dri/i965/brw_state_upload.c  |   4 +
 src/mesa/drivers/dri/i965/gen7_disable.c      |   4 +-
 src/mesa/drivers/dri/i965/gen8_disable.c      |   4 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c |   4 +
 8 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 98ff0ddcd58..6769f0cd1ab 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -170,6 +170,106 @@ const struct brw_tracked_state brw_gs_binding_table = {
    .emit = brw_gs_upload_binding_table,
 };
 
+/**
+ * Disable hardware binding table support, falling back to the
+ * older software-generated binding table mechanism.
+ */
+void
+gen7_disable_hw_binding_tables(struct brw_context *brw)
+{
+   if (!brw->use_resource_streamer)
+      return;
+   /* From the Haswell PRM, Volume 7: 3D Media GPGPU,
+    * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+    *
+    * "When switching between HW and SW binding table generation, SW must
+    * issue a state cache invalidate."
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+   int pkt_len = brw->gen >= 8 ? 4 : 3;
+
+   BEGIN_BATCH(pkt_len);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2));
+   if (brw->gen >= 8) {
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   } else {
+      OUT_BATCH(HSW_BT_POOL_ALLOC_MUST_BE_ONE);
+      OUT_BATCH(0);
+   }
+   ADVANCE_BATCH();
+}
+
+/**
+ * Enable hardware binding tables and set up the binding table pool.
+ */
+void
+gen7_enable_hw_binding_tables(struct brw_context *brw)
+{
+   if (!brw->use_resource_streamer)
+      return;
+
+   if (!brw->hw_bt_pool.bo) {
+      /* We use a single re-usable buffer object for the lifetime of the
+       * context and size it to maximum allowed binding tables that can be
+       * programmed per batch:
+       *
+       * From the Haswell PRM, Volume 7: 3D Media GPGPU,
+       * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+       * "A maximum of 16,383 Binding tables are allowed in any batch buffer"
+       */
+      static const int max_size = 16383 * 4;
+      brw->hw_bt_pool.bo = drm_intel_bo_alloc(brw->bufmgr, "hw_bt",
+                                              max_size, 64);
+      brw->hw_bt_pool.next_offset = 0;
+   }
+
+   /* From the Haswell PRM, Volume 7: 3D Media GPGPU,
+    * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+    *
+    * "When switching between HW and SW binding table generation, SW must
+    * issue a state cache invalidate."
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+   int pkt_len = brw->gen >= 8 ? 4 : 3;
+   uint32_t dw1 = BRW_HW_BINDING_TABLE_ENABLE;
+   if (brw->is_haswell) {
+      dw1 |= SET_FIELD(GEN7_MOCS_L3, GEN7_HW_BT_POOL_MOCS) |
+             HSW_BT_POOL_ALLOC_MUST_BE_ONE;
+   } else if (brw->gen >= 8) {
+      dw1 |= BDW_MOCS_WB;
+   }
+
+   BEGIN_BATCH(pkt_len);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2));
+   if (brw->gen >= 8) {
+      OUT_RELOC64(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
+      OUT_BATCH(brw->hw_bt_pool.bo->size);
+   } else {
+      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
+      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0,
+             brw->hw_bt_pool.bo->size);
+   }
+   ADVANCE_BATCH();
+}
+
+void
+gen7_reset_hw_bt_pool_offsets(struct brw_context *brw)
+{
+   brw->hw_bt_pool.next_offset = 0;
+}
+
+const struct brw_tracked_state gen7_hw_binding_tables = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH,
+   },
+   .emit = gen7_enable_hw_binding_tables
+};
+
 /** @} */
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 05cb53b3711..efcd91aad84 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -941,6 +941,10 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    if (brw->wm.base.scratch_bo)
       drm_intel_bo_unreference(brw->wm.base.scratch_bo);
 
+   gen7_reset_hw_bt_pool_offsets(brw);
+   drm_intel_bo_unreference(brw->hw_bt_pool.bo);
+   brw->hw_bt_pool.bo = NULL;
+
    drm_intel_gem_context_destroy(brw->hw_ctx);
 
    if (ctx->swrast_context) {
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a9f1f61b268..8bbeb34075c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1398,6 +1398,12 @@ struct brw_context
       struct brw_cs_prog_data *prog_data;
    } cs;
 
+   /* RS hardware binding table */
+   struct {
+      drm_intel_bo *bo;
+      uint32_t next_offset;
+   } hw_bt_pool;
+
    struct {
       uint32_t state_offset;
       uint32_t blend_state_offset;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 987672f8815..f8ef98f2db9 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -132,6 +132,7 @@ extern const struct brw_tracked_state gen7_sol_state;
 extern const struct brw_tracked_state gen7_urb;
 extern const struct brw_tracked_state gen7_vs_state;
 extern const struct brw_tracked_state gen7_wm_state;
+extern const struct brw_tracked_state gen7_hw_binding_tables;
 extern const struct brw_tracked_state haswell_cut_index;
 extern const struct brw_tracked_state gen8_blend_state;
 extern const struct brw_tracked_state gen8_disable_stages;
@@ -372,6 +373,11 @@ gen7_upload_constant_state(struct brw_context *brw,
                            const struct brw_stage_state *stage_state,
                            bool active, unsigned opcode);
 
+void gen7_rs_control(struct brw_context *brw, int enable);
+void gen7_enable_hw_binding_tables(struct brw_context *brw);
+void gen7_disable_hw_binding_tables(struct brw_context *brw);
+void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 7662c3b580c..6096b4946a0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -192,6 +192,8 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
    &gen6_color_calc_state,	/* must do before cc unit */
    &gen6_depth_stencil_state,	/* must do before cc unit */
 
+   &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Haswell */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
@@ -268,6 +270,8 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
    &gen8_blend_state,
    &gen6_color_calc_state,
 
+   &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Broadwell */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
index 2c43cd77f07..bb509696d72 100644
--- a/src/mesa/drivers/dri/i965/gen7_disable.c
+++ b/src/mesa/drivers/dri/i965/gen7_disable.c
@@ -52,7 +52,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    /* Disable the TE */
@@ -85,7 +85,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen8_disable.c b/src/mesa/drivers/dri/i965/gen8_disable.c
index da0d4a5fe7a..32508e377c9 100644
--- a/src/mesa/drivers/dri/i965/gen8_disable.c
+++ b/src/mesa/drivers/dri/i965/gen8_disable.c
@@ -66,7 +66,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    /* Disable the TE */
@@ -101,7 +101,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index d40e67133e2..85f20a05729 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -33,6 +33,7 @@
 #include "intel_fbo.h"
 #include "brw_context.h"
 #include "brw_defines.h"
+#include "brw_state.h"
 
 #include <xf86drm.h>
 #include <i915_drm.h>
@@ -391,6 +392,9 @@ _intel_batchbuffer_flush(struct brw_context *brw,
       drm_intel_bo_wait_rendering(brw->batch.bo);
    }
 
+   if (brw->use_resource_streamer)
+      gen7_reset_hw_bt_pool_offsets(brw);
+
    /* Start a new batch buffer. */
    brw_new_batch(brw);
 

From 2133980bc7dff52bdeb142301184e464d113ce7c Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Fri, 17 Jul 2015 12:20:18 +0300
Subject: [PATCH 0380/1208] i965: Implement interface to edit binding table
 entries

Unlike normal software binding tables where the driver has to manually
generate and fill a binding table array which are then uploaded to the
hardware, the resource streamer instead presents the driver with an option
to fill out slots for individual binding table indices. The hardware
accumulates the state for these combined edits which it then automatically
flushes to a binding table pool when the binding table pointer state
command is invoked.

v2: Clarify binding table edit bit aligment (Topi).
v3: Make comments and function names more clearer (Ken).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 .../drivers/dri/i965/brw_binding_tables.c     | 55 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_state.h         |  9 +++
 2 files changed, 64 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 6769f0cd1ab..41590ec922e 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -44,6 +44,12 @@
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 
+static const GLuint stage_to_bt_edit[MESA_SHADER_FRAGMENT + 1] = {
+   _3DSTATE_BINDING_TABLE_EDIT_VS,
+   _3DSTATE_BINDING_TABLE_EDIT_GS,
+   _3DSTATE_BINDING_TABLE_EDIT_PS,
+};
+
 /**
  * Upload a shader stage's binding table as indirect state.
  *
@@ -170,6 +176,55 @@ const struct brw_tracked_state brw_gs_binding_table = {
    .emit = brw_gs_upload_binding_table,
 };
 
+/**
+ * Edit a single entry in a hardware-generated binding table
+ */
+void
+gen7_edit_hw_binding_table_entry(struct brw_context *brw,
+                                 gl_shader_stage stage,
+                                 uint32_t index,
+                                 uint32_t surf_offset)
+{
+   assert(stage <= MESA_SHADER_FRAGMENT);
+
+   uint32_t dw2 = SET_FIELD(index, BRW_BINDING_TABLE_INDEX) |
+      (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(surf_offset) :
+       HSW_SURFACE_STATE_EDIT(surf_offset));
+
+   BEGIN_BATCH(3);
+   OUT_BATCH(stage_to_bt_edit[stage] << 16 | (3 - 2));
+   OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL);
+   OUT_BATCH(dw2);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Upload a whole hardware binding table for the given stage.
+ *
+ * Takes an array of surface offsets and the number of binding table
+ * entries.
+ */
+void
+gen7_update_binding_table_from_array(struct brw_context *brw,
+                                     gl_shader_stage stage,
+                                     const uint32_t* binding_table,
+                                     int num_surfaces)
+{
+   uint32_t dw2 = 0;
+   assert(stage <= MESA_SHADER_FRAGMENT);
+
+   BEGIN_BATCH(num_surfaces + 2);
+   OUT_BATCH(stage_to_bt_edit[stage] << 16 | num_surfaces);
+   OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL);
+   for (int i = 0; i < num_surfaces; i++) {
+      dw2 = SET_FIELD(i, BRW_BINDING_TABLE_INDEX) |
+         (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(binding_table[i]) :
+          HSW_SURFACE_STATE_EDIT(binding_table[i]));
+      OUT_BATCH(dw2);
+   }
+   ADVANCE_BATCH();
+}
+
 /**
  * Disable hardware binding table support, falling back to the
  * older software-generated binding table mechanism.
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index f8ef98f2db9..2eff1b50e83 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -374,6 +374,15 @@ gen7_upload_constant_state(struct brw_context *brw,
                            bool active, unsigned opcode);
 
 void gen7_rs_control(struct brw_context *brw, int enable);
+
+void gen7_edit_hw_binding_table_entry(struct brw_context *brw,
+                                      gl_shader_stage stage,
+                                      uint32_t index,
+                                      uint32_t surf_offset);
+void gen7_update_binding_table_from_array(struct brw_context *brw,
+                                          gl_shader_stage stage,
+                                          const uint32_t* binding_table,
+                                          int num_surfaces);
 void gen7_enable_hw_binding_tables(struct brw_context *brw);
 void gen7_disable_hw_binding_tables(struct brw_context *brw);
 void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);

From fc65b6eb610f4b1e42930cae7594131fa9ea566e Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Wed, 2 Oct 2013 16:37:20 +0300
Subject: [PATCH 0381/1208] i965: Upload binding tables in hw-generated binding
 table format.

When hardware-generated binding tables are enabled, use the hw-generated
binding table format when uploading binding table state.

Normally, the CS will will just consume the binding table pointer commands
as pipelined state. When the RS is enabled however, the RS flushes whatever
edited surface state entries of our on-chip binding table to the binding
table pool before passing the command on to the CS.

Note that the the binding table pointer offset is relative to the binding table
pool base address when resource streamer instead of the surface state base address.

v2: Fix possible buffer overflow when allocating a chunk out of the
    hw-binding table pool (Ken).
v3: Remove extra newline and add missing brace around if-statement (Matt).
v4: Fix broken INTEL_DEBUG=shader_time for hw-generated binding tables.
    Document PRM WaStateBindingTableOverfetch workaround.

Cc: kenneth@whitecape.org
Cc: mattst88@gmail.com
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 .../drivers/dri/i965/brw_binding_tables.c     | 64 ++++++++++++++++---
 1 file changed, 56 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 41590ec922e..8d3697abcd5 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -50,6 +50,35 @@ static const GLuint stage_to_bt_edit[MESA_SHADER_FRAGMENT + 1] = {
    _3DSTATE_BINDING_TABLE_EDIT_PS,
 };
 
+static uint32_t
+reserve_hw_bt_space(struct brw_context *brw, unsigned bytes)
+{
+   /* From the Broadwell PRM, Volume 16, "Workarounds",
+    * WaStateBindingTableOverfetch:
+    * "HW over-fetches two cache lines of binding table indices.  When
+    *  using the resource streamer, SW needs to pad binding table pointer
+    *  updates with an additional two cache lines."
+    *
+    * Cache lines are 64 bytes, so we subtract 128 bytes from the size of
+    * the binding table pool buffer.
+    */
+   if (brw->hw_bt_pool.next_offset + bytes >= brw->hw_bt_pool.bo->size - 128) {
+      gen7_reset_hw_bt_pool_offsets(brw);
+   }
+
+   uint32_t offset = brw->hw_bt_pool.next_offset;
+
+   /* From the Haswell PRM, Volume 2b: Command Reference: Instructions,
+    * 3DSTATE_BINDING_TABLE_POINTERS_xS:
+    *
+    * "If HW Binding Table is enabled, the offset is relative to the
+    *  Binding Table Pool Base Address and the alignment is 64 bytes."
+    */
+   brw->hw_bt_pool.next_offset += ALIGN(bytes, 64);
+
+   return offset;
+}
+
 /**
  * Upload a shader stage's binding table as indirect state.
  *
@@ -78,22 +107,41 @@ brw_upload_binding_table(struct brw_context *brw,
             brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
             brw->shader_time.bo->size, 1, true);
       }
+      /* When RS is enabled use hw-binding table uploads, otherwise fallback to
+       * software-uploads.
+       */
+      if (brw->use_resource_streamer) {
+         gen7_update_binding_table_from_array(brw, stage_state->stage,
+                                              stage_state->surf_offset,
+                                              prog_data->binding_table
+                                              .size_bytes / 4);
+      } else {
+         uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
+                                          prog_data->binding_table.size_bytes,
+                                          32,
+                                          &stage_state->bind_bo_offset);
 
-      uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
-                                       prog_data->binding_table.size_bytes, 32,
-                                       &stage_state->bind_bo_offset);
-
-      /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
-      memcpy(bind, stage_state->surf_offset,
-             prog_data->binding_table.size_bytes);
+         /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
+         memcpy(bind, stage_state->surf_offset,
+                prog_data->binding_table.size_bytes);
+      }
    }
 
    brw->ctx.NewDriverState |= brw_new_binding_table;
 
    if (brw->gen >= 7) {
+      if (brw->use_resource_streamer) {
+         stage_state->bind_bo_offset =
+            reserve_hw_bt_space(brw, prog_data->binding_table.size_bytes);
+      }
       BEGIN_BATCH(2);
       OUT_BATCH(packet_name << 16 | (2 - 2));
-      OUT_BATCH(stage_state->bind_bo_offset);
+      /* Align SurfaceStateOffset[16:6] format to [15:5] PS Binding Table field
+       * when hw-generated binding table is enabled.
+       */
+      OUT_BATCH(brw->use_resource_streamer ?
+                (stage_state->bind_bo_offset >> 1) :
+                stage_state->bind_bo_offset);
       ADVANCE_BATCH();
    }
 }

From 670914ea7cf7808ff37ca54db2844f711436031c Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Wed, 20 May 2015 18:02:44 +0300
Subject: [PATCH 0382/1208] i965: Disable resource streamer in BLORP

Switch off hardware-generated binding tables and gather push
constants in the blorp. Blorp requires only a minimal set of
simple constants. There is no need for the extra complexity
to program a gather table entry into the pipeline.

Cc: kenneth@whitecape.org
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 src/mesa/drivers/dri/i965/gen7_blorp.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index abace6df37e..9822dc1fe79 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -794,6 +794,8 @@ gen7_blorp_exec(struct brw_context *brw,
    }
    depthstencil_offset = gen6_blorp_emit_depth_stencil_state(brw, params);
    gen7_blorp_emit_depth_stencil_state_pointers(brw, depthstencil_offset);
+   if (brw->use_resource_streamer)
+      gen7_disable_hw_binding_tables(brw);
    if (params->use_wm_prog) {
       uint32_t wm_surf_offset_renderbuffer;
       uint32_t wm_surf_offset_texture = 0;

From 20e484afa4874e87cd18daffd66286bb893cf3fb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 18 Jul 2015 16:43:17 -0400
Subject: [PATCH 0383/1208] nvc0/ir: fix txq on indirect samplers

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     |  2 +-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 56 ++++++++++++++++++-
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 018a1ec6d1d..e5164ca0d18 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1728,7 +1728,7 @@ Converter::handleTXQ(Value *dst0[4], enum TexQuery query)
    }
    tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level
 
-   setTexRS(tex, c, 1, -1);
+   setTexRS(tex, ++c, 1, -1);
 
    bb->insertTail(tex);
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 7a5d1ce0299..da364f2ad04 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -956,7 +956,61 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 bool
 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 {
-   // TODO: indirect resource/sampler index
+   if (txq->tex.rIndirectSrc < 0 && txq->tex.sIndirectSrc < 0)
+      return true;
+
+   Value *ticRel = txq->getIndirectR();
+   Value *tscRel = txq->getIndirectS();
+   const int chipset = prog->getTarget()->getChipset();
+
+   txq->setIndirectS(NULL);
+   txq->tex.sIndirectSrc = -1;
+
+   if (chipset < NVISA_GK104_CHIPSET) {
+      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+      if (ticRel) {
+         txq->setSrc(txq->tex.rIndirectSrc, NULL);
+         if (txq->tex.r)
+            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                ticRel, bld.mkImm(txq->tex.r));
+      }
+      if (tscRel) {
+         txq->setSrc(txq->tex.sIndirectSrc, NULL);
+         if (txq->tex.s)
+            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                tscRel, bld.mkImm(txq->tex.s));
+      }
+
+      bld.loadImm(src, 0);
+
+      if (ticRel)
+         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
+      if (tscRel)
+         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
+
+      txq->moveSources(0, 1);
+      txq->setSrc(0, src);
+   } else {
+      // XXX this ignores tsc, and assumes a 1:1 mapping
+      assert(txq->tex.rIndirectSrc >= 0);
+      Value *hnd = loadTexHandle(
+            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                       txq->getIndirectR(), bld.mkImm(2)),
+            txq->tex.r);
+      txq->tex.r = 0xff;
+      txq->tex.s = 0x1f;
+
+      if (chipset < NVISA_GM107_CHIPSET) {
+         txq->setIndirectR(NULL);
+         txq->moveSources(0, 1);
+         txq->setSrc(0, hnd);
+         txq->tex.rIndirectSrc = 0;
+      } else {
+         txq->setIndirectR(hnd);
+      }
+   }
+
    return true;
 }
 

From 346ce0b98832e33d5411200002571b3edea9e2bb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 18 Jul 2015 18:38:42 -0400
Subject: [PATCH 0384/1208] nvc0/ir: don't worry about sampler in txq handling

There's no need to deal with samplers for texture size queries. That
code also was accidentally setting an invalid sIndirectSrc position, but
it can now just be removed.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 30 +++++--------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index da364f2ad04..e71fa113d99 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -956,44 +956,30 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 bool
 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 {
-   if (txq->tex.rIndirectSrc < 0 && txq->tex.sIndirectSrc < 0)
+   if (txq->tex.rIndirectSrc < 0)
       return true;
 
    Value *ticRel = txq->getIndirectR();
-   Value *tscRel = txq->getIndirectS();
    const int chipset = prog->getTarget()->getChipset();
 
    txq->setIndirectS(NULL);
    txq->tex.sIndirectSrc = -1;
 
+   assert(ticRel);
+
    if (chipset < NVISA_GK104_CHIPSET) {
       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
 
-      if (ticRel) {
-         txq->setSrc(txq->tex.rIndirectSrc, NULL);
-         if (txq->tex.r)
-            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
-                                ticRel, bld.mkImm(txq->tex.r));
-      }
-      if (tscRel) {
-         txq->setSrc(txq->tex.sIndirectSrc, NULL);
-         if (txq->tex.s)
-            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
-                                tscRel, bld.mkImm(txq->tex.s));
-      }
+      txq->setSrc(txq->tex.rIndirectSrc, NULL);
+      if (txq->tex.r)
+         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                             ticRel, bld.mkImm(txq->tex.r));
 
-      bld.loadImm(src, 0);
-
-      if (ticRel)
-         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
-      if (tscRel)
-         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
+      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
 
       txq->moveSources(0, 1);
       txq->setSrc(0, src);
    } else {
-      // XXX this ignores tsc, and assumes a 1:1 mapping
-      assert(txq->tex.rIndirectSrc >= 0);
       Value *hnd = loadTexHandle(
             bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                        txq->getIndirectR(), bld.mkImm(2)),

From 8c8a71f0d125bb655b17a32914ffecf8d159593b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 18 Jul 2015 19:02:29 -0400
Subject: [PATCH 0385/1208] gm107/ir: fix indirect txq emission

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp     | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 399a6f1db13..65c1f23e101 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -2441,8 +2441,14 @@ CodeEmitterGM107::emitTXQ()
       break;
    }
 
-   emitInsn (0xdf4a0000);
-   emitField(0x24, 13, insn->tex.r);
+   if (insn->tex.rIndirectSrc >= 0) {
+      emitInsn (0xdf500000);
+   } else {
+      emitInsn (0xdf480000);
+      emitField(0x24, 13, insn->tex.r);
+   }
+
+   emitField(0x31, 1, insn->tex.liveOnly);
    emitField(0x1f, 4, insn->tex.mask);
    emitField(0x16, 6, type);
    emitGPR  (0x08, insn->src(0));

From 801d41fa43eba996c6bd7c071282ad15e51609d3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 20 Jul 2015 00:19:56 -0400
Subject: [PATCH 0386/1208] nv50: fix max level clamping on G80

It appears that the G80 did not have support for the sampler view
first/last clamping. Put the view's last level in the place of the
texture's so that it doesn't go past what the sampler view allows.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/nouveau/nv50/nv50_tex.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index d69c8d6ff0d..17ae27fa85d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -71,6 +71,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
                          uint32_t flags,
                          enum pipe_texture_target target)
 {
+   const uint32_t class_3d = nouveau_context(pipe)->screen->class_3d;
    const struct util_format_description *desc;
    uint64_t addr;
    uint32_t *tic;
@@ -201,11 +202,17 @@ nv50_create_texture_view(struct pipe_context *pipe,
 
    tic[5] = (mt->base.base.height0 << mt->ms_y) & 0xffff;
    tic[5] |= depth << 16;
-   tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
+   if (class_3d > NV50_3D_CLASS)
+      tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
+   else
+      tic[5] |= view->pipe.u.tex.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
 
    tic[6] = (mt->ms_x > 1) ? 0x88000000 : 0x03000000; /* sampling points */
 
-   tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+   if (class_3d > NV50_3D_CLASS)
+      tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+   else
+      tic[7] = 0;
 
    if (unlikely(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS)))
       if (mt->base.base.last_level)

From 8ba1982b1e37aa69680e243fe391254211ae273a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 17 Jul 2015 11:54:34 +0200
Subject: [PATCH 0387/1208] i965/nir/fs: removed unneeded support for global
 variables

As functions are inlined, and nir_lower_global_vars_to_local gets
run, all global variables are lowered to local variables.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  1 -
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp     | 16 ++++------------
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |  1 -
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index c00566667e0..283ee361d17 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -371,7 +371,6 @@ public:
 
    fs_reg *nir_locals;
    fs_reg *nir_ssa_values;
-   fs_reg *nir_globals;
    fs_reg nir_inputs;
    fs_reg nir_outputs;
    fs_reg *nir_system_values;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3099dc447ec..12cd4530c37 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -55,14 +55,6 @@ fs_visitor::emit_nir_code()
 
    nir_emit_system_values(nir);
 
-   nir_globals = ralloc_array(mem_ctx, fs_reg, nir->reg_alloc);
-   foreach_list_typed(nir_register, reg, node, &nir->registers) {
-      unsigned array_elems =
-         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
-      unsigned size = array_elems * reg->num_components;
-      nir_globals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
-   }
-
    /* get the main function and emit it */
    nir_foreach_overload(nir, overload) {
       assert(strcmp(overload->function->name, "main") == 0);
@@ -1159,10 +1151,10 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
                    unsigned base_offset, nir_src *indirect)
 {
    fs_reg reg;
-   if (nir_reg->is_global)
-      reg = v->nir_globals[nir_reg->index];
-   else
-      reg = v->nir_locals[nir_reg->index];
+
+   assert(!nir_reg->is_global);
+
+   reg = v->nir_locals[nir_reg->index];
 
    reg = offset(reg, v->bld, base_offset * nir_reg->num_components);
    if (indirect) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d6a60a7cff7..47dc9254664 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -2029,7 +2029,6 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
 
    this->nir_locals = NULL;
    this->nir_ssa_values = NULL;
-   this->nir_globals = NULL;
 
    memset(&this->payload, 0, sizeof(this->payload));
    memset(this->outputs, 0, sizeof(this->outputs));

From d246a96bbc4253a8339a505df97742fd252ebc55 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 13 Jul 2015 12:51:21 +0200
Subject: [PATCH 0388/1208] nv50: add nesting support for occlusion queries

This is loosely based on nvc0.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_query.c | 29 ++++++++++++-------
 .../drivers/nouveau/nv50/nv50_screen.h        |  2 ++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 81f7474e36b..a5b95c13958 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -49,6 +49,7 @@ struct nv50_query {
    uint32_t offset; /* base + i * 32 */
    uint8_t state;
    boolean is64bit;
+   int nesting; /* only used for occlusion queries */
    struct nouveau_mm_allocation *mm;
    struct nouveau_fence *fence;
 };
@@ -175,11 +176,16 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
-      PUSH_SPACE(push, 4);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
-      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-      PUSH_DATA (push, 1);
+      q->nesting = nv50->screen->num_occlusion_queries_active++;
+      if (q->nesting) {
+         nv50_query_get(push, q, 0x10, 0x0100f002);
+      } else {
+         PUSH_SPACE(push, 4);
+         BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
+         PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 1);
+      }
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
       nv50_query_get(push, q, 0x10, 0x06805002);
@@ -223,9 +229,11 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       nv50_query_get(push, q, 0, 0x0100f002);
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-      PUSH_DATA (push, 0);
+      if (--nv50->screen->num_occlusion_queries_active == 0) {
+         PUSH_SPACE(push, 2);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 0);
+      }
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
       nv50_query_get(push, q, 0, 0x06805002);
@@ -319,7 +327,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res8[0] = TRUE;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
-      res64[0] = q->data[1];
+      res64[0] = q->data[1] - q->data[5];
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
@@ -396,8 +404,7 @@ nv50_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
          if (likely(!condition)) {
-            /* XXX: Placeholder, handle nesting here if available */
-            if (unlikely(false))
+            if (unlikely(q->nesting))
                cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
                              NV50_3D_COND_MODE_ALWAYS;
             else
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index 881051b1862..3a12a1f066b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -54,6 +54,8 @@ struct nv50_screen {
    struct nv50_context *cur_ctx;
    struct nv50_graph_state save_state;
 
+   int num_occlusion_queries_active;
+
    struct nouveau_bo *code;
    struct nouveau_bo *uniforms;
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */

From 6d207b8e3548cd7832a5edc7b847a5e7d06c0925 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 13 Jul 2015 12:52:57 +0200
Subject: [PATCH 0389/1208] nv50: turn samples counts off during blit

Fixes the following piglit test:
  occlusion_query_meta_no_fragments

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_surface.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index dc9852d4e47..66eccc25763 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -1432,6 +1432,7 @@ static void
 nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
    boolean eng3d = FALSE;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
@@ -1493,10 +1494,20 @@ nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
         info->src.box.height != -info->dst.box.height))
       eng3d = TRUE;
 
+   if (nv50->screen->num_occlusion_queries_active) {
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+   }
+
    if (!eng3d)
       nv50_blit_eng2d(nv50, info);
    else
       nv50_blit_3d(nv50, info);
+
+   if (nv50->screen->num_occlusion_queries_active) {
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 1);
+   }
 }
 
 static void

From 19a6214b0ff707ae52e9624c263b7d6c1c20e6d3 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 13 Jul 2015 13:34:31 +0200
Subject: [PATCH 0390/1208] nv50: limit the maximum number of samplers to 16

NV50_3D_BIND_TSC only allows to bind 16 samplers, and since we don't
want to do anything with NV50_3D_BIND_TSC2, just limit the maximum
number of samplers to 16 like for nvc0.

This fixes dmesg fails with the following piglit test:
 max-samplers

But the test still fails.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_screen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index c32ad01eec3..4f99b631aae 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -287,7 +287,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       /* The chip could handle more sampler views than samplers */
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
-      return MIN2(32, PIPE_MAX_SAMPLERS);
+      return MIN2(16, PIPE_MAX_SAMPLERS);
    case PIPE_SHADER_CAP_DOUBLES:
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:

From c2cb771354d2d738e0ab3ca7c8008748c5f57953 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 20 Jul 2015 18:47:17 +0200
Subject: [PATCH 0391/1208] nouveau: always align buffers to 0x100

Only constbufs must be aligned to 0x100, but since all buffers can be
rebinded as constant buffers they must be also aligned.

This patch prevents this behaviour by aligning everything to 256-byte
increments at buffer creation.

This fixes dmesg fails for the following piglit test:
  ext_transform_feedback-immediate-reuse-uniform-buffer -auto -fbo

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nouveau_buffer.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 23619467231..83d528846fd 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -40,13 +40,7 @@ static INLINE boolean
 nouveau_buffer_allocate(struct nouveau_screen *screen,
                         struct nv04_resource *buf, unsigned domain)
 {
-   uint32_t size = buf->base.width0;
-
-   if (buf->base.bind & (PIPE_BIND_CONSTANT_BUFFER |
-                         PIPE_BIND_COMPUTE_RESOURCE |
-                         PIPE_BIND_SHADER_BUFFER |
-                         PIPE_BIND_SHADER_IMAGE))
-      size = align(size, 0x100);
+   uint32_t size = align(buf->base.width0, 0x100);
 
    if (domain == NOUVEAU_BO_VRAM) {
       buf->mm = nouveau_mm_allocate(screen->mm_VRAM, size,

From 5b7dd4d41900e3c795af134e0fad59cac9e0e7b4 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 16 Jul 2015 23:05:05 +0200
Subject: [PATCH 0392/1208] nvc0: add a missing parameter to
 nvc0_set_shader_images()

This fixes a compilation warning introduced in commit 05a12c5
(gallium: add interface for writable shader images).

While we are at it, fix indentation and rename parameters according to
the gallium interface.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 337559c3be8..d18b064a0c1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -1125,9 +1125,9 @@ nvc0_set_compute_resources(struct pipe_context *pipe,
 }
 
 static void
-nvc0_set_shader_images(struct pipe_context *pipe,
-                          unsigned start, unsigned nr,
-                          struct pipe_image_view **views)
+nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader,
+                       unsigned start_slot, unsigned count,
+                       struct pipe_image_view **views)
 {
 #if 0
    nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views);

From 4be30fcd058209966fc72fbfa51bbe881c307ed5 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 20 Jul 2015 15:12:56 +0000
Subject: [PATCH 0393/1208] gallivm: Initialize LLVM Modules's DataLayout to an
 empty string.

This fixes crashes in llvmpipe with LLVM 3.8 and also some piglit tests
on radeonsi that use the draw module.

This is just a temporary solution.  The correct solution will require
creating a TargetMachine during gallivm initialization and pulling the
DataLayout from there.  This will be a somewhat invasive change, and it
will need to be validatated on multiple LLVM versions.

https://llvm.org/bugs/show_bug.cgi?id=24172

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 28 +++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 384ea864081..017d0752060 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -106,7 +106,6 @@ enum LLVM_CodeGenOpt_Level {
 static boolean
 create_pass_manager(struct gallivm_state *gallivm)
 {
-   char *td_str;
    assert(!gallivm->passmgr);
    assert(gallivm->target);
 
@@ -122,10 +121,29 @@ create_pass_manager(struct gallivm_state *gallivm)
    // Old versions of LLVM get the DataLayout from the pass manager.
    LLVMAddTargetData(gallivm->target, gallivm->passmgr);
 
-   // New ones from the Module.
-   td_str = LLVMCopyStringRepOfTargetData(gallivm->target);
-   LLVMSetDataLayout(gallivm->module, td_str);
-   free(td_str);
+   /* Setting the module's DataLayout to an empty string will cause the
+    * ExecutionEngine to copy to the DataLayout string from its target
+    * machine to the module.  As of LLVM 3.8 the module and the execution
+    * engine are required to have the same DataLayout.
+    *
+    * TODO: This is just a temporary work-around.  The correct solution is
+    * for gallivm_init_state() to create a TargetMachine and pull the
+    * DataLayout from there.  Currently, the TargetMachine used by llvmpipe
+    * is being implicitly created by the EngineBuilder in
+    * lp_build_create_jit_compiler_for_module()
+    */
+
+#if HAVE_LLVM < 0x0308
+   {
+      char *td_str;
+      // New ones from the Module.
+      td_str = LLVMCopyStringRepOfTargetData(gallivm->target);
+      LLVMSetDataLayout(gallivm->module, td_str);
+      free(td_str);
+   }
+#else
+   LLVMSetDataLayout(gallivm->module, "");
+#endif
 
    if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
       /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,

From cd0dec0d9dfab642c51774c3f5788cbdf00b8c9b Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 20 Jul 2015 21:32:43 +0200
Subject: [PATCH 0394/1208] nouveau: use bool instead of boolean

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  14 +--
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     |   8 +-
 .../codegen/nv50_ir_lowering_gm107.cpp        |   2 +-
 src/gallium/drivers/nouveau/nouveau_buffer.c  | 118 +++++++++---------
 src/gallium/drivers/nouveau/nouveau_buffer.h  |   6 +-
 src/gallium/drivers/nouveau/nouveau_context.h |   4 +-
 src/gallium/drivers/nouveau/nouveau_fence.c   |  36 +++---
 src/gallium/drivers/nouveau/nouveau_fence.h   |  14 +--
 src/gallium/drivers/nouveau/nouveau_screen.c  |   6 +-
 src/gallium/drivers/nouveau/nouveau_screen.h  |   6 +-
 src/gallium/drivers/nouveau/nouveau_video.c   |  56 ++++-----
 src/gallium/drivers/nouveau/nouveau_winsys.h  |   4 +-
 src/gallium/drivers/nouveau/nv30/nv30_clear.c |   2 +-
 .../drivers/nouveau/nv30/nv30_context.c       |   4 +-
 .../drivers/nouveau/nv30/nv30_context.h       |  10 +-
 src/gallium/drivers/nouveau/nv30/nv30_draw.c  |  22 ++--
 .../drivers/nouveau/nv30/nv30_fragprog.c      |   6 +-
 .../drivers/nouveau/nv30/nv30_miptree.c       |   4 +-
 src/gallium/drivers/nouveau/nv30/nv30_push.c  |   6 +-
 src/gallium/drivers/nouveau/nv30/nv30_query.c |   4 +-
 .../drivers/nouveau/nv30/nv30_resource.c      |   4 +-
 .../drivers/nouveau/nv30/nv30_resource.h      |   2 +-
 .../drivers/nouveau/nv30/nv30_screen.c        |   8 +-
 src/gallium/drivers/nouveau/nv30/nv30_state.h |   4 +-
 .../nouveau/nv30/nv30_state_validate.c        |   8 +-
 .../drivers/nouveau/nv30/nv30_transfer.c      |  54 ++++----
 src/gallium/drivers/nouveau/nv30/nv30_vbo.c   |  26 ++--
 .../drivers/nouveau/nv30/nv30_vertprog.c      |  12 +-
 .../drivers/nouveau/nv30/nvfx_fragprog.c      |  46 +++----
 .../drivers/nouveau/nv30/nvfx_shader.h        |   4 +-
 .../drivers/nouveau/nv30/nvfx_vertprog.c      |  50 ++++----
 src/gallium/drivers/nouveau/nv50/nv50_blit.h  |  10 +-
 .../drivers/nouveau/nv50/nv50_context.c       |  14 +--
 .../drivers/nouveau/nv50/nv50_context.h       |  16 +--
 .../drivers/nouveau/nv50/nv50_miptree.c       |  26 ++--
 .../drivers/nouveau/nv50/nv50_program.c       |  18 +--
 .../drivers/nouveau/nv50/nv50_program.h       |   6 +-
 src/gallium/drivers/nouveau/nv50/nv50_push.c  |   8 +-
 src/gallium/drivers/nouveau/nv50/nv50_query.c |  38 +++---
 .../drivers/nouveau/nv50/nv50_resource.h      |   6 +-
 .../drivers/nouveau/nv50/nv50_screen.c        |  16 +--
 .../drivers/nouveau/nv50/nv50_screen.h        |  16 +--
 .../drivers/nouveau/nv50/nv50_shader_state.c  |  18 +--
 src/gallium/drivers/nouveau/nv50/nv50_state.c |  24 ++--
 .../nouveau/nv50/nv50_state_validate.c        |  14 +--
 .../drivers/nouveau/nv50/nv50_stateobj.h      |   6 +-
 .../drivers/nouveau/nv50/nv50_surface.c       |  64 +++++-----
 src/gallium/drivers/nouveau/nv50/nv50_tex.c   |  22 ++--
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |  32 ++---
 .../drivers/nouveau/nvc0/nvc0_compute.c       |  24 ++--
 .../drivers/nouveau/nvc0/nvc0_compute.h       |   2 +-
 .../drivers/nouveau/nvc0/nvc0_context.c       |  12 +-
 .../drivers/nouveau/nvc0/nvc0_context.h       |  22 ++--
 .../drivers/nouveau/nvc0/nvc0_miptree.c       |  12 +-
 .../drivers/nouveau/nvc0/nvc0_program.c       |  18 +--
 .../drivers/nouveau/nvc0/nvc0_program.h       |   6 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_query.c |  84 ++++++-------
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  22 ++--
 .../drivers/nouveau/nvc0/nvc0_screen.h        |  16 +--
 .../drivers/nouveau/nvc0/nvc0_shader_state.c  |  12 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c |  20 +--
 .../nouveau/nvc0/nvc0_state_validate.c        |  20 +--
 .../drivers/nouveau/nvc0/nvc0_stateobj.h      |   8 +-
 .../drivers/nouveau/nvc0/nvc0_surface.c       |  54 ++++----
 src/gallium/drivers/nouveau/nvc0/nvc0_tex.c   |  38 +++---
 .../drivers/nouveau/nvc0/nvc0_transfer.c      |   8 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   |  32 ++---
 .../drivers/nouveau/nvc0/nvc0_vbo_translate.c |  18 +--
 .../drivers/nouveau/nvc0/nve4_compute.c       |  24 ++--
 .../winsys/nouveau/drm/nouveau_drm_winsys.c   |   2 +-
 70 files changed, 679 insertions(+), 679 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index dba56bf2716..e6c4d668b38 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -151,10 +151,10 @@ struct nv50_ir_prog_info
       } gp;
       struct {
          unsigned numColourResults;
-         boolean writesDepth;
-         boolean earlyFragTests;
-         boolean separateFragData;
-         boolean usesDiscard;
+         bool writesDepth;
+         bool earlyFragTests;
+         bool separateFragData;
+         bool usesDiscard;
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
@@ -180,11 +180,11 @@ struct nv50_ir_prog_info
       int8_t viewportId;         /* output index of ViewportIndex */
       uint8_t fragDepth;         /* output index of FragDepth */
       uint8_t sampleMask;        /* output index of SampleMask */
-      boolean sampleInterp;      /* perform sample interp on all fp inputs */
+      bool sampleInterp;         /* perform sample interp on all fp inputs */
       uint8_t backFaceColor[2];  /* input/output indices of back face colour */
       uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
-      boolean fp64;              /* program uses fp64 math */
-      boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */
+      bool fp64;                 /* program uses fp64 math */
+      bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
       uint8_t resInfoCBSlot;     /* cX[] used for tex handles, surface info */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index e5164ca0d18..dca30943385 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -826,7 +826,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
    if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
       tgsi_dump(tokens, 0);
 
-   mainTempsInLMem = FALSE;
+   mainTempsInLMem = false;
 }
 
 Source::~Source()
@@ -937,7 +937,7 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
       info->prop.gp.instanceCount = prop->u[0].Data;
       break;
    case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-      info->prop.fp.separateFragData = TRUE;
+      info->prop.fp.separateFragData = true;
       break;
    case TGSI_PROPERTY_FS_COORD_ORIGIN:
    case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
@@ -1155,7 +1155,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       } else
       if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
          if (insn.getDst(0).isIndirect(0))
-            mainTempsInLMem = TRUE;
+            mainTempsInLMem = true;
       }
    }
 
@@ -1163,7 +1163,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       Instruction::SrcRegister src = insn.getSrc(s);
       if (src.getFile() == TGSI_FILE_TEMPORARY) {
          if (src.isIndirect(0))
-            mainTempsInLMem = TRUE;
+            mainTempsInLMem = true;
       } else
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 596ac95d489..1f3fce2bb9a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -176,7 +176,7 @@ GM107LoweringPass::handlePOPCNT(Instruction *i)
                            i->getSrc(0), i->getSrc(1));
    i->setSrc(0, tmp);
    i->setSrc(1, NULL);
-   return TRUE;
+   return true;
 }
 
 //
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 83d528846fd..ad05fdc3e76 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -28,7 +28,7 @@ nouveau_transfer(struct pipe_transfer *transfer)
    return (struct nouveau_transfer *)transfer;
 }
 
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_malloc(struct nv04_resource *buf)
 {
    if (!buf->data)
@@ -36,7 +36,7 @@ nouveau_buffer_malloc(struct nv04_resource *buf)
    return !!buf->data;
 }
 
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_allocate(struct nouveau_screen *screen,
                         struct nv04_resource *buf, unsigned domain)
 {
@@ -53,12 +53,12 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
       buf->mm = nouveau_mm_allocate(screen->mm_GART, size,
                                     &buf->bo, &buf->offset);
       if (!buf->bo)
-         return FALSE;
+         return false;
       NOUVEAU_DRV_STAT(screen, buf_obj_current_bytes_sys, buf->base.width0);
    } else {
       assert(domain == 0);
       if (!nouveau_buffer_malloc(buf))
-         return FALSE;
+         return false;
    }
    buf->domain = domain;
    if (buf->bo)
@@ -66,7 +66,7 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
 
    util_range_set_empty(&buf->valid_buffer_range);
 
-   return TRUE;
+   return true;
 }
 
 static INLINE void
@@ -93,7 +93,7 @@ nouveau_buffer_release_gpu_storage(struct nv04_resource *buf)
    buf->domain = 0;
 }
 
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_reallocate(struct nouveau_screen *screen,
                           struct nv04_resource *buf, unsigned domain)
 {
@@ -134,13 +134,13 @@ nouveau_buffer_destroy(struct pipe_screen *pscreen,
  */
 static uint8_t *
 nouveau_transfer_staging(struct nouveau_context *nv,
-                         struct nouveau_transfer *tx, boolean permit_pb)
+                         struct nouveau_transfer *tx, bool permit_pb)
 {
    const unsigned adj = tx->base.box.x & NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK;
    const unsigned size = align(tx->base.box.width, 4) + adj;
 
    if (!nv->push_data)
-      permit_pb = FALSE;
+      permit_pb = false;
 
    if ((size <= NOUVEAU_TRANSFER_PUSHBUF_THRESHOLD) && permit_pb) {
       tx->map = align_malloc(size, NOUVEAU_MIN_BUFFER_MAP_ALIGN);
@@ -162,7 +162,7 @@ nouveau_transfer_staging(struct nouveau_context *nv,
  * buffer. Also updates buf->data if present.
  *
  * Maybe just migrate to GART right away if we actually need to do this. */
-static boolean
+static bool
 nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx)
 {
    struct nv04_resource *buf = nv04_resource(tx->base.resource);
@@ -175,12 +175,12 @@ nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx)
                  buf->bo, buf->offset + base, buf->domain, size);
 
    if (nouveau_bo_wait(tx->bo, NOUVEAU_BO_RD, nv->client))
-      return FALSE;
+      return false;
 
    if (buf->data)
       memcpy(buf->data + base, tx->map, size);
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -190,7 +190,7 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
    struct nv04_resource *buf = nv04_resource(tx->base.resource);
    uint8_t *data = tx->map + offset;
    const unsigned base = tx->base.box.x + offset;
-   const boolean can_cb = !((base | size) & 3);
+   const bool can_cb = !((base | size) & 3);
 
    if (buf->data)
       memcpy(data, buf->data + base, size);
@@ -219,32 +219,32 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
 /* Does a CPU wait for the buffer's backing data to become reliably accessible
  * for write/read by waiting on the buffer's relevant fences.
  */
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ) {
       if (!buf->fence_wr)
-         return TRUE;
+         return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence_wr));
       if (!nouveau_fence_wait(buf->fence_wr))
-         return FALSE;
+         return false;
    } else {
       if (!buf->fence)
-         return TRUE;
+         return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence));
       if (!nouveau_fence_wait(buf->fence))
-         return FALSE;
+         return false;
 
       nouveau_fence_ref(NULL, &buf->fence);
    }
    nouveau_fence_ref(NULL, &buf->fence_wr);
 
-   return TRUE;
+   return true;
 }
 
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ)
@@ -292,11 +292,11 @@ nouveau_buffer_transfer_del(struct nouveau_context *nv,
 }
 
 /* Creates a cache in system memory of the buffer data. */
-static boolean
+static bool
 nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
 {
    struct nouveau_transfer tx;
-   boolean ret;
+   bool ret;
    tx.base.resource = &buf->base;
    tx.base.box.x = 0;
    tx.base.box.width = buf->base.width0;
@@ -305,13 +305,13 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
 
    if (!buf->data)
       if (!nouveau_buffer_malloc(buf))
-         return FALSE;
+         return false;
    if (!(buf->status & NOUVEAU_BUFFER_STATUS_DIRTY))
-      return TRUE;
+      return true;
    nv->stats.buf_cache_count++;
 
-   if (!nouveau_transfer_staging(nv, &tx, FALSE))
-      return FALSE;
+   if (!nouveau_transfer_staging(nv, &tx, false))
+      return false;
 
    ret = nouveau_transfer_read(nv, &tx);
    if (ret) {
@@ -330,15 +330,15 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
  * resource. This can be useful if we would otherwise have to wait for a read
  * operation to complete on this data.
  */
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_should_discard(struct nv04_resource *buf, unsigned usage)
 {
    if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE))
-      return FALSE;
+      return false;
    if (unlikely(buf->base.bind & PIPE_BIND_SHARED))
-      return FALSE;
+      return false;
    if (unlikely(usage & PIPE_TRANSFER_PERSISTENT))
-      return FALSE;
+      return false;
    return buf->mm && nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE);
 }
 
@@ -408,7 +408,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
           * back into VRAM on unmap. */
          if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)
             buf->status &= NOUVEAU_BUFFER_STATUS_REALLOC_MASK;
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
       } else {
          if (buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
             /* The GPU is currently writing to this buffer. Copy its current
@@ -419,13 +419,13 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
                align_free(buf->data);
                buf->data = NULL;
             }
-            nouveau_transfer_staging(nv, tx, FALSE);
+            nouveau_transfer_staging(nv, tx, false);
             nouveau_transfer_read(nv, tx);
          } else {
             /* The buffer is currently idle. Create a staging area for writes,
              * and make sure that the cached data is up-to-date. */
             if (usage & PIPE_TRANSFER_WRITE)
-               nouveau_transfer_staging(nv, tx, TRUE);
+               nouveau_transfer_staging(nv, tx, true);
             if (!buf->data)
                nouveau_buffer_cache(nv, buf);
          }
@@ -477,7 +477,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
          /* The whole range is being discarded, so it doesn't matter what was
           * there before. No need to copy anything over. */
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
          map = tx->map;
       } else
       if (nouveau_buffer_busy(buf, PIPE_TRANSFER_READ)) {
@@ -488,7 +488,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       } else {
          /* It is expected that the returned buffer be a representation of the
           * data in question, so we must copy it over from the buffer. */
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
          if (tx->map)
             memcpy(tx->map, map, box->width);
          map = tx->map;
@@ -539,7 +539,7 @@ nouveau_buffer_transfer_unmap(struct pipe_context *pipe,
          const uint8_t bind = buf->base.bind;
          /* make sure we invalidate dedicated caches */
          if (bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER))
-            nv->vbo_dirty = TRUE;
+            nv->vbo_dirty = true;
       }
 
       util_range_add(&buf->valid_buffer_range,
@@ -634,7 +634,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
 {
    struct nouveau_screen *screen = nouveau_screen(pscreen);
    struct nv04_resource *buffer;
-   boolean ret;
+   bool ret;
 
    buffer = CALLOC_STRUCT(nv04_resource);
    if (!buffer)
@@ -678,7 +678,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
    }
    ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);
 
-   if (ret == FALSE)
+   if (ret == false)
       goto fail;
 
    if (buffer->domain == NOUVEAU_BO_VRAM && screen->hint_buf_keep_sysmem_copy)
@@ -725,20 +725,20 @@ nouveau_user_buffer_create(struct pipe_screen *pscreen, void *ptr,
    return &buffer->base;
 }
 
-static INLINE boolean
+static INLINE bool
 nouveau_buffer_data_fetch(struct nouveau_context *nv, struct nv04_resource *buf,
                           struct nouveau_bo *bo, unsigned offset, unsigned size)
 {
    if (!nouveau_buffer_malloc(buf))
-      return FALSE;
+      return false;
    if (nouveau_bo_map(bo, NOUVEAU_BO_RD, nv->client))
-      return FALSE;
+      return false;
    memcpy(buf->data, (uint8_t *)bo->map + offset, size);
-   return TRUE;
+   return true;
 }
 
 /* Migrate a linear buffer (vertex, index, constants) USER -> GART -> VRAM. */
-boolean
+bool
 nouveau_buffer_migrate(struct nouveau_context *nv,
                        struct nv04_resource *buf, const unsigned new_domain)
 {
@@ -753,7 +753,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
 
    if (new_domain == NOUVEAU_BO_GART && old_domain == 0) {
       if (!nouveau_buffer_allocate(screen, buf, new_domain))
-         return FALSE;
+         return false;
       ret = nouveau_bo_map(buf->bo, 0, nv->client);
       if (ret)
          return ret;
@@ -766,7 +766,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
       if (new_domain == NOUVEAU_BO_VRAM) {
          /* keep a system memory copy of our data in case we hit a fallback */
          if (!nouveau_buffer_data_fetch(nv, buf, buf->bo, buf->offset, size))
-            return FALSE;
+            return false;
          if (nouveau_mesa_debug)
             debug_printf("migrating %u KiB to VRAM\n", size / 1024);
       }
@@ -787,28 +787,28 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
    if (new_domain == NOUVEAU_BO_VRAM && old_domain == 0) {
       struct nouveau_transfer tx;
       if (!nouveau_buffer_allocate(screen, buf, NOUVEAU_BO_VRAM))
-         return FALSE;
+         return false;
       tx.base.resource = &buf->base;
       tx.base.box.x = 0;
       tx.base.box.width = buf->base.width0;
       tx.bo = NULL;
       tx.map = NULL;
-      if (!nouveau_transfer_staging(nv, &tx, FALSE))
-         return FALSE;
+      if (!nouveau_transfer_staging(nv, &tx, false))
+         return false;
       nouveau_transfer_write(nv, &tx, 0, tx.base.box.width);
       nouveau_buffer_transfer_del(nv, &tx);
    } else
-      return FALSE;
+      return false;
 
    assert(buf->domain == new_domain);
-   return TRUE;
+   return true;
 }
 
 /* Migrate data from glVertexAttribPointer(non-VBO) user buffers to GART.
  * We'd like to only allocate @size bytes here, but then we'd have to rebase
  * the vertex indices ...
  */
-boolean
+bool
 nouveau_user_buffer_upload(struct nouveau_context *nv,
                            struct nv04_resource *buf,
                            unsigned base, unsigned size)
@@ -820,14 +820,14 @@ nouveau_user_buffer_upload(struct nouveau_context *nv,
 
    buf->base.width0 = base + size;
    if (!nouveau_buffer_reallocate(screen, buf, NOUVEAU_BO_GART))
-      return FALSE;
+      return false;
 
    ret = nouveau_bo_map(buf->bo, 0, nv->client);
    if (ret)
-      return FALSE;
+      return false;
    memcpy((uint8_t *)buf->bo->map + buf->offset + base, buf->data + base, size);
 
-   return TRUE;
+   return true;
 }
 
 
@@ -870,7 +870,7 @@ nouveau_scratch_runout_release(struct nouveau_context *nv)
 /* Allocate an extra bo if we can't fit everything we need simultaneously.
  * (Could happen for very large user arrays.)
  */
-static INLINE boolean
+static INLINE bool
 nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 {
    int ret;
@@ -904,7 +904,7 @@ nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 /* Continue to next scratch buffer, if available (no wrapping, large enough).
  * Allocate it if it has not yet been created.
  */
-static INLINE boolean
+static INLINE bool
 nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
 {
    struct nouveau_bo *bo;
@@ -912,14 +912,14 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
    const unsigned i = (nv->scratch.id + 1) % NOUVEAU_MAX_SCRATCH_BUFS;
 
    if ((size > nv->scratch.bo_size) || (i == nv->scratch.wrap))
-      return FALSE;
+      return false;
    nv->scratch.id = i;
 
    bo = nv->scratch.bo[i];
    if (!bo) {
       ret = nouveau_scratch_bo_alloc(nv, &bo, nv->scratch.bo_size);
       if (ret)
-         return FALSE;
+         return false;
       nv->scratch.bo[i] = bo;
    }
    nv->scratch.current = bo;
@@ -932,10 +932,10 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
    return !ret;
 }
 
-static boolean
+static bool
 nouveau_scratch_more(struct nouveau_context *nv, unsigned min_size)
 {
-   boolean ret;
+   bool ret;
 
    ret = nouveau_scratch_next(nv, min_size);
    if (!ret)
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h
index de77f481da3..4d05d3d2397 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.h
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.h
@@ -58,7 +58,7 @@ nouveau_copy_buffer(struct nouveau_context *,
                     struct nv04_resource *dst, unsigned dst_pos,
                     struct nv04_resource *src, unsigned src_pos, unsigned size);
 
-boolean
+bool
 nouveau_buffer_migrate(struct nouveau_context *,
                        struct nv04_resource *, unsigned domain);
 
@@ -79,7 +79,7 @@ nv04_resource(struct pipe_resource *resource)
 }
 
 /* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */
-static INLINE boolean
+static INLINE bool
 nouveau_resource_mapped_by_gpu(struct pipe_resource *resource)
 {
    return nv04_resource(resource)->domain != 0;
@@ -93,7 +93,7 @@ struct pipe_resource *
 nouveau_user_buffer_create(struct pipe_screen *screen, void *ptr,
                            unsigned bytes, unsigned usage);
 
-boolean
+bool
 nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *,
                            unsigned base, unsigned size);
 
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index c2ba0159afe..d99e485bbc6 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -13,7 +13,7 @@ struct nouveau_context {
    struct nouveau_client *client;
    struct nouveau_pushbuf *pushbuf;
 
-   boolean vbo_dirty;
+   bool vbo_dirty;
 
    void (*copy_data)(struct nouveau_context *,
                      struct nouveau_bo *dst, unsigned, unsigned,
@@ -104,7 +104,7 @@ nouveau_context_update_frame_stats(struct nouveau_context *nv)
       nv->stats.buf_cache_count = 0;
       nv->stats.buf_cache_frame |= 1;
       if ((nv->stats.buf_cache_frame & 0xf) == 0xf)
-         nv->screen->hint_buf_keep_sysmem_copy = TRUE;
+         nv->screen->hint_buf_keep_sysmem_copy = true;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index 17a5174594d..abcdb479954 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -28,13 +28,13 @@
 #include <sched.h>
 #endif
 
-boolean
+bool
 nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence,
-                  boolean emit)
+                  bool emit)
 {
    *fence = CALLOC_STRUCT(nouveau_fence);
    if (!*fence)
-      return FALSE;
+      return false;
 
    (*fence)->screen = screen;
    (*fence)->ref = 1;
@@ -43,7 +43,7 @@ nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence,
    if (emit)
       nouveau_fence_emit(*fence);
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -58,7 +58,7 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
    }
 }
 
-boolean
+bool
 nouveau_fence_work(struct nouveau_fence *fence,
                    void (*func)(void *), void *data)
 {
@@ -66,16 +66,16 @@ nouveau_fence_work(struct nouveau_fence *fence,
 
    if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
       func(data);
-      return TRUE;
+      return true;
    }
 
    work = CALLOC_STRUCT(nouveau_fence_work);
    if (!work)
-      return FALSE;
+      return false;
    work->func = func;
    work->data = data;
    LIST_ADD(&work->list, &fence->work);
-   return TRUE;
+   return true;
 }
 
 void
@@ -132,7 +132,7 @@ nouveau_fence_del(struct nouveau_fence *fence)
 }
 
 void
-nouveau_fence_update(struct nouveau_screen *screen, boolean flushed)
+nouveau_fence_update(struct nouveau_screen *screen, bool flushed)
 {
    struct nouveau_fence *fence;
    struct nouveau_fence *next = NULL;
@@ -167,21 +167,21 @@ nouveau_fence_update(struct nouveau_screen *screen, boolean flushed)
 
 #define NOUVEAU_FENCE_MAX_SPINS (1 << 31)
 
-boolean
+bool
 nouveau_fence_signalled(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
 
    if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
-      return TRUE;
+      return true;
 
    if (fence->state >= NOUVEAU_FENCE_STATE_EMITTED)
-      nouveau_fence_update(screen, FALSE);
+      nouveau_fence_update(screen, false);
 
    return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
 }
 
-boolean
+bool
 nouveau_fence_wait(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
@@ -195,16 +195,16 @@ nouveau_fence_wait(struct nouveau_fence *fence)
 
    if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED)
       if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel))
-         return FALSE;
+         return false;
 
    if (fence == screen->fence.current)
       nouveau_fence_next(screen);
 
    do {
-      nouveau_fence_update(screen, FALSE);
+      nouveau_fence_update(screen, false);
 
       if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
-         return TRUE;
+         return true;
       if (!spins)
          NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
       spins++;
@@ -218,7 +218,7 @@ nouveau_fence_wait(struct nouveau_fence *fence)
                 fence->sequence,
                 screen->fence.sequence_ack, screen->fence.sequence);
 
-   return FALSE;
+   return false;
 }
 
 void
@@ -229,5 +229,5 @@ nouveau_fence_next(struct nouveau_screen *screen)
 
    nouveau_fence_ref(NULL, &screen->fence.current);
 
-   nouveau_fence_new(screen, &screen->fence.current, FALSE);
+   nouveau_fence_new(screen, &screen->fence.current, false);
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 7bb132a5d15..904df0be7fa 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -29,13 +29,13 @@ struct nouveau_fence {
 void nouveau_fence_emit(struct nouveau_fence *);
 void nouveau_fence_del(struct nouveau_fence *);
 
-boolean nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
-                          boolean emit);
-boolean nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
-void    nouveau_fence_update(struct nouveau_screen *, boolean flushed);
-void    nouveau_fence_next(struct nouveau_screen *);
-boolean nouveau_fence_wait(struct nouveau_fence *);
-boolean nouveau_fence_signalled(struct nouveau_fence *);
+bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
+                       bool emit);
+bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
+void nouveau_fence_update(struct nouveau_screen *, bool flushed);
+void nouveau_fence_next(struct nouveau_screen *);
+bool nouveau_fence_wait(struct nouveau_fence *);
+bool nouveau_fence_signalled(struct nouveau_fence *);
 
 static INLINE void
 nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref)
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 0f6313c768c..b2290e7e784 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -111,7 +111,7 @@ nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
 }
 
 
-boolean
+bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 			     struct nouveau_bo *bo,
 			     unsigned stride,
@@ -123,11 +123,11 @@ nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 		return nouveau_bo_name_get(bo, &whandle->handle) == 0;
 	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
 		whandle->handle = bo->handle;
-		return TRUE;
+		return true;
 	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
 		return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
 	} else {
-		return FALSE;
+		return false;
 	}
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 30041b271c9..05f56ed5842 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -49,7 +49,7 @@ struct nouveau_screen {
 
 	int64_t cpu_gpu_time_delta;
 
-	boolean hint_buf_keep_sysmem_copy;
+	bool hint_buf_keep_sysmem_copy;
 
 	unsigned vram_domain;
 
@@ -118,9 +118,9 @@ nouveau_screen(struct pipe_screen *pscreen)
 	return (struct nouveau_screen *)pscreen;
 }
 
-boolean nouveau_drm_screen_unref(struct nouveau_screen *screen);
+bool nouveau_drm_screen_unref(struct nouveau_screen *screen);
 
-boolean
+bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 			     struct nouveau_bo *bo,
 			     unsigned stride,
diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c
index d6330fa63a8..30f1d9dcb25 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -296,16 +296,16 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec,
       case PIPE_MPEG12_MO_TYPE_DUAL_PRIME: {
          base = NV17_MPEG_CMD_CHROMA_MV_HEADER_COUNT_2;
          if (forward) {
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE,
-                              x, y, mb->PMV[0][0], dec->past, TRUE);
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, TRUE,
-                              x, y2, mb->PMV[0][0], dec->past, FALSE);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true, false,
+                              x, y, mb->PMV[0][0], dec->past, true);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true, true,
+                              x, y2, mb->PMV[0][0], dec->past, false);
          }
          if (backward && forward) {
-            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, TRUE,
-                              x, y, mb->PMV[1][0], dec->future, TRUE);
-            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE,
-                              x, y2, mb->PMV[1][1], dec->future, FALSE);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, true,
+                              x, y, mb->PMV[1][0], dec->future, true);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false,
+                              x, y2, mb->PMV[1][1], dec->future, false);
          } else assert(!backward);
          break;
       }
@@ -320,13 +320,13 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec,
          if (frame)
             base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME;
          if (forward)
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                               dec->picture_structure != PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP,
-                              x, y, mb->PMV[0][0], dec->past, TRUE);
+                              x, y, mb->PMV[0][0], dec->past, true);
          if (backward && forward)
-            nouveau_vpe_mb_mv(dec, base, luma, frame, FALSE,
+            nouveau_vpe_mb_mv(dec, base, luma, frame, false,
                               dec->picture_structure == PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP,
-                              x, y, mb->PMV[0][1], dec->future, TRUE);
+                              x, y, mb->PMV[0][1], dec->future, true);
          else assert(!backward);
          break;
       }
@@ -341,11 +341,11 @@ mv1:
        base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME;
     /* frame 16x16 */
    if (forward)
-       nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE,
-                         x, y, mb->PMV[0][0], dec->past, TRUE);
+       nouveau_vpe_mb_mv(dec, base, luma, frame, true, false,
+                         x, y, mb->PMV[0][0], dec->past, true);
    if (backward)
-       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE,
-                         x, y, mb->PMV[0][1], dec->future, TRUE);
+       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false,
+                         x, y, mb->PMV[0][1], dec->future, true);
     return;
 
 mv2:
@@ -353,20 +353,20 @@ mv2:
    if (!frame)
       base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_MV_SPLIT_HALF_MB;
    if (forward) {
-      nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+      nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_FORWARD,
-                        x, y, mb->PMV[0][0], dec->past, TRUE);
-      nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+                        x, y, mb->PMV[0][0], dec->past, true);
+      nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_FORWARD,
-                        x, y2, mb->PMV[1][0], dec->past, FALSE);
+                        x, y2, mb->PMV[1][0], dec->past, false);
    }
    if (backward) {
       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_BACKWARD,
-                        x, y, mb->PMV[0][1], dec->future, TRUE);
+                        x, y, mb->PMV[0][1], dec->future, true);
       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_BACKWARD,
-                        x, y2, mb->PMV[1][1], dec->future, FALSE);
+                        x, y2, mb->PMV[1][1], dec->future, false);
    }
 }
 
@@ -438,14 +438,14 @@ nouveau_decoder_decode_macroblock(struct pipe_video_codec *decoder,
    mb = (const struct pipe_mpeg12_macroblock *)pipe_mb;
    for (i = 0; i < num_macroblocks; ++i, mb++) {
       if (mb->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA) {
-         nouveau_vpe_mb_dct_header(dec, mb, TRUE);
-         nouveau_vpe_mb_dct_header(dec, mb, FALSE);
+         nouveau_vpe_mb_dct_header(dec, mb, true);
+         nouveau_vpe_mb_dct_header(dec, mb, false);
       } else {
-         nouveau_vpe_mb_mv_header(dec, mb, TRUE);
-         nouveau_vpe_mb_dct_header(dec, mb, TRUE);
+         nouveau_vpe_mb_mv_header(dec, mb, true);
+         nouveau_vpe_mb_dct_header(dec, mb, true);
 
-         nouveau_vpe_mb_mv_header(dec, mb, FALSE);
-         nouveau_vpe_mb_dct_header(dec, mb, FALSE);
+         nouveau_vpe_mb_mv_header(dec, mb, false);
+         nouveau_vpe_mb_dct_header(dec, mb, false);
       }
       if (dec->base.entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT)
          nouveau_vpe_mb_dct_blocks(dec, mb);
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 51effb1d8d2..a5bb489980b 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -21,12 +21,12 @@ PUSH_AVAIL(struct nouveau_pushbuf *push)
    return push->end - push->cur;
 }
 
-static INLINE boolean
+static INLINE bool
 PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size)
 {
    if (PUSH_AVAIL(push) < size)
       return nouveau_pushbuf_space(push, size, 0, 0) == 0;
-   return TRUE;
+   return true;
 }
 
 static INLINE void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
index 83fd1fa38dd..c92e3b4e7ea 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers,
    struct pipe_framebuffer_state *fb = &nv30->framebuffer;
    uint32_t colr = 0, zeta = 0, mode = 0;
 
-   if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE))
+   if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, true))
       return;
 
    if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index ef035e58f3c..6e88ed725d6 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -45,7 +45,7 @@ nv30_context_kick_notify(struct nouveau_pushbuf *push)
    screen = &nv30->screen->base;
 
    nouveau_fence_next(screen);
-   nouveau_fence_update(screen, TRUE);
+   nouveau_fence_update(screen, true);
 
    if (push->bufctx) {
       struct nouveau_bufref *bref;
@@ -239,7 +239,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv)
 
    nv30->config.aniso = NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF;
 
-   if (debug_get_bool_option("NV30_SWTNL", FALSE))
+   if (debug_get_bool_option("NV30_SWTNL", false))
       nv30->draw_flags |= NV30_NEW_SWTNL;
 
    nv30->sample_mask = 0xffff;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 7181336b562..92e9284d52c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -52,7 +52,7 @@ struct nv30_context {
       unsigned scissor_off;
       unsigned num_vtxelts;
       int index_bias;
-      boolean  prim_restart;
+      bool prim_restart;
       struct nv30_fragprog *fragprog;
    } state;
 
@@ -115,14 +115,14 @@ struct nv30_context {
    uint32_t vbo_user;
    unsigned vbo_min_index;
    unsigned vbo_max_index;
-   boolean  vbo_push_hint;
+   bool vbo_push_hint;
 
    struct nouveau_heap  *blit_vp;
    struct pipe_resource *blit_fp;
 
    struct pipe_query *render_cond_query;
    unsigned render_cond_mode;
-   boolean render_cond_cond;
+   bool render_cond_cond;
 };
 
 static INLINE struct nv30_context *
@@ -204,8 +204,8 @@ nv30_draw_init(struct pipe_context *pipe);
 void
 nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
 
-boolean
-nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl);
+bool
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl);
 
 void
 nv30_state_release(struct nv30_context *nv30);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index c1665b7ad2f..adfa9f11d66 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -79,12 +79,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render,
                                      PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM,
                                      render->max_vertex_buffer_bytes);
       if (!r->buffer)
-         return FALSE;
+         return false;
 
       r->offset = 0;
    }
 
-   return TRUE;
+   return true;
 }
 
 static void *
@@ -134,7 +134,7 @@ nv30_render_draw_elements(struct vbuf_render *render,
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, FALSE))
+   if (!nv30_state_validate(nv30, ~0, false))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -179,7 +179,7 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, FALSE))
+   if (!nv30_state_validate(nv30, ~0, false))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -221,7 +221,7 @@ static const struct {
    [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
 };
 
-static boolean
+static bool
 vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
 {
    struct nv30_screen *screen = r->nv30->screen;
@@ -245,7 +245,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    }
 
    if (emit == EMIT_OMIT)
-      return FALSE;
+      return false;
 
    draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
    format = draw_translate_vinfo_format(emit);
@@ -272,10 +272,10 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
       assert(sem == TGSI_SEMANTIC_TEXCOORD);
       *idx = 0x00001000 << (result - 8);
    }
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nv30_render_validate(struct nv30_context *nv30)
 {
    struct nv30_render *r = nv30_render(nv30->draw->render);
@@ -300,7 +300,7 @@ nv30_render_validate(struct nv30_context *nv30)
          }
 
          if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog))
-            return FALSE;
+            return false;
       }
    }
 
@@ -370,7 +370,7 @@ nv30_render_validate(struct nv30_context *nv30)
    }
 
    vinfo->size /= 4;
-   return TRUE;
+   return true;
 }
 
 void
@@ -519,6 +519,6 @@ nv30_draw_init(struct pipe_context *pipe)
    draw_set_rasterize_stage(draw, stage);
    draw_wide_line_threshold(draw, 10000000.f);
    draw_wide_point_threshold(draw, 10000000.f);
-   draw_wide_point_sprites(draw, TRUE);
+   draw_wide_point_sprites(draw, true);
    nv30->draw = draw;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index 54f91bbd48b..6de61bcc1c0 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -68,7 +68,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    struct nv30_fragprog *fp = nv30->fragprog.program;
-   boolean upload = FALSE;
+   bool upload = false;
    int i;
 
    if (!fp->translated) {
@@ -76,7 +76,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
       if (!fp->translated)
          return;
 
-      upload = TRUE;
+      upload = true;
    }
 
    /* update constants, also needs to be done on every fp switch as we
@@ -93,7 +93,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
          if (!memcmp(&fp->insn[off], &cbuf[idx], 4 * 4))
             continue;
          memcpy(&fp->insn[off], &cbuf[idx], 4 * 4);
-         upload = TRUE;
+         upload = true;
       }
    }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
index 846dcebc3a5..ea654fb2617 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
@@ -54,7 +54,7 @@ nv30_miptree_get_handle(struct pipe_screen *pscreen,
    unsigned stride;
 
    if (!mt || !mt->base.bo)
-      return FALSE;
+      return false;
 
    stride = mt->level[0].pitch;
 
@@ -372,7 +372,7 @@ nv30_miptree_create(struct pipe_screen *pscreen,
    }
 
    if (!mt->uniform_pitch)
-      mt->swizzled = TRUE;
+      mt->swizzled = true;
 
    size = 0;
    for (l = 0; l <= pt->last_level; l++) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c
index e0734fa70d3..011f1e437dc 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_push.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c
@@ -47,7 +47,7 @@ struct push_context {
 
    struct translate *translate;
 
-   boolean primitive_restart;
+   bool primitive_restart;
    uint32_t prim;
    uint32_t restart_index;
 };
@@ -199,7 +199,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   boolean apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv30->base.pushbuf;
    ctx.translate = nv30->vertex->translate;
@@ -241,7 +241,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info)
    } else {
       ctx.idxbuf = NULL;
       index_size = 0;
-      ctx.primitive_restart = FALSE;
+      ctx.primitive_restart = false;
       ctx.restart_index = 0;
    }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
index 516ee83168e..dfe2059e6ea 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_query.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -208,7 +208,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    if (ntfy1) {
       while (ntfy1[3] & 0xff000000) {
          if (!wait)
-            return FALSE;
+            return false;
       }
 
       switch (q->type) {
@@ -228,7 +228,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    }
 
    *res64 = q->result;
-   return TRUE;
+   return true;
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
index 38fac8af898..a98a6464de8 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
@@ -42,12 +42,12 @@ nv30_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nv30->vtxbuf[i].buffer)
             continue;
          if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nv30->base.vbo_dirty = TRUE;
+            nv30->base.vbo_dirty = true;
       }
 
       if (nv30->idxbuf.buffer &&
           nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv30->base.vbo_dirty = TRUE;
+         nv30->base.vbo_dirty = true;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
index 1981c8d9ab9..7ea7b42faa7 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
@@ -32,7 +32,7 @@ struct nv30_miptree {
    struct nv30_miptree_level level[13];
    uint32_t uniform_pitch;
    uint32_t layer_size;
-   boolean swizzled;
+   bool swizzled;
    unsigned ms_mode;
    unsigned ms_x:1;
    unsigned ms_y:1;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index b63de694fa7..97cf058ce29 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -315,12 +315,12 @@ nv30_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 4)
-      return FALSE;
+      return false;
    if (!(0x00000017 & (1 << sample_count)))
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings)) {
-      return FALSE;
+      return false;
    }
 
    /* transfers & shared are always supported */
@@ -658,6 +658,6 @@ nv30_screen_create(struct nouveau_device *dev)
 
    nouveau_pushbuf_kick(push, push->channel);
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
    return pscreen;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h
index e27e16fae82..1f13d24f349 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h
@@ -80,7 +80,7 @@ struct nv30_vertprog {
    struct tgsi_shader_info info;
 
    struct draw_vertex_shader *draw;
-   boolean translated;
+   bool translated;
    unsigned enabled_ucps;
    uint16_t texcoord[10];
 
@@ -109,7 +109,7 @@ struct nv30_fragprog {
    struct tgsi_shader_info info;
 
    struct draw_fragment_shader *draw;
-   boolean translated;
+   bool translated;
 
    uint32_t *insn;
    unsigned insn_len;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
index a954dcce562..8957634f0fa 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
@@ -453,8 +453,8 @@ nv30_state_context_switch(struct nv30_context *nv30)
    nv30->base.pushbuf->user_priv = &nv30->bufctx;
 }
 
-boolean
-nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
+bool
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl)
 {
    struct nouveau_screen *screen = &nv30->screen->base;
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
@@ -494,7 +494,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
    nouveau_pushbuf_bufctx(push, bctx);
    if (nouveau_pushbuf_validate(push)) {
       nouveau_pushbuf_bufctx(push, NULL);
-      return FALSE;
+      return false;
    }
 
    /*XXX*/
@@ -528,7 +528,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
       }
    }
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
index 99bc0994ac2..d76b73ab4a4 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -41,30 +41,30 @@
  * of different ways.
  */
 
-static INLINE boolean
+static INLINE bool
 nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst)
 {
    if (src->x1 - src->x0 != dst->x1 - dst->x0)
-      return TRUE;
+      return true;
    if (src->y1 - src->y0 != dst->y1 - dst->y0)
-      return TRUE;
-   return FALSE;
+      return true;
+   return false;
 }
 
-static INLINE boolean
+static INLINE bool
 nv30_transfer_blit(XFER_ARGS)
 {
    if (nv30->screen->eng3d->oclass < NV40_3D_CLASS)
-      return FALSE;
+      return false;
    if (dst->offset & 63 || dst->pitch & 63 || dst->d > 1)
-      return FALSE;
+      return false;
    if (dst->w < 2 || dst->h < 2)
-      return FALSE;
+      return false;
    if (dst->cpp > 4 || (dst->cpp == 1 && !dst->pitch))
-      return FALSE;
+      return false;
    if (src->cpp > 4)
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
 static INLINE struct nouveau_heap *
@@ -368,29 +368,29 @@ nv30_transfer_rect_blit(XFER_ARGS)
    PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
 }
 
-static boolean
+static bool
 nv30_transfer_sifm(XFER_ARGS)
 {
    if (!src->pitch || (src->w | src->h) > 1024 || src->w < 2 || src->h < 2)
-      return FALSE;
+      return false;
 
    if (src->d > 1 || dst->d > 1)
-      return FALSE;
+      return false;
 
    if (dst->offset & 63)
-      return FALSE;
+      return false;
 
    if (!dst->pitch) {
       if ((dst->w | dst->h) > 2048 || dst->w < 2 || dst->h < 2)
-         return FALSE;
+         return false;
    } else {
       if (dst->domain != NOUVEAU_BO_VRAM)
-         return FALSE;
+         return false;
       if (dst->pitch & 63)
-         return FALSE;
+         return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -481,14 +481,14 @@ nv30_transfer_rect_sifm(XFER_ARGS)
  * that name is still accurate on nv4x) error.
  */
 
-static boolean
+static bool
 nv30_transfer_m2mf(XFER_ARGS)
 {
    if (!src->pitch || !dst->pitch)
-      return FALSE;
+      return false;
    if (nv30_transfer_scaled(src, dst))
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
 static void
@@ -540,12 +540,12 @@ nv30_transfer_rect_m2mf(XFER_ARGS)
    }
 }
 
-static boolean
+static bool
 nv30_transfer_cpu(XFER_ARGS)
 {
    if (nv30_transfer_scaled(src, dst))
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
 static char *
@@ -653,7 +653,7 @@ nv30_transfer_rect(struct nv30_context *nv30, enum nv30_transfer_filter filter,
 {
    static const struct {
       char *name;
-      boolean (*possible)(XFER_ARGS);
+      bool (*possible)(XFER_ARGS);
       void (*execute)(XFER_ARGS);
    } *method, methods[] = {
       { "m2mf", nv30_transfer_m2mf, nv30_transfer_rect_m2mf },
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index adea1dcb77c..b0b1b54046d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -119,7 +119,7 @@ nv30_prevalidate_vbufs(struct nv30_context *nv30)
             } else {
                nouveau_buffer_migrate(&nv30->base, buf, NOUVEAU_BO_GART);
             }
-            nv30->base.vbo_dirty = TRUE;
+            nv30->base.vbo_dirty = true;
          }
       }
    }
@@ -160,7 +160,7 @@ nv30_update_user_vbufs(struct nv30_context *nv30)
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD,
                        0, NV30_3D_VTXBUF_DMA1);
    }
-   nv30->base.vbo_dirty = TRUE;
+   nv30->base.vbo_dirty = true;
 }
 
 static INLINE void
@@ -224,7 +224,7 @@ nv30_vbo_validate(struct nv30_context *nv30)
    for (i = 0; i < vertex->num_elements; i++) {
       struct nv04_resource *res;
       unsigned offset;
-      boolean user;
+      bool user;
 
       ve = &vertex->pipe[i];
       vb = &nv30->vtxbuf[ve->vertex_buffer_index];
@@ -262,7 +262,7 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
         return NULL;
     memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
     so->num_elements = num_elements;
-    so->need_conversion = FALSE;
+    so->need_conversion = false;
 
     transkey.nr_elements = 0;
     transkey.output_stride = 0;
@@ -285,7 +285,7 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
                 return NULL;
             }
             so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
 
         if (1) {
@@ -453,7 +453,7 @@ nv30_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nv30_draw_elements(struct nv30_context *nv30, boolean shorten,
+nv30_draw_elements(struct nv30_context *nv30, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -563,7 +563,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS)))
       nv30_update_user_vbufs(nv30);
 
-   nv30_state_validate(nv30, ~0, TRUE);
+   nv30_state_validate(nv30, ~0, true);
    if (nv30->draw_flags) {
       nv30_render_vbo(pipe, info);
       return;
@@ -577,17 +577,17 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nv30->vtxbuf[i].buffer)
          continue;
       if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv30->base.vbo_dirty = TRUE;
+         nv30->base.vbo_dirty = true;
    }
 
    if (!nv30->base.vbo_dirty && nv30->idxbuf.buffer &&
        nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nv30->base.vbo_dirty = TRUE;
+      nv30->base.vbo_dirty = true;
 
    if (nv30->base.vbo_dirty) {
       BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1);
       PUSH_DATA (push, 0);
-      nv30->base.vbo_dirty = FALSE;
+      nv30->base.vbo_dirty = false;
    }
 
    if (!info->indexed) {
@@ -595,7 +595,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
                        info->mode, info->start, info->count,
                        info->instance_count);
    } else {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart != nv30->state.prim_restart) {
          if (info->primitive_restart) {
@@ -604,7 +604,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             PUSH_DATA (push, info->restart_index);
 
             if (info->restart_index > 65535)
-               shorten = FALSE;
+               shorten = false;
          } else {
             BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 1);
             PUSH_DATA (push, 0);
@@ -616,7 +616,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
          PUSH_DATA (push, info->restart_index);
 
          if (info->restart_index > 65535)
-            shorten = FALSE;
+            shorten = false;
       }
 
       nv30_draw_elements(nv30, shorten,
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
index 4d4145d10b5..ee0a6280d7a 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
@@ -48,7 +48,7 @@ nv30_vertprog_destroy(struct nv30_vertprog *vp)
    vp->consts = NULL;
    vp->nr_consts = 0;
 
-   vp->translated = FALSE;
+   vp->translated = false;
 }
 
 void
@@ -58,8 +58,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    struct nv30_vertprog *vp = nv30->vertprog.program;
    struct nv30_fragprog *fp = nv30->fragprog.program;
-   boolean upload_code = FALSE;
-   boolean upload_data = FALSE;
+   bool upload_code = false;
+   bool upload_data = false;
    unsigned i;
 
    if (nv30->dirty & NV30_NEW_FRAGPROG) {
@@ -125,7 +125,7 @@ nv30_vertprog_validate(struct nv30_context *nv30)
          }
       }
 
-      upload_code = TRUE;
+      upload_code = true;
    }
 
    if (vp->nr_consts && !vp->data) {
@@ -166,8 +166,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
          }
       }
 
-      upload_code = TRUE;
-      upload_data = TRUE;
+      upload_code = true;
+      upload_data = true;
    }
 
    if (vp->nr_consts) {
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index 9ef16965f39..9a28e742f3a 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -442,7 +442,7 @@ tgsi_mask(uint tgsi)
    return mask;
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
             const struct tgsi_full_instruction *finst)
 {
@@ -455,7 +455,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
    int i;
 
    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
-      return TRUE;
+      return true;
 
    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
       const struct tgsi_full_src_register *fsrc;
@@ -525,7 +525,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
          break;
       default:
          NOUVEAU_ERR("bad src file\n");
-         return FALSE;
+         return false;
       }
    }
 
@@ -868,12 +868,12 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
 
         default:
       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-      return FALSE;
+      return false;
    }
 
 out:
    release_temps(fpc);
-   return TRUE;
+   return true;
 nv3x_cflow:
    {
       static int warned = 0;
@@ -887,7 +887,7 @@ nv3x_cflow:
    goto out;
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
                                const struct tgsi_full_declaration *fdec)
 {
@@ -917,17 +917,17 @@ nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
    case TGSI_SEMANTIC_GENERIC:
    case TGSI_SEMANTIC_PCOORD:
       /* will be assigned to remaining TC slots later */
-      return TRUE;
+      return true;
    default:
       assert(0);
-      return FALSE;
+      return false;
    }
 
    fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
                              const struct tgsi_full_declaration *fdec)
 {
@@ -954,16 +954,16 @@ nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
             }
             hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
             fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
-            return TRUE;
+            return true;
          }
       }
-      return FALSE;
+      return false;
    default:
-      return TRUE;
+      return true;
    }
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
             const struct tgsi_full_declaration *fdec)
 {
@@ -984,20 +984,20 @@ nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
       }
       if(hw > ((fpc->is_nv4x) ? 4 : 2)) {
          NOUVEAU_ERR("bad rcol index\n");
-         return FALSE;
+         return false;
       }
       break;
    default:
       NOUVEAU_ERR("bad output semantic\n");
-      return FALSE;
+      return false;
    }
 
    fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
    fpc->r_temps |= (1ULL << hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
 {
    struct tgsi_parse_context p;
@@ -1081,17 +1081,17 @@ nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
       fpc->r_temps_discard = 0ULL;
    }
 
-   return TRUE;
+   return true;
 
 out_err:
    FREE(fpc->r_temp);
    fpc->r_temp = NULL;
 
    tgsi_parse_free(&p);
-   return FALSE;
+   return false;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false)
 
 void
 _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
@@ -1100,7 +1100,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
    struct nvfx_fpc *fpc = NULL;
    struct util_dynarray insns;
 
-   fp->translated = FALSE;
+   fp->translated = false;
    fp->point_sprite_control = 0;
    fp->vp_or = 0;
 
@@ -1182,7 +1182,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
       debug_printf("\n");
    }
 
-   fp->translated = TRUE;
+   fp->translated = true;
 
 out:
    tgsi_parse_free(&parse);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
index 9538a793d7e..b0538911960 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
@@ -449,7 +449,7 @@ struct nvfx_insn
 };
 
 static INLINE struct nvfx_insn
-nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
+nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
 {
 	struct nvfx_insn insn = {
 		.op = op,
@@ -529,7 +529,7 @@ struct nv30_vertprog;
 void
 _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp);
 
-boolean
+bool
 _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp);
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index 1ce0589be71..dfa12b0ec08 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -455,7 +455,7 @@ tgsi_mask(uint tgsi)
    return mask;
 }
 
-static boolean
+static bool
 nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
             unsigned idx, const struct tgsi_full_instruction *finst)
 {
@@ -466,7 +466,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
    struct nvfx_insn insn;
    struct nvfx_relocation reloc;
    struct nvfx_loop_entry loop;
-   boolean sat = FALSE;
+   bool sat = false;
    int mask;
    int ai = -1, ci = -1, ii = -1;
    int i;
@@ -524,25 +524,25 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
          break;
       default:
          NOUVEAU_ERR("bad src file\n");
-         return FALSE;
+         return false;
       }
    }
 
    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
       if(src[i].reg.type < 0)
-         return FALSE;
+         return false;
    }
 
    if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
       finst->Instruction.Opcode != TGSI_OPCODE_ARL)
-      return FALSE;
+      return false;
 
    final_dst = dst  = tgsi_dst(vpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
    if(finst->Instruction.Saturate) {
       assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL);
       if (vpc->is_nv4x)
-         sat = TRUE;
+         sat = true;
       else
       if(dst.type != NVFXSR_TEMP)
          dst = temp(vpc);
@@ -793,7 +793,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
       break;
    default:
       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-      return FALSE;
+      return false;
    }
 
    if(finst->Instruction.Saturate && !vpc->is_nv4x) {
@@ -804,10 +804,10 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
    }
 
    release_temps(vpc);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
                                 const struct tgsi_full_declaration *fdec)
 {
@@ -825,7 +825,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
       vpc->r_result[idx] = temp(vpc);
       vpc->r_temps_discard = 0;
       vpc->cvtx_idx = idx;
-      return TRUE;
+      return true;
    case TGSI_SEMANTIC_COLOR:
       if (fdec->Semantic.Index == 0) {
          hw = NVFX_VP(INST_DEST_COL0);
@@ -834,7 +834,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
          hw = NVFX_VP(INST_DEST_COL1);
       } else {
          NOUVEAU_ERR("bad colour semantic index\n");
-         return FALSE;
+         return false;
       }
       break;
    case TGSI_SEMANTIC_BCOLOR:
@@ -845,7 +845,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
          hw = NVFX_VP(INST_DEST_BFC1);
       } else {
          NOUVEAU_ERR("bad bcolour semantic index\n");
-         return FALSE;
+         return false;
       }
       break;
    case TGSI_SEMANTIC_FOG:
@@ -868,22 +868,22 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
 
       if (i == num_texcoords) {
          vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
-         return TRUE;
+         return true;
       }
       break;
    case TGSI_SEMANTIC_EDGEFLAG:
       vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
-      return TRUE;
+      return true;
    default:
       NOUVEAU_ERR("bad output semantic\n");
-      return FALSE;
+      return false;
    }
 
    vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
 {
    struct tgsi_parse_context p;
@@ -924,7 +924,7 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
             break;
          case TGSI_FILE_OUTPUT:
             if (!nvfx_vertprog_parse_decl_output(vpc, fdec))
-               return FALSE;
+               return false;
             break;
          default:
             break;
@@ -961,12 +961,12 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
    }
 
    vpc->r_temps_discard = 0;
-   return TRUE;
+   return true;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", false)
 
-boolean
+bool
 _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
 {
    struct tgsi_parse_context parse;
@@ -975,13 +975,13 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
    struct util_dynarray insns;
    int i, ucps;
 
-   vp->translated = FALSE;
+   vp->translated = false;
    vp->nr_insns = 0;
    vp->nr_consts = 0;
 
    vpc = CALLOC_STRUCT(nvfx_vpc);
    if (!vpc)
-      return FALSE;
+      return false;
    vpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0;
    vpc->vp   = vp;
    vpc->pipe = vp->pipe;
@@ -990,7 +990,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
 
    if (!nvfx_vertprog_prepare(vpc)) {
       FREE(vpc);
-      return FALSE;
+      return false;
    }
 
    /* Redirect post-transform vertex position to a temp if user clip
@@ -1108,7 +1108,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
       debug_printf("\n");
    }
 
-   vp->translated = TRUE;
+   vp->translated = true;
 
 out:
    tgsi_parse_free(&parse);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
index 756c4c11bf6..de490af4557 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
@@ -191,8 +191,8 @@ nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
 # define nv50_format_table nvc0_format_table
 #endif
 
-/* return TRUE for formats that can be converted among each other by NVC0_2D */
-static INLINE boolean
+/* return true for formats that can be converted among each other by NVC0_2D */
+static INLINE bool
 nv50_2d_dst_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -201,7 +201,7 @@ nv50_2d_dst_format_faithful(enum pipe_format format)
    uint8_t id = nv50_format_table[format].rt;
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
-static INLINE boolean
+static INLINE bool
 nv50_2d_src_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -211,7 +211,7 @@ nv50_2d_src_format_faithful(enum pipe_format format)
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
 
-static INLINE boolean
+static INLINE bool
 nv50_2d_format_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
@@ -219,7 +219,7 @@ nv50_2d_format_supported(enum pipe_format format)
       (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0)));
 }
 
-static INLINE boolean
+static INLINE bool
 nv50_2d_dst_format_ops_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 5b5d3912c20..f8d46db7c67 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -64,12 +64,12 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nv50->vtxbuf[i].buffer)
             continue;
          if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nv50->base.vbo_dirty = TRUE;
+            nv50->base.vbo_dirty = true;
       }
 
       if (nv50->idxbuf.buffer &&
           nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv50->base.vbo_dirty = TRUE;
+         nv50->base.vbo_dirty = true;
 
       for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
          uint32_t valid = nv50->constbuf_valid[s];
@@ -87,7 +87,7 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags)
                continue;
 
             if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-               nv50->cb_dirty = TRUE;
+               nv50->cb_dirty = true;
          }
       }
    }
@@ -100,9 +100,9 @@ nv50_default_kick_notify(struct nouveau_pushbuf *push)
 
    if (screen) {
       nouveau_fence_next(&screen->base);
-      nouveau_fence_update(&screen->base, TRUE);
+      nouveau_fence_update(&screen->base, true);
       if (screen->cur_ctx)
-         screen->cur_ctx->state.flushed = TRUE;
+         screen->cur_ctx->state.flushed = true;
    }
 }
 
@@ -310,7 +310,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv)
    nv50->base.invalidate_resource_storage = nv50_invalidate_resource_storage;
 
    if (screen->base.device->chipset < 0x84 ||
-       debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+       debug_get_bool_option("NOUVEAU_PMPEG", false)) {
       /* PMPEG */
       nouveau_context_init_vdec(&nv50->base);
    } else if (screen->base.device->chipset < 0x98 ||
@@ -351,7 +351,7 @@ out_err:
 }
 
 void
-nv50_bufctx_fence(struct nouveau_bufctx *bufctx, boolean on_flush)
+nv50_bufctx_fence(struct nouveau_bufctx *bufctx, bool on_flush)
 {
    struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
    struct nouveau_list *it;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 1f123ef7e92..5949b4debb5 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -91,7 +91,7 @@
 
 struct nv50_blitctx;
 
-boolean nv50_blitctx_create(struct nv50_context *);
+bool nv50_blitctx_create(struct nv50_context *);
 
 struct nv50_context {
    struct nouveau_context base;
@@ -102,7 +102,7 @@ struct nv50_context {
    struct nouveau_bufctx *bufctx;
 
    uint32_t dirty;
-   boolean cb_dirty;
+   bool cb_dirty;
 
    struct nv50_graph_state state;
 
@@ -152,12 +152,12 @@ struct nv50_context {
    unsigned sample_mask;
    unsigned min_samples;
 
-   boolean vbo_push_hint;
+   bool vbo_push_hint;
 
    uint32_t rt_array_mode;
 
    struct pipe_query *cond_query;
-   boolean cond_cond; /* inverted rendering condition */
+   bool cond_cond; /* inverted rendering condition */
    uint cond_mode;
    uint32_t cond_condmode; /* the calculated condition */
 
@@ -188,7 +188,7 @@ nv50_context_shader_stage(unsigned pipe)
 /* nv50_context.c */
 struct pipe_context *nv50_create(struct pipe_screen *, void *);
 
-void nv50_bufctx_fence(struct nouveau_bufctx *, boolean on_flush);
+void nv50_bufctx_fence(struct nouveau_bufctx *, bool on_flush);
 
 void nv50_default_kick_notify(struct nouveau_pushbuf *);
 
@@ -202,7 +202,7 @@ void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
 void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
 void nva0_so_target_save_offset(struct pipe_context *,
                                 struct pipe_stream_output_target *,
-                                unsigned index, boolean seralize);
+                                unsigned index, bool seralize);
 
 #define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
@@ -221,8 +221,8 @@ extern void nv50_init_state_functions(struct nv50_context *);
 
 /* nv50_state_validate.c */
 /* @words: check for space before emitting relocs */
-extern boolean nv50_state_validate(struct nv50_context *, uint32_t state_mask,
-                                   unsigned space_words);
+extern bool nv50_state_validate(struct nv50_context *, uint32_t state_mask,
+                                unsigned space_words);
 
 /* nv50_surface.c */
 extern void nv50_clear(struct pipe_context *, unsigned buffers,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index f15d8f3ecb6..98de9664e12 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -30,7 +30,7 @@
 
 uint32_t
 nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
-                                 boolean is_3d)
+                                 bool is_3d)
 {
    uint32_t tile_mode = 0x000;
 
@@ -59,13 +59,13 @@ nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
 }
 
 static uint32_t
-nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d)
+nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d)
 {
    return nv50_tex_choose_tile_dims_helper(nx, ny * 2, nz, is_3d);
 }
 
 static uint32_t
-nv50_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
+nv50_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
 {
    const unsigned ms = util_logbase2(mt->base.base.nr_samples);
    uint32_t tile_flags;
@@ -184,7 +184,7 @@ nv50_miptree_get_handle(struct pipe_screen *pscreen,
    unsigned stride;
 
    if (!mt || !mt->base.bo)
-      return FALSE;
+      return false;
 
    stride = mt->level[0].pitch;
 
@@ -204,7 +204,7 @@ const struct u_resource_vtbl nv50_miptree_vtbl =
    u_default_transfer_inline_write  /* transfer_inline_write */
 };
 
-static INLINE boolean
+static INLINE bool
 nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -228,12 +228,12 @@ nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
       break;
    default:
       NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
-      return FALSE;
+      return false;
    }
-   return TRUE;
+   return true;
 }
 
-boolean
+bool
 nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
 {
    struct pipe_resource *pt = &mt->base.base;
@@ -241,12 +241,12 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
    unsigned h = pt->height0;
 
    if (util_format_is_depth_or_stencil(pt->format))
-      return FALSE;
+      return false;
 
    if ((pt->last_level > 0) || (pt->depth0 > 1) || (pt->array_size > 1))
-      return FALSE;
+      return false;
    if (mt->ms_x | mt->ms_y)
-      return FALSE;
+      return false;
 
    mt->level[0].pitch = align(pt->width0 * blocksize, pitch_align);
 
@@ -256,7 +256,7 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
 
    mt->total_size = mt->level[0].pitch * h;
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -335,7 +335,7 @@ nv50_miptree_create(struct pipe_screen *pscreen,
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   boolean compressed = dev->drm_version >= 0x01000101;
+   bool compressed = dev->drm_version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index aaca4c550d9..5d3fb608371 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -104,7 +104,7 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
          prog->vp.bfc[info->out[i].si] = i;
          break;
       case TGSI_SEMANTIC_LAYER:
-         prog->gp.has_layer = TRUE;
+         prog->gp.has_layer = true;
          prog->gp.layerid = n;
          break;
       case TGSI_SEMANTIC_VIEWPORT_INDEX:
@@ -316,7 +316,7 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
    return so;
 }
 
-boolean
+bool
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
    struct nv50_ir_prog_info *info;
@@ -325,7 +325,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 
    info = CALLOC_STRUCT(nv50_ir_prog_info);
    if (!info)
-      return FALSE;
+      return false;
 
    info->type = prog->type;
    info->target = chipset;
@@ -410,7 +410,7 @@ out:
    return !ret;
 }
 
-boolean
+bool
 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
 {
    struct nouveau_heap *heap;
@@ -423,7 +423,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
    default:
       assert(!"invalid program type");
-      return FALSE;
+      return false;
    }
 
    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
@@ -440,7 +440,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
       if (ret) {
          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
-         return FALSE;
+         return false;
       }
    }
    prog->code_base = prog->mem->start;
@@ -448,10 +448,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0) {
       nouveau_heap_free(&prog->mem);
-      return FALSE;
+      return false;
    }
    if (ret > 0)
-      nv50->state.new_tls_space = TRUE;
+      nv50->state.new_tls_space = true;
 
    if (prog->fixups)
       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
@@ -463,7 +463,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
    PUSH_DATA (nv50->base.pushbuf, 0);
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index fe6bd6025be..5d3ff5644d2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -53,7 +53,7 @@ struct nv50_program {
    struct pipe_shader_state pipe;
 
    ubyte type;
-   boolean translated;
+   bool translated;
 
    uint32_t *code;
    unsigned code_size;
@@ -104,8 +104,8 @@ struct nv50_program {
    struct nv50_stream_output_state *so;
 };
 
-boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
-boolean nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
+bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
+bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
 void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
 
 #endif /* __NV50_PROG_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index a3a397c52c1..2d5ac605284 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -23,7 +23,7 @@ struct push_context {
 
    struct translate *translate;
 
-   boolean primitive_restart;
+   bool primitive_restart;
    uint32_t prim;
    uint32_t restart_index;
    uint32_t instance_id;
@@ -212,7 +212,7 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
    unsigned i, index_size;
    unsigned inst_count = info->instance_count;
    unsigned vert_count = info->count;
-   boolean apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
    ctx.translate = nv50->vertex->translate;
@@ -258,12 +258,12 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
             NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
             return;
          }
-         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count);
          vert_count /= targ->stride;
       }
       ctx.idxbuf = NULL;
       index_size = 0;
-      ctx.primitive_restart = FALSE;
+      ctx.primitive_restart = false;
       ctx.restart_index = 0;
    }
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index a5b95c13958..f4c0639f233 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -48,7 +48,7 @@ struct nv50_query {
    uint32_t base;
    uint32_t offset; /* base + i * 32 */
    uint8_t state;
-   boolean is64bit;
+   bool is64bit;
    int nesting; /* only used for occlusion queries */
    struct nouveau_mm_allocation *mm;
    struct nouveau_fence *fence;
@@ -62,7 +62,7 @@ nv50_query(struct pipe_query *pipe)
    return (struct nv50_query *)pipe;
 }
 
-static boolean
+static bool
 nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
 {
    struct nv50_screen *screen = nv50->screen;
@@ -81,17 +81,17 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
    if (size) {
       q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
       if (!q->bo)
-         return FALSE;
+         return false;
       q->offset = q->base;
 
       ret = nouveau_bo_map(q->bo, 0, screen->base.client);
       if (ret) {
          nv50_query_allocate(nv50, q, 0);
-         return FALSE;
+         return false;
       }
       q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -154,8 +154,8 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    struct nv50_query *q = nv50_query(pq);
 
    /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to FALSE even *after* we re-
-    * initialized it to TRUE.
+    * query might set the initial render conition to false even *after* we re-
+    * initialized it to true.
     */
    if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
       q->offset += 32;
@@ -167,7 +167,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
        *  query ?
        */
       q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = TRUE */
+      q->data[1] = 1; /* initial render condition = true */
       q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
       q->data[5] = 0;
    }
@@ -269,7 +269,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to FALSE */
+      /* This query is not issued on GPU because disjoint is forced to false */
       q->state = NV50_QUERY_STATE_READY;
       break;
    default:
@@ -301,7 +301,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv50_query *q = nv50_query(pq);
    uint64_t *res64 = (uint64_t *)result;
    uint32_t *res32 = (uint32_t *)result;
-   boolean *res8 = (boolean *)result;
+   uint8_t *res8 = (uint8_t *)result;
    uint64_t *data64 = (uint64_t *)q->data;
    int i;
 
@@ -315,16 +315,16 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
             q->state = NV50_QUERY_STATE_FLUSHED;
             PUSH_KICK(nv50->base.pushbuf);
          }
-         return FALSE;
+         return false;
       }
       if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
-         return FALSE;
+         return false;
    }
    q->state = NV50_QUERY_STATE_READY;
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = TRUE;
+      res8[0] = true;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
       res64[0] = q->data[1] - q->data[5];
@@ -346,7 +346,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       res64[0] = 1000000000;
-      res8[8] = FALSE;
+      res8[8] = false;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
@@ -355,10 +355,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res32[0] = q->data[1];
       break;
    default:
-      return FALSE;
+      return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -385,7 +385,7 @@ nv50_render_condition(struct pipe_context *pipe,
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_query *q;
    uint32_t cond;
-   boolean wait =
+   bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
       mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;
 
@@ -399,7 +399,7 @@ nv50_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
          cond = condition ? NV50_3D_COND_MODE_EQUAL :
                             NV50_3D_COND_MODE_NOT_EQUAL;
-         wait = TRUE;
+         wait = true;
          break;
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
@@ -468,7 +468,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
 void
 nva0_so_target_save_offset(struct pipe_context *pipe,
                            struct pipe_stream_output_target *ptarg,
-                           unsigned index, boolean serialize)
+                           unsigned index, bool serialize)
 {
    struct nv50_so_target *targ = nv50_so_target(ptarg);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index f7ee1354a92..19b1a391c4f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -35,7 +35,7 @@ nv50_screen_init_resource_functions(struct pipe_screen *pscreen);
 
 uint32_t
 nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
-                                 boolean is_3d);
+                                 bool is_3d);
 
 struct nv50_miptree_level {
    uint32_t offset;
@@ -50,7 +50,7 @@ struct nv50_miptree {
    struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS];
    uint32_t total_size;
    uint32_t layer_stride;
-   boolean layout_3d; /* TRUE if layer count varies with mip level */
+   bool layout_3d; /* true if layer count varies with mip level */
    uint8_t ms_x;      /* log2 of number of samples in x/y dimension */
    uint8_t ms_y;
    uint8_t ms_mode;
@@ -70,7 +70,7 @@ nv50_miptree(struct pipe_resource *pt)
 
 /* Internal functions:
  */
-boolean
+bool
 nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align);
 
 struct pipe_resource *
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 4f99b631aae..d869544380a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -51,19 +51,19 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 8)
-      return FALSE;
+      return false;
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
-      return FALSE;
+      return false;
    if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128)
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings))
-      return FALSE;
+      return false;
 
    switch (format) {
    case PIPE_FORMAT_Z16_UNORM:
       if (nv50_screen(pscreen)->tesla->oclass < NVA0_3D_CLASS)
-         return FALSE;
+         return false;
       break;
    default:
       break;
@@ -455,7 +455,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    BEGIN_NV04(push, NV50_3D(UNK1400_LANES), 1);
    PUSH_DATA (push, 0xf);
 
-   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) {
       BEGIN_NV04(push, NV50_3D(WATCHDOG_TIMER), 1);
       PUSH_DATA (push, 0x18);
    }
@@ -735,7 +735,7 @@ nv50_screen_create(struct nouveau_device *dev)
    nv50_screen_init_resource_functions(pscreen);
 
    if (screen->base.device->chipset < 0x84 ||
-       debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+       debug_get_bool_option("NOUVEAU_PMPEG", false)) {
       /* PMPEG */
       nouveau_screen_init_vdec(&screen->base);
    } else if (screen->base.device->chipset < 0x98 ||
@@ -891,7 +891,7 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nv50_screen_init_hwctx(screen);
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index 3a12a1f066b..d9199e4478a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -32,14 +32,14 @@ struct nv50_graph_state {
    uint32_t semantic_color;
    uint32_t semantic_psize;
    int32_t index_bias;
-   boolean uniform_buffer_bound[3];
-   boolean prim_restart;
-   boolean point_sprite;
-   boolean rt_serialize;
-   boolean flushed;
-   boolean rasterizer_discard;
+   bool uniform_buffer_bound[3];
+   bool prim_restart;
+   bool point_sprite;
+   bool rt_serialize;
+   bool flushed;
+   bool rasterizer_discard;
    uint8_t tls_required;
-   boolean new_tls_space;
+   bool new_tls_space;
    uint8_t num_vtxbufs;
    uint8_t num_vtxelts;
    uint8_t num_textures[3];
@@ -103,7 +103,7 @@ nv50_screen(struct pipe_screen *screen)
    return (struct nv50_screen *)screen;
 }
 
-boolean nv50_blitter_create(struct nv50_screen *);
+bool nv50_blitter_create(struct nv50_screen *);
 void nv50_blitter_destroy(struct nv50_screen *);
 
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index c698782d8bd..1ec5642ccfc 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -60,7 +60,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                continue;
             }
             if (!nv50->state.uniform_buffer_bound[s]) {
-               nv50->state.uniform_buffer_bound[s] = TRUE;
+               nv50->state.uniform_buffer_bound[s] = true;
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
             }
@@ -104,23 +104,23 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                PUSH_DATA (push, (i << 8) | p | 0);
             }
             if (i == 0)
-               nv50->state.uniform_buffer_bound[s] = FALSE;
+               nv50->state.uniform_buffer_bound[s] = false;
          }
       }
    }
 }
 
-static boolean
+static bool
 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
 {
    if (!prog->translated) {
       prog->translated = nv50_program_translate(
          prog, nv50->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    } else
    if (prog->mem)
-      return TRUE;
+      return true;
 
    return nv50_program_upload_code(nv50, prog);
 }
@@ -136,7 +136,7 @@ nv50_program_update_context_state(struct nv50_context *nv50,
          nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
       if (!nv50->state.tls_required || nv50->state.new_tls_space)
          BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
-      nv50->state.new_tls_space = FALSE;
+      nv50->state.new_tls_space = false;
       nv50->state.tls_required |= 1 << stage;
    } else {
       if (nv50->state.tls_required == (1 << stage))
@@ -243,11 +243,11 @@ nv50_sprite_coords_validate(struct nv50_context *nv50)
          for (i = 0; i < 8; ++i)
             PUSH_DATA(push, 0);
 
-         nv50->state.point_sprite = FALSE;
+         nv50->state.point_sprite = false;
       }
       return;
    } else {
-      nv50->state.point_sprite = TRUE;
+      nv50->state.point_sprite = true;
    }
 
    memset(pntc, 0, sizeof(pntc));
@@ -646,7 +646,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
             nv50_query_pushbuf_submit(push, targ->pq, 0x4);
          } else {
             PUSH_DATA(push, 0);
-            targ->clean = FALSE;
+            targ->clean = false;
          }
       } else {
          const unsigned limit = targ->pipe.buffer_size /
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index d4d41af3c61..2bc8f4c9ae0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -116,7 +116,7 @@ nv50_blend_state_create(struct pipe_context *pipe,
 {
    struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
    int i;
-   boolean emit_common_func = cso->rt[0].blend_enable;
+   bool emit_common_func = cso->rt[0].blend_enable;
    uint32_t ms;
 
    if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
@@ -137,11 +137,11 @@ nv50_blend_state_create(struct pipe_context *pipe,
       for (i = 0; i < 8; ++i) {
          SB_DATA(so, cso->rt[i].blend_enable);
          if (cso->rt[i].blend_enable)
-            emit_common_func = TRUE;
+            emit_common_func = true;
       }
 
       if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
-         emit_common_func = FALSE;
+         emit_common_func = false;
 
          for (i = 0; i < 8; ++i) {
             if (!cso->rt[i].blend_enable)
@@ -808,7 +808,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
    pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
 
-   nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+   nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false;
    if (nv50->constbuf[s][i].user) {
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
@@ -1041,7 +1041,7 @@ nv50_so_target_create(struct pipe_context *pipe,
    } else {
       targ->pq = NULL;
    }
-   targ->clean = TRUE;
+   targ->clean = true;
 
    targ->pipe.buffer_size = size;
    targ->pipe.buffer_offset = offset;
@@ -1075,32 +1075,32 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    unsigned i;
-   boolean serialize = TRUE;
-   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+   bool serialize = true;
+   const bool can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
 
    assert(num_targets <= 4);
 
    for (i = 0; i < num_targets; ++i) {
-      const boolean changed = nv50->so_target[i] != targets[i];
-      const boolean append = (offsets[i] == (unsigned)-1);
+      const bool changed = nv50->so_target[i] != targets[i];
+      const bool append = (offsets[i] == (unsigned)-1);
       if (!changed && append)
          continue;
       nv50->so_targets_dirty |= 1 << i;
 
       if (can_resume && changed && nv50->so_target[i]) {
          nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
-         serialize = FALSE;
+         serialize = false;
       }
 
       if (targets[i] && !append)
-         nv50_so_target(targets[i])->clean = TRUE;
+         nv50_so_target(targets[i])->clean = true;
 
       pipe_so_target_reference(&nv50->so_target[i], targets[i]);
    }
    for (; i < nv50->num_so_targets; ++i) {
       if (can_resume && nv50->so_target[i]) {
          nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
-         serialize = FALSE;
+         serialize = false;
       }
       pipe_so_target_reference(&nv50->so_target[i], NULL);
       nv50->so_targets_dirty |= 1 << i;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 293f9802df8..eeec0fb6562 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -82,7 +82,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       ms_mode = mt->ms_mode;
 
       if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-         nv50->state.rt_serialize = TRUE;
+         nv50->state.rt_serialize = true;
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -111,7 +111,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       ms_mode = mt->ms_mode;
 
       if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-         nv50->state.rt_serialize = TRUE;
+         nv50->state.rt_serialize = true;
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -486,7 +486,7 @@ static struct state_validate {
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
 
-boolean
+bool
 nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words)
 {
    uint32_t state_mask;
@@ -508,19 +508,19 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words)
       nv50->dirty &= ~state_mask;
 
       if (nv50->state.rt_serialize) {
-         nv50->state.rt_serialize = FALSE;
+         nv50->state.rt_serialize = false;
          BEGIN_NV04(nv50->base.pushbuf, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
          PUSH_DATA (nv50->base.pushbuf, 0);
       }
 
-      nv50_bufctx_fence(nv50->bufctx_3d, FALSE);
+      nv50_bufctx_fence(nv50->bufctx_3d, false);
    }
    nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
    ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
 
    if (unlikely(nv50->state.flushed)) {
-      nv50->state.flushed = FALSE;
-      nv50_bufctx_fence(nv50->bufctx_3d, TRUE);
+      nv50->state.flushed = false;
+      nv50_bufctx_fence(nv50->bufctx_3d, true);
    }
    return !ret;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index eea5327b6cb..6d1d846ca3d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -41,7 +41,7 @@ struct nv50_constbuf {
    } u;
    uint32_t size; /* max 65536 */
    uint32_t offset;
-   boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+   bool user; /* should only be true if u.data is valid and non-NULL */
 };
 
 struct nv50_vertex_element {
@@ -56,7 +56,7 @@ struct nv50_vertex_stateobj {
    unsigned num_elements;
    uint32_t instance_elts;
    uint32_t instance_bufs;
-   boolean need_conversion;
+   bool need_conversion;
    unsigned vertex_size;
    unsigned packet_vertex_limit;
    struct nv50_vertex_element element[0];
@@ -66,7 +66,7 @@ struct nv50_so_target {
    struct pipe_stream_output_target pipe;
    struct pipe_query *pq;
    unsigned stride;
-   boolean clean;
+   bool clean;
 };
 
 static INLINE struct nv50_so_target *
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 66eccc25763..90106e7ac03 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -50,7 +50,7 @@
 #include "nv50/nv50_blit.h"
 
 static INLINE uint8_t
-nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nv50_format_table[format].rt;
 
@@ -76,7 +76,7 @@ nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
 static int
 nv50_2d_texture_set(struct nouveau_pushbuf *push, int dst,
                     struct nv50_miptree *mt, unsigned level, unsigned layer,
-                    enum pipe_format pformat, boolean dst_src_pformat_equal)
+                    enum pipe_format pformat, bool dst_src_pformat_equal)
 {
    struct nouveau_bo *bo = mt->base.bo;
    uint32_t width, height, depth;
@@ -153,7 +153,7 @@ nv50_2d_texture_do_copy(struct nouveau_pushbuf *push,
    const enum pipe_format dfmt = dst->base.base.format;
    const enum pipe_format sfmt = src->base.base.format;
    int ret;
-   boolean eqfmt = dfmt == sfmt;
+   bool eqfmt = dfmt == sfmt;
 
    if (!PUSH_SPACE(push, 2 * 16 + 32))
       return PIPE_ERROR;
@@ -196,7 +196,7 @@ nv50_resource_copy_region(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    int ret;
-   boolean m2mf;
+   bool m2mf;
    unsigned dst_layer = dstz, src_layer = src_box->z;
 
    if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
@@ -658,7 +658,7 @@ nv50_blitter_make_vp(struct nv50_blitter *blit)
    };
 
    blit->vp.type = PIPE_SHADER_VERTEX;
-   blit->vp.translated = TRUE;
+   blit->vp.translated = true;
    blit->vp.code = (uint32_t *)code; /* const_cast */
    blit->vp.code_size = sizeof(code);
    blit->vp.max_gpr = 4;
@@ -687,24 +687,24 @@ nv50_blitter_make_fp(struct pipe_context *pipe,
 
    const unsigned target = nv50_blit_get_tgsi_texture_target(ptarg);
 
-   boolean tex_rgbaz = FALSE;
-   boolean tex_s = FALSE;
-   boolean cvt_un8 = FALSE;
+   bool tex_rgbaz = false;
+   bool tex_s = false;
+   bool cvt_un8 = false;
 
    if (mode != NV50_BLIT_MODE_PASS &&
        mode != NV50_BLIT_MODE_Z24X8 &&
        mode != NV50_BLIT_MODE_X8Z24)
-      tex_s = TRUE;
+      tex_s = true;
 
    if (mode != NV50_BLIT_MODE_X24S8 &&
        mode != NV50_BLIT_MODE_S8X24 &&
        mode != NV50_BLIT_MODE_XS)
-      tex_rgbaz = TRUE;
+      tex_rgbaz = true;
 
    if (mode != NV50_BLIT_MODE_PASS &&
        mode != NV50_BLIT_MODE_ZS &&
        mode != NV50_BLIT_MODE_XS)
-      cvt_un8 = TRUE;
+      cvt_un8 = true;
 
    ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    if (!ureg)
@@ -1271,7 +1271,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
    int i;
    uint32_t mode;
    uint32_t mask = nv50_blit_eng2d_get_mask(info);
-   boolean b;
+   bool b;
 
    mode = nv50_blit_get_filter(info) ?
       NV50_2D_BLIT_CONTROL_FILTER_BILINEAR :
@@ -1410,7 +1410,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
          PUSH_DATA (push, srcy >> 32);
       }
    }
-   nv50_bufctx_fence(nv50->bufctx, FALSE);
+   nv50_bufctx_fence(nv50->bufctx, false);
 
    nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D);
 
@@ -1433,66 +1433,66 @@ nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
-   boolean eng3d = FALSE;
+   bool eng3d = FALSE;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
       if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
           info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
-         eng3d = TRUE;
+         eng3d = true;
       if (info->filter != PIPE_TEX_FILTER_NEAREST)
-         eng3d = TRUE;
+         eng3d = true;
    } else {
       if (!(info->mask & PIPE_MASK_RGBA))
          return;
       if (info->mask != PIPE_MASK_RGBA)
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (nv50_miptree(info->src.resource)->layout_3d) {
-      eng3d = TRUE;
+      eng3d = true;
    } else
    if (info->src.box.depth != info->dst.box.depth) {
-      eng3d = TRUE;
+      eng3d = true;
       debug_printf("blit: cannot filter array or cube textures in z direction");
    }
 
    if (!eng3d && info->dst.format != info->src.format) {
       if (!nv50_2d_dst_format_faithful(info->dst.format) ||
           !nv50_2d_src_format_faithful(info->src.format)) {
-         eng3d = TRUE;
+         eng3d = true;
       } else
       if (!nv50_2d_src_format_faithful(info->src.format)) {
          if (!util_format_is_luminance(info->src.format)) {
             if (util_format_is_intensity(info->src.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
             if (!nv50_2d_dst_format_ops_supported(info->dst.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
                eng3d = !nv50_2d_format_supported(info->src.format);
          }
       } else
       if (util_format_is_luminance_alpha(info->src.format))
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (info->src.resource->nr_samples == 8 &&
        info->dst.resource->nr_samples <= 1)
-      eng3d = TRUE;
+      eng3d = true;
 
    /* FIXME: can't make this work with eng2d anymore */
    if ((info->src.resource->nr_samples | 1) !=
        (info->dst.resource->nr_samples | 1))
-      eng3d = TRUE;
+      eng3d = true;
 
    /* FIXME: find correct src coordinate adjustments */
    if ((info->src.box.width !=  info->dst.box.width &&
         info->src.box.width != -info->dst.box.width) ||
        (info->src.box.height !=  info->dst.box.height &&
         info->src.box.height != -info->dst.box.height))
-      eng3d = TRUE;
+      eng3d = true;
 
    if (nv50->screen->num_occlusion_queries_active) {
       BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
@@ -1516,13 +1516,13 @@ nv50_flush_resource(struct pipe_context *ctx,
 {
 }
 
-boolean
+bool
 nv50_blitter_create(struct nv50_screen *screen)
 {
    screen->blitter = CALLOC_STRUCT(nv50_blitter);
    if (!screen->blitter) {
       NOUVEAU_ERR("failed to allocate blitter struct\n");
-      return FALSE;
+      return false;
    }
 
    pipe_mutex_init(screen->blitter->mutex);
@@ -1530,7 +1530,7 @@ nv50_blitter_create(struct nv50_screen *screen)
    nv50_blitter_make_vp(screen->blitter);
    nv50_blitter_make_sampler(screen->blitter);
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -1553,20 +1553,20 @@ nv50_blitter_destroy(struct nv50_screen *screen)
    FREE(blitter);
 }
 
-boolean
+bool
 nv50_blitctx_create(struct nv50_context *nv50)
 {
    nv50->blit = CALLOC_STRUCT(nv50_blitctx);
    if (!nv50->blit) {
       NOUVEAU_ERR("failed to allocate blit context\n");
-      return FALSE;
+      return false;
    }
 
    nv50->blit->nv50 = nv50;
 
    nv50->blit->rast.pipe.half_pixel_center = 1;
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index 17ae27fa85d..f6396fba9d1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -32,7 +32,7 @@
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
 static INLINE uint32_t
-nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
    case PIPE_SWIZZLE_RED:
@@ -79,7 +79,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
    uint32_t depth;
    struct nv50_tic_entry *view;
    struct nv50_miptree *mt = nv50_miptree(texture);
-   boolean tex_int;
+   bool tex_int;
 
    view = MALLOC_STRUCT(nv50_tic_entry);
    if (!view)
@@ -193,7 +193,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
       break;
    default:
       NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target);
-      return FALSE;
+      return false;
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
@@ -221,13 +221,13 @@ nv50_create_texture_view(struct pipe_context *pipe,
    return &view->pipe;
 }
 
-static boolean
+static bool
 nv50_validate_tic(struct nv50_context *nv50, int s)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nouveau_bo *txc = nv50->screen->txc;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS);
    for (i = 0; i < nv50->num_textures[s]; ++i) {
@@ -270,7 +270,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
          BEGIN_NI04(push, NV50_2D(SIFC_DATA), 8);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
@@ -316,7 +316,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
 
 void nv50_validate_textures(struct nv50_context *nv50)
 {
-   boolean need_flush;
+   bool need_flush;
 
    need_flush  = nv50_validate_tic(nv50, 0);
    need_flush |= nv50_validate_tic(nv50, 1);
@@ -328,12 +328,12 @@ void nv50_validate_textures(struct nv50_context *nv50)
    }
 }
 
-static boolean
+static bool
 nv50_validate_tsc(struct nv50_context *nv50, int s)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    assert(nv50->num_samplers[s] <= PIPE_MAX_SAMPLERS);
    for (i = 0; i < nv50->num_samplers[s]; ++i) {
@@ -350,7 +350,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s)
          nv50_sifc_linear_u8(&nv50->base, nv50->screen->txc,
                              65536 + tsc->id * 32,
                              NOUVEAU_BO_VRAM, 32, tsc->tsc);
-         need_flush = TRUE;
+         need_flush = true;
       }
       nv50->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -368,7 +368,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s)
 
 void nv50_validate_samplers(struct nv50_context *nv50)
 {
-   boolean need_flush;
+   bool need_flush;
 
    need_flush  = nv50_validate_tsc(nv50, 0);
    need_flush |= nv50_validate_tsc(nv50, 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 3d200bd65e8..95c79ef864b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -58,7 +58,7 @@ nv50_vertex_state_create(struct pipe_context *pipe,
     so->num_elements = num_elements;
     so->instance_elts = 0;
     so->instance_bufs = 0;
-    so->need_conversion = FALSE;
+    so->need_conversion = false;
 
     memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
 
@@ -89,7 +89,7 @@ nv50_vertex_state_create(struct pipe_context *pipe,
                 return NULL;
             }
             so->element[i].state = nv50_format_table[fmt].vtx;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
         so->element[i].state |= i;
 
@@ -229,7 +229,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50,
          BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART |
                       NOUVEAU_BO_RD, bo);
    }
-   nv50->base.vbo_dirty = TRUE;
+   nv50->base.vbo_dirty = true;
 }
 
 static void
@@ -275,7 +275,7 @@ nv50_update_user_vbufs(struct nv50_context *nv50)
       PUSH_DATAh(push, address[b] + ve->src_offset);
       PUSH_DATA (push, address[b] + ve->src_offset);
    }
-   nv50->base.vbo_dirty = TRUE;
+   nv50->base.vbo_dirty = true;
 }
 
 static INLINE void
@@ -316,7 +316,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
          struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer);
          if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
             buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
-            nv50->base.vbo_dirty = TRUE;
+            nv50->base.vbo_dirty = true;
             break;
          }
       }
@@ -590,7 +590,7 @@ nv50_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
+nv50_draw_elements(struct nv50_context *nv50, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -751,9 +751,9 @@ nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
 {
    struct nv50_screen *screen = chan->user_priv;
 
-   nouveau_fence_update(&screen->base, TRUE);
+   nouveau_fence_update(&screen->base, true);
 
-   nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, TRUE);
+   nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, true);
 }
 
 void
@@ -806,7 +806,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             continue;
 
          if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = TRUE;
+            nv50->cb_dirty = true;
       }
    }
 
@@ -814,7 +814,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv50->cb_dirty) {
       BEGIN_NV04(push, NV50_3D(CODE_CB_FLUSH), 1);
       PUSH_DATA (push, 0);
-      nv50->cb_dirty = FALSE;
+      nv50->cb_dirty = false;
    }
 
    if (nv50->vbo_fifo) {
@@ -835,21 +835,21 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nv50->vtxbuf[i].buffer)
          continue;
       if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = TRUE;
+         nv50->base.vbo_dirty = true;
    }
 
    if (!nv50->base.vbo_dirty && nv50->idxbuf.buffer &&
        nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nv50->base.vbo_dirty = TRUE;
+      nv50->base.vbo_dirty = true;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
       PUSH_DATA (push, 0);
-      nv50->base.vbo_dirty = FALSE;
+      nv50->base.vbo_dirty = false;
    }
 
    if (info->indexed) {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart != nv50->state.prim_restart) {
          if (info->primitive_restart) {
@@ -858,7 +858,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             PUSH_DATA (push, info->restart_index);
 
             if (info->restart_index > 65535)
-               shorten = FALSE;
+               shorten = false;
          } else {
             BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 1);
             PUSH_DATA (push, 0);
@@ -870,7 +870,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
          PUSH_DATA (push, info->restart_index);
 
          if (info->restart_index > 65535)
-            shorten = FALSE;
+            shorten = false;
       }
 
       nv50_draw_elements(nv50, shorten,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index 56fc83d3679..47bd123621b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -121,51 +121,51 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
    return 0;
 }
 
-boolean
+bool
 nvc0_compute_validate_program(struct nvc0_context *nvc0)
 {
    struct nvc0_program *prog = nvc0->compprog;
 
    if (prog->mem)
-      return TRUE;
+      return true;
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
          prog, nvc0->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    }
    if (unlikely(!prog->code_size))
-      return FALSE;
+      return false;
 
    if (likely(prog->code_size)) {
       if (nvc0_program_upload_code(nvc0, prog)) {
          struct nouveau_pushbuf *push = nvc0->base.pushbuf;
          BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
          PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE);
-         return TRUE;
+         return true;
       }
    }
-   return FALSE;
+   return false;
 }
 
-static boolean
+static bool
 nvc0_compute_state_validate(struct nvc0_context *nvc0)
 {
    if (!nvc0_compute_validate_program(nvc0))
-      return FALSE;
+      return false;
 
    /* TODO: textures, samplers, surfaces, global memory buffers */
 
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false);
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
    if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return FALSE;
+      return false;
    if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 
-   return TRUE;
+   return true;
 
 }
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
index 9a1a71760d7..168a6d1bee2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
@@ -4,7 +4,7 @@
 #include "nv50/nv50_defs.xml.h"
 #include "nvc0/nvc0_compute.xml.h"
 
-boolean
+bool
 nvc0_compute_validate_program(struct nvc0_context *nvc0);
 
 #endif /* NVC0_COMPUTE_H */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index a35c3f66142..84f8db6a8ac 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -63,12 +63,12 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nvc0->vtxbuf[i].buffer)
             continue;
          if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nvc0->base.vbo_dirty = TRUE;
+            nvc0->base.vbo_dirty = true;
       }
 
       if (nvc0->idxbuf.buffer &&
           nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nvc0->base.vbo_dirty = TRUE;
+         nvc0->base.vbo_dirty = true;
 
       for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
          uint32_t valid = nvc0->constbuf_valid[s];
@@ -86,7 +86,7 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
                continue;
 
             if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-               nvc0->cb_dirty = TRUE;
+               nvc0->cb_dirty = true;
          }
       }
    }
@@ -164,9 +164,9 @@ nvc0_default_kick_notify(struct nouveau_pushbuf *push)
 
    if (screen) {
       nouveau_fence_next(&screen->base);
-      nouveau_fence_update(&screen->base, TRUE);
+      nouveau_fence_update(&screen->base, true);
       if (screen->cur_ctx)
-         screen->cur_ctx->state.flushed = TRUE;
+         screen->cur_ctx->state.flushed = true;
       NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
    }
 }
@@ -378,7 +378,7 @@ out_err:
 
 void
 nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx,
-                  boolean on_flush)
+                  bool on_flush)
 {
    struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
    struct nouveau_list *it;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index a8d7593b398..7665991d1fd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -93,7 +93,7 @@
 
 struct nvc0_blitctx;
 
-boolean nvc0_blitctx_create(struct nvc0_context *);
+bool nvc0_blitctx_create(struct nvc0_context *);
 void nvc0_blitctx_destroy(struct nvc0_context *);
 
 struct nvc0_context {
@@ -130,7 +130,7 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
-   boolean cb_dirty;
+   bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
@@ -164,14 +164,14 @@ struct nvc0_context {
    unsigned sample_mask;
    unsigned min_samples;
 
-   boolean vbo_push_hint;
+   bool vbo_push_hint;
 
    uint8_t tfbbuf_dirty;
    struct pipe_stream_output_target *tfbbuf[4];
    unsigned num_tfbbufs;
 
    struct pipe_query *cond_query;
-   boolean cond_cond; /* inverted rendering condition */
+   bool cond_cond; /* inverted rendering condition */
    uint cond_mode;
    uint32_t cond_condmode; /* the calculated condition */
 
@@ -210,15 +210,15 @@ nvc0_shader_stage(unsigned pipe)
 /* nvc0_context.c */
 struct pipe_context *nvc0_create(struct pipe_screen *, void *);
 void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *,
-                       boolean on_flush);
+                       bool on_flush);
 void nvc0_default_kick_notify(struct nouveau_pushbuf *);
 
 /* nvc0_draw.c */
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
 
 /* nvc0_program.c */
-boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
-boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
 uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
@@ -231,7 +231,7 @@ void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *,
 void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
 void nvc0_so_target_save_offset(struct pipe_context *,
                                 struct pipe_stream_output_target *, unsigned i,
-                                boolean *serialize);
+                                bool *serialize);
 
 #define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
@@ -250,8 +250,8 @@ extern void nvc0_init_state_functions(struct nvc0_context *);
 /* nvc0_state_validate.c */
 void nvc0_validate_global_residents(struct nvc0_context *,
                                     struct nouveau_bufctx *, int bin);
-extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
-                                   unsigned space_words);
+extern bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
+                                unsigned space_words);
 
 /* nvc0_surface.c */
 extern void nvc0_clear(struct pipe_context *, unsigned buffers,
@@ -260,7 +260,7 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers,
 extern void nvc0_init_surface_functions(struct nvc0_context *);
 
 /* nvc0_tex.c */
-boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s);
+bool nve4_validate_tsc(struct nvc0_context *nvc0, int s);
 void nvc0_validate_textures(struct nvc0_context *);
 void nvc0_validate_samplers(struct nvc0_context *);
 void nve4_set_tex_handles(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index 3875bbf4ca4..75859eea63d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -29,13 +29,13 @@
 #include "nvc0/nvc0_resource.h"
 
 static uint32_t
-nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d)
+nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d)
 {
    return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d);
 }
 
 static uint32_t
-nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
+nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
 {
    const unsigned ms = util_logbase2(mt->base.base.nr_samples);
 
@@ -133,7 +133,7 @@ nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
    return tile_flags;
 }
 
-static INLINE boolean
+static INLINE bool
 nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -157,9 +157,9 @@ nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
       break;
    default:
       NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
-      return FALSE;
+      return false;
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -250,7 +250,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   boolean compressed = dev->drm_version >= 0x01000101;
+   bool compressed = dev->drm_version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index e1f5a8c4416..ccf3ecc3c5f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -523,7 +523,7 @@ nvc0_program_dump(struct nvc0_program *prog)
 }
 #endif
 
-boolean
+bool
 nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
 {
    struct nv50_ir_prog_info *info;
@@ -531,7 +531,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
 
    info = CALLOC_STRUCT(nv50_ir_prog_info);
    if (!info)
-      return FALSE;
+      return false;
 
    info->type = prog->type;
    info->target = chipset;
@@ -630,7 +630,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       assert(info->bin.tlsSpace < (1 << 24));
       prog->hdr[0] |= 1 << 26;
       prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
-      prog->need_tls = TRUE;
+      prog->need_tls = true;
    }
    /* TODO: factor 2 only needed where joinat/precont is used,
     *       and we only have to count non-uniform branches
@@ -638,7 +638,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    /*
    if ((info->maxCFDepth * 2) > 16) {
       prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
-      prog->need_tls = TRUE;
+      prog->need_tls = true;
    }
    */
    if (info->io.globalAccess)
@@ -655,11 +655,11 @@ out:
    return !ret;
 }
 
-boolean
+bool
 nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    struct nvc0_screen *screen = nvc0->screen;
-   const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE;
+   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
    int ret;
    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    uint32_t lib_pos = screen->lib_code->start;
@@ -694,7 +694,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
       if (ret) {
          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
-         return FALSE;
+         return false;
       }
       IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);
    }
@@ -729,7 +729,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
 
 #ifdef DEBUG
-   if (debug_get_bool_option("NV50_PROG_DEBUG", FALSE))
+   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
       nvc0_program_dump(prog);
 #endif
 
@@ -746,7 +746,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
    PUSH_DATA (nvc0->base.pushbuf, 0x1011);
 
-   return TRUE;
+   return true;
 }
 
 /* Upload code for builtin functions like integer division emulation. */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 3fd9d21b4c4..390e0c7a4f0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -21,8 +21,8 @@ struct nvc0_program {
    struct pipe_shader_state pipe;
 
    ubyte type;
-   boolean translated;
-   boolean need_tls;
+   bool translated;
+   bool need_tls;
    uint8_t num_gprs;
 
    uint32_t *code;
@@ -41,7 +41,7 @@ struct nvc0_program {
       uint8_t clip_enable; /* mask of defined clip planes */
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
-      boolean need_vertex_id;
+      bool need_vertex_id;
    } vp;
    struct {
       uint8_t early_z;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index aea6cbda02d..2652a4acde3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -44,7 +44,7 @@ struct nvc0_query {
    uint32_t base;
    uint32_t offset; /* base + i * rotate */
    uint8_t state;
-   boolean is64bit;
+   bool is64bit;
    uint8_t rotate;
    int nesting; /* only used for occlusion queries */
    union {
@@ -68,7 +68,7 @@ nvc0_query(struct pipe_query *pipe)
    return (struct nvc0_query *)pipe;
 }
 
-static boolean
+static bool
 nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
 {
    struct nvc0_screen *screen = nvc0->screen;
@@ -87,17 +87,17 @@ nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
    if (size) {
       q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
       if (!q->bo)
-         return FALSE;
+         return false;
       q->offset = q->base;
 
       ret = nouveau_bo_map(q->bo, 0, screen->base.client);
       if (ret) {
          nvc0_query_allocate(nvc0, q, 0);
-         return FALSE;
+         return false;
       }
       q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -126,17 +126,17 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
       space = NVC0_QUERY_ALLOC_SPACE;
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       space = 512;
       break;
    case PIPE_QUERY_SO_STATISTICS:
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       space = 64;
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       q->index = index;
       space = 32;
       break;
@@ -257,11 +257,11 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q = nvc0_query(pq);
-   boolean ret = true;
+   bool ret = true;
 
    /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to FALSE even *after* we re-
-    * initialized it to TRUE.
+    * query might set the initial render conition to false even *after* we re-
+    * initialized it to true.
     */
    if (q->rotate) {
       nvc0_query_rotate(nvc0, q);
@@ -270,7 +270,7 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
        *  query ?
        */
       q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = TRUE */
+      q->data[1] = 1; /* initial render condition = true */
       q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
       q->data[5] = 0;
    }
@@ -401,7 +401,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to FALSE */
+      /* This query is not issued on GPU because disjoint is forced to false */
       q->state = NVC0_QUERY_STATE_READY;
       break;
    default:
@@ -442,7 +442,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nvc0_query *q = nvc0_query(pq);
    uint64_t *res64 = (uint64_t*)result;
    uint32_t *res32 = (uint32_t*)result;
-   boolean *res8 = (boolean*)result;
+   uint8_t *res8 = (uint8_t*)result;
    uint64_t *data64 = (uint64_t *)q->data;
    unsigned i;
 
@@ -450,7 +450,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
        q->type <= NVC0_QUERY_DRV_STAT_LAST) {
       res64[0] = q->u.value;
-      return TRUE;
+      return true;
    } else
 #endif
    if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
@@ -468,17 +468,17 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
             /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
             PUSH_KICK(nvc0->base.pushbuf);
          }
-         return FALSE;
+         return false;
       }
       if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
-         return FALSE;
+         return false;
       NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
    }
    q->state = NVC0_QUERY_STATE_READY;
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = TRUE;
+      res8[0] = true;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
       res64[0] = q->data[1] - q->data[5];
@@ -502,7 +502,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       res64[0] = 1000000000;
-      res8[8] = FALSE;
+      res8[8] = false;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
@@ -516,10 +516,10 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    default:
       assert(0); /* can't happen, we don't create queries with invalid type */
-      return FALSE;
+      return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -549,7 +549,7 @@ nvc0_render_condition(struct pipe_context *pipe,
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q;
    uint32_t cond;
-   boolean wait =
+   bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
       mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;
 
@@ -563,7 +563,7 @@ nvc0_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
          cond = condition ? NVC0_3D_COND_MODE_EQUAL :
                           NVC0_3D_COND_MODE_NOT_EQUAL;
-         wait = TRUE;
+         wait = true;
          break;
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
@@ -626,12 +626,12 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
 void
 nvc0_so_target_save_offset(struct pipe_context *pipe,
                            struct pipe_stream_output_target *ptarg,
-                           unsigned index, boolean *serialize)
+                           unsigned index, bool *serialize)
 {
    struct nvc0_so_target *targ = nvc0_so_target(ptarg);
 
    if (*serialize) {
-      *serialize = FALSE;
+      *serialize = false;
       PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
       IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);
 
@@ -1080,7 +1080,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
    const struct nvc0_mp_pm_query_cfg *cfg;
    unsigned i, c;
    unsigned num_ab[2] = { 0, 0 };
@@ -1101,7 +1101,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
    PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);
 
    if (!screen->pm.mp_counters_enabled) {
-      screen->pm.mp_counters_enabled = TRUE;
+      screen->pm.mp_counters_enabled = true;
       BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
       PUSH_DATA (push, 0x1fcb);
    }
@@ -1168,7 +1168,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    struct nvc0_screen *screen = nvc0->screen;
    struct pipe_context *pipe = &nvc0->base.pipe;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
    uint32_t mask;
    uint32_t input[3];
    const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
@@ -1181,7 +1181,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    if (unlikely(!screen->pm.prog)) {
       struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
       prog->type = PIPE_SHADER_COMPUTE;
-      prog->translated = TRUE;
+      prog->translated = true;
       prog->num_gprs = 14;
       prog->parm_size = 12;
       if (is_nve4) {
@@ -1249,9 +1249,9 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    }
 }
 
-static INLINE boolean
+static INLINE bool
 nvc0_mp_pm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, boolean wait,
+                           struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
                            const struct nvc0_mp_pm_query_cfg *cfg,
                            unsigned mp_count)
@@ -1264,19 +1264,19 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4],
       for (c = 0; c < cfg->num_counters; ++c) {
          if (q->data[b + 8] != q->sequence) {
             if (!wait)
-               return FALSE;
+               return false;
             if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-               return FALSE;
+               return false;
          }
          count[p][c] = q->data[b + q->ctr[c]];
       }
    }
-   return TRUE;
+   return true;
 }
 
-static INLINE boolean
+static INLINE bool
 nve4_mp_pm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, boolean wait,
+                           struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
                            const struct nvc0_mp_pm_query_cfg *cfg,
                            unsigned mp_count)
@@ -1291,9 +1291,9 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4],
          for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
             if (q->data[b + 20 + d] != q->sequence) {
                if (!wait)
-                  return FALSE;
+                  return false;
                if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-                  return FALSE;
+                  return false;
             }
             if (q->ctr[c] & ~0x3)
                count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
@@ -1302,7 +1302,7 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4],
          }
       }
    }
-   return TRUE;
+   return true;
 }
 
 /* Metric calculations:
@@ -1325,7 +1325,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
    unsigned p, c;
    const struct nvc0_mp_pm_query_cfg *cfg;
-   boolean ret;
+   bool ret;
 
    cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
 
@@ -1334,7 +1334,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    else
       ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
    if (!ret)
-      return FALSE;
+      return false;
 
    if (cfg->op == NVC0_COUNTER_OPn_SUM) {
       for (c = 0; c < cfg->num_counters; ++c)
@@ -1394,7 +1394,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    }
 
    *(uint64_t *)result = value;
-   return TRUE;
+   return true;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 518968a75f9..5f12281dd6c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -44,16 +44,16 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 8)
-      return FALSE;
+      return false;
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings))
-      return FALSE;
+      return false;
 
    if ((bindings & PIPE_BIND_SAMPLER_VIEW) && (target != PIPE_BUFFER))
       if (util_format_get_blocksizebits(format) == 3 * 32)
-         return FALSE;
+         return false;
 
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
@@ -556,7 +556,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
       /* Using COMPUTE has weird effects on 3D state, we need to
        * investigate this further before enabling it by default.
        */
-      if (debug_get_bool_option("NVC0_COMPUTE", FALSE))
+      if (debug_get_bool_option("NVC0_COMPUTE", false))
          return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
       return 0;
    case 0xe0:
@@ -570,7 +570,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
    }
 }
 
-boolean
+bool
 nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
                             uint32_t lpos, uint32_t lneg, uint32_t cstack)
 {
@@ -580,7 +580,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 
    if (size >= (1 << 20)) {
       NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size);
-      return FALSE;
+      return false;
    }
 
    size *= (screen->base.device->chipset >= 0xe0) ? 64 : 48; /* max warps */
@@ -593,11 +593,11 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
                         NULL, &bo);
    if (ret) {
       NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size);
-      return FALSE;
+      return false;
    }
    nouveau_bo_ref(NULL, &screen->tls);
    screen->tls = bo;
-   return TRUE;
+   return true;
 }
 
 #define FAIL_SCREEN_INIT(str, err)                    \
@@ -791,7 +791,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    BEGIN_NVC0(push, NVC0_3D(COND_MODE), 1);
    PUSH_DATA (push, NVC0_3D_COND_MODE_ALWAYS);
 
-   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) {
       /* kill shaders after about 1 second (at 100 MHz) */
       BEGIN_NVC0(push, NVC0_3D(WATCHDOG_TIMER), 1);
       PUSH_DATA (push, 0x17);
@@ -1041,7 +1041,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!nvc0_blitter_create(screen))
       goto fail;
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index ef2bd43f006..81b641b1bf3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -28,10 +28,10 @@ struct nvc0_context;
 struct nvc0_blitter;
 
 struct nvc0_graph_state {
-   boolean flushed;
-   boolean rasterizer_discard;
-   boolean early_z_forced;
-   boolean prim_restart;
+   bool flushed;
+   bool rasterizer_discard;
+   bool early_z_forced;
+   bool prim_restart;
    uint32_t instance_elts; /* bitmask of per-instance elements */
    uint32_t instance_base;
    uint32_t constant_vbos;
@@ -95,7 +95,7 @@ struct nvc0_screen {
       struct nvc0_program *prog; /* compute state object to read MP counters */
       struct pipe_query *mp_counter[8]; /* counter to query allocation */
       uint8_t num_mp_pm_active[2];
-      boolean mp_counters_enabled;
+      bool mp_counters_enabled;
    } pm;
 
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
@@ -276,7 +276,7 @@ int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
 int nvc0_screen_get_driver_query_group_info(struct pipe_screen *, unsigned,
                                             struct pipe_driver_query_group_info *);
 
-boolean nvc0_blitter_create(struct nvc0_screen *);
+bool nvc0_blitter_create(struct nvc0_screen *);
 void nvc0_blitter_destroy(struct nvc0_screen *);
 
 void nvc0_screen_make_buffers_resident(struct nvc0_screen *);
@@ -287,8 +287,8 @@ int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *);
 int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
 int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
 
-boolean nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
-                                    uint32_t lneg, uint32_t cstack);
+bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
+                                 uint32_t lneg, uint32_t cstack);
 
 static INLINE void
 nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index e0842784a88..450e0d52a5b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -63,22 +63,22 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
    }
 }
 
-static INLINE boolean
+static INLINE bool
 nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    if (prog->mem)
-      return TRUE;
+      return true;
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
          prog, nvc0->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    }
 
    if (likely(prog->code_size))
       return nvc0_program_upload_code(nvc0, prog);
-   return TRUE; /* stream output info only */
+   return true; /* stream output info only */
 }
 
 void
@@ -192,7 +192,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
 
    /* we allow GPs with no code for specifying stream output state only */
    if (gp && gp->code_size) {
-      const boolean gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
+      const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
 
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
       PUSH_DATA (push, 0x41);
@@ -280,7 +280,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
          nvc0_query_pushbuf_submit(push, targ->pq, 0x4);
       } else {
          PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
-         targ->clean = FALSE;
+         targ->clean = false;
       }
    }
    for (; b < 4; ++b)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index d18b064a0c1..1d54151ab8d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -92,8 +92,8 @@ nvc0_blend_state_create(struct pipe_context *pipe,
    int r; /* reference */
    uint32_t ms;
    uint8_t blend_en = 0;
-   boolean indep_masks = FALSE;
-   boolean indep_funcs = FALSE;
+   bool indep_masks = false;
+   bool indep_funcs = false;
 
    so->pipe = *cso;
 
@@ -111,7 +111,7 @@ nvc0_blend_state_create(struct pipe_context *pipe,
              cso->rt[i].alpha_func != cso->rt[r].alpha_func ||
              cso->rt[i].alpha_src_factor != cso->rt[r].alpha_src_factor ||
              cso->rt[i].alpha_dst_factor != cso->rt[r].alpha_dst_factor) {
-            indep_funcs = TRUE;
+            indep_funcs = true;
             break;
          }
       }
@@ -120,7 +120,7 @@ nvc0_blend_state_create(struct pipe_context *pipe,
 
       for (i = 1; i < 8; ++i) {
          if (cso->rt[i].colormask != cso->rt[0].colormask) {
-            indep_masks = TRUE;
+            indep_masks = true;
             break;
          }
       }
@@ -790,7 +790,7 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
    pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res);
 
-   nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+   nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false;
    if (nvc0->constbuf[s][i].user) {
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
@@ -1018,7 +1018,7 @@ nvc0_so_target_create(struct pipe_context *pipe,
       FREE(targ);
       return NULL;
    }
-   targ->clean = TRUE;
+   targ->clean = true;
 
    targ->pipe.buffer_size = size;
    targ->pipe.buffer_offset = offset;
@@ -1051,13 +1051,13 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    unsigned i;
-   boolean serialize = TRUE;
+   bool serialize = true;
 
    assert(num_targets <= 4);
 
    for (i = 0; i < num_targets; ++i) {
-      const boolean changed = nvc0->tfbbuf[i] != targets[i];
-      const boolean append = (offsets[i] == ((unsigned)-1));
+      const bool changed = nvc0->tfbbuf[i] != targets[i];
+      const bool append = (offsets[i] == ((unsigned)-1));
       if (!changed && append)
          continue;
       nvc0->tfbbuf_dirty |= 1 << i;
@@ -1066,7 +1066,7 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
          nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
 
       if (targets[i] && !append)
-         nvc0_so_target(targets[i])->clean = TRUE;
+         nvc0_so_target(targets[i])->clean = true;
 
       pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]);
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 785e52e9561..b07558ac62c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -74,7 +74,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
     struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
     unsigned i, ms;
     unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
-    boolean serialize = FALSE;
+    bool serialize = false;
 
     nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
 
@@ -136,7 +136,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         }
 
         if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-           serialize = TRUE;
+           serialize = true;
         res->status |=  NOUVEAU_BUFFER_STATUS_GPU_WRITING;
         res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -168,7 +168,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         ms_mode = mt->ms_mode;
 
         if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-           serialize = TRUE;
+           serialize = true;
         mt->base.status |=  NOUVEAU_BUFFER_STATUS_GPU_WRITING;
         mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -518,12 +518,12 @@ static void
 nvc0_validate_derived_1(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   boolean rasterizer_discard;
+   bool rasterizer_discard;
 
    if (nvc0->rast && nvc0->rast->pipe.rasterizer_discard) {
-      rasterizer_discard = TRUE;
+      rasterizer_discard = true;
    } else {
-      boolean zs = nvc0->zsa &&
+      bool zs = nvc0->zsa &&
          (nvc0->zsa->pipe.depth.enabled || nvc0->zsa->pipe.stencil[0].enabled);
       rasterizer_discard = !zs &&
          (!nvc0->fragprog || !nvc0->fragprog->hdr[18]);
@@ -631,7 +631,7 @@ static struct state_validate {
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
 
-boolean
+bool
 nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words)
 {
    uint32_t state_mask;
@@ -652,15 +652,15 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words)
       }
       nvc0->dirty &= ~state_mask;
 
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, FALSE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false);
    }
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d);
    ret = nouveau_pushbuf_validate(nvc0->base.pushbuf);
 
    if (unlikely(nvc0->state.flushed)) {
-      nvc0->state.flushed = FALSE;
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, TRUE);
+      nvc0->state.flushed = false;
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true);
    }
    return !ret;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 1d70b7c7b23..4bc4780ae17 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -39,7 +39,7 @@ struct nvc0_constbuf {
    } u;
    uint32_t size;
    uint32_t offset;
-   boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+   bool user; /* should only be true if u.data is valid and non-NULL */
 };
 
 struct nvc0_vertex_element {
@@ -55,8 +55,8 @@ struct nvc0_vertex_stateobj {
    unsigned num_elements;
    uint32_t instance_elts;
    uint32_t instance_bufs;
-   boolean shared_slots;
-   boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
+   bool shared_slots;
+   bool need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
    unsigned size; /* size of vertex in bytes (when packed) */
    struct nvc0_vertex_element element[0];
 };
@@ -65,7 +65,7 @@ struct nvc0_so_target {
    struct pipe_stream_output_target pipe;
    struct pipe_query *pq;
    unsigned stride;
-   boolean clean;
+   bool clean;
 };
 
 static INLINE struct nvc0_so_target *
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 3c17f1663ed..2376e45c48e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -48,7 +48,7 @@
 #include "nv50/nv50_blit.h"
 
 static INLINE uint8_t
-nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nvc0_format_table[format].rt;
 
@@ -81,9 +81,9 @@ nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
 }
 
 static int
-nvc0_2d_texture_set(struct nouveau_pushbuf *push, boolean dst,
+nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst,
                     struct nv50_miptree *mt, unsigned level, unsigned layer,
-                    enum pipe_format pformat, boolean dst_src_pformat_equal)
+                    enum pipe_format pformat, bool dst_src_pformat_equal)
 {
    struct nouveau_bo *bo = mt->base.bo;
    uint32_t width, height, depth;
@@ -161,16 +161,16 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push,
    const enum pipe_format dfmt = dst->base.base.format;
    const enum pipe_format sfmt = src->base.base.format;
    int ret;
-   boolean eqfmt = dfmt == sfmt;
+   bool eqfmt = dfmt == sfmt;
 
    if (!PUSH_SPACE(push, 2 * 16 + 32))
       return PIPE_ERROR;
 
-   ret = nvc0_2d_texture_set(push, TRUE, dst, dst_level, dz, dfmt, eqfmt);
+   ret = nvc0_2d_texture_set(push, true, dst, dst_level, dz, dfmt, eqfmt);
    if (ret)
       return ret;
 
-   ret = nvc0_2d_texture_set(push, FALSE, src, src_level, sz, sfmt, eqfmt);
+   ret = nvc0_2d_texture_set(push, false, src, src_level, sz, sfmt, eqfmt);
    if (ret)
       return ret;
 
@@ -203,7 +203,7 @@ nvc0_resource_copy_region(struct pipe_context *pipe,
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    int ret;
-   boolean m2mf;
+   bool m2mf;
    unsigned dst_layer = dstz, src_layer = src_box->z;
 
    if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
@@ -704,7 +704,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit)
    };
 
    blit->vp.type = PIPE_SHADER_VERTEX;
-   blit->vp.translated = TRUE;
+   blit->vp.translated = true;
    if (blit->screen->base.class_3d >= GM107_3D_CLASS) {
       blit->vp.code = (uint32_t *)code_gm107; /* const_cast */
       blit->vp.code_size = sizeof(code_gm107);
@@ -1217,7 +1217,7 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
    int i;
    uint32_t mode;
    uint32_t mask = nv50_blit_eng2d_get_mask(info);
-   boolean b;
+   bool b;
 
    mode = nv50_blit_get_filter(info) ?
       NV50_2D_BLIT_CONTROL_FILTER_BILINEAR :
@@ -1377,39 +1377,39 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   boolean eng3d = FALSE;
+   bool eng3d = false;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
       if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
           info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
-         eng3d = TRUE;
+         eng3d = true;
       if (info->filter != PIPE_TEX_FILTER_NEAREST)
-         eng3d = TRUE;
+         eng3d = true;
    } else {
       if (!(info->mask & PIPE_MASK_RGBA))
          return;
       if (info->mask != PIPE_MASK_RGBA)
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (nv50_miptree(info->src.resource)->layout_3d) {
-      eng3d = TRUE;
+      eng3d = true;
    } else
    if (info->src.box.depth != info->dst.box.depth) {
-      eng3d = TRUE;
+      eng3d = true;
       debug_printf("blit: cannot filter array or cube textures in z direction");
    }
 
    if (!eng3d && info->dst.format != info->src.format) {
       if (!nv50_2d_dst_format_faithful(info->dst.format)) {
-         eng3d = TRUE;
+         eng3d = true;
       } else
       if (!nv50_2d_src_format_faithful(info->src.format)) {
          if (!util_format_is_luminance(info->src.format)) {
             if (!nv50_2d_dst_format_ops_supported(info->dst.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
             if (util_format_is_intensity(info->src.format))
                eng3d = info->src.format != PIPE_FORMAT_I8_UNORM;
@@ -1421,24 +1421,24 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
          }
       } else
       if (util_format_is_luminance_alpha(info->src.format))
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (info->src.resource->nr_samples == 8 &&
        info->dst.resource->nr_samples <= 1)
-      eng3d = TRUE;
+      eng3d = true;
 #if 0
    /* FIXME: can't make this work with eng2d anymore, at least not on nv50 */
    if (info->src.resource->nr_samples > 1 ||
        info->dst.resource->nr_samples > 1)
-      eng3d = TRUE;
+      eng3d = true;
 #endif
    /* FIXME: find correct src coordinates adjustments */
    if ((info->src.box.width !=  info->dst.box.width &&
         info->src.box.width != -info->dst.box.width) ||
        (info->src.box.height !=  info->dst.box.height &&
         info->src.box.height != -info->dst.box.height))
-      eng3d = TRUE;
+      eng3d = true;
 
    if (nvc0->screen->num_occlusion_queries_active)
       IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
@@ -1460,13 +1460,13 @@ nvc0_flush_resource(struct pipe_context *ctx,
 {
 }
 
-boolean
+bool
 nvc0_blitter_create(struct nvc0_screen *screen)
 {
    screen->blitter = CALLOC_STRUCT(nvc0_blitter);
    if (!screen->blitter) {
       NOUVEAU_ERR("failed to allocate blitter struct\n");
-      return FALSE;
+      return false;
    }
    screen->blitter->screen = screen;
 
@@ -1475,7 +1475,7 @@ nvc0_blitter_create(struct nvc0_screen *screen)
    nvc0_blitter_make_vp(screen->blitter);
    nvc0_blitter_make_sampler(screen->blitter);
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -1498,20 +1498,20 @@ nvc0_blitter_destroy(struct nvc0_screen *screen)
    FREE(blitter);
 }
 
-boolean
+bool
 nvc0_blitctx_create(struct nvc0_context *nvc0)
 {
    nvc0->blit = CALLOC_STRUCT(nvc0_blitctx);
    if (!nvc0->blit) {
       NOUVEAU_ERR("failed to allocate blit context\n");
-      return FALSE;
+      return false;
    }
 
    nvc0->blit->nvc0 = nvc0;
 
    nvc0->blit->rast.pipe.half_pixel_center = 1;
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index ddc0409ca86..7c041cbfe34 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -35,7 +35,7 @@
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
 static INLINE uint32_t
-nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
    case PIPE_SWIZZLE_RED:
@@ -82,7 +82,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    uint32_t depth;
    struct nv50_tic_entry *view;
    struct nv50_miptree *mt;
-   boolean tex_int;
+   bool tex_int;
 
    view = MALLOC_STRUCT(nv50_tic_entry);
    if (!view)
@@ -195,7 +195,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    default:
       NOUVEAU_ERR("unexpected/invalid texture target: %d\n",
                   mt->base.base.target);
-      return FALSE;
+      return false;
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
@@ -226,7 +226,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    return &view->pipe;
 }
 
-static boolean
+static bool
 nvc0_validate_tic(struct nvc0_context *nvc0, int s)
 {
    uint32_t commands[32];
@@ -234,12 +234,12 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
    struct nouveau_bo *txc = nvc0->screen->txc;
    unsigned i;
    unsigned n = 0;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          if (dirty)
@@ -263,7 +263,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
          BEGIN_NIC0(push, NVC0_M2MF(DATA), 8);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -295,18 +295,18 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
    return need_flush;
 }
 
-static boolean
+static bool
 nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_bo *txc = nvc0->screen->txc;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
@@ -328,7 +328,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
          PUSH_DATA (push, 0x1001);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -356,7 +356,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
 
 void nvc0_validate_textures(struct nvc0_context *nvc0)
 {
-   boolean need_flush;
+   bool need_flush;
 
    if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
       need_flush  = nve4_validate_tic(nvc0, 0);
@@ -374,14 +374,14 @@ void nvc0_validate_textures(struct nvc0_context *nvc0)
    }
 }
 
-static boolean
+static bool
 nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
 {
    uint32_t commands[16];
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
    unsigned n = 0;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_samplers[s]; ++i) {
       struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
@@ -398,7 +398,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
          nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc,
                                65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base),
                                32, tsc->tsc);
-         need_flush = TRUE;
+         need_flush = true;
       }
       nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -418,13 +418,13 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
    return need_flush;
 }
 
-boolean
+bool
 nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 {
    struct nouveau_bo *txc = nvc0->screen->txc;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_samplers[s]; ++i) {
       struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
@@ -447,7 +447,7 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s)
          PUSH_DATA (push, 0x1001);
          PUSH_DATAp(push, &tsc->tsc[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       }
       nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -466,7 +466,7 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 
 void nvc0_validate_samplers(struct nvc0_context *nvc0)
 {
-   boolean need_flush;
+   bool need_flush;
 
    if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
       need_flush  = nve4_validate_tsc(nvc0, 0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index 45c6f7cc3ca..d21c7cc64fb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -329,17 +329,17 @@ nve4_m2mf_copy_linear(struct nouveau_context *nv,
 }
 
 
-static INLINE boolean
+static INLINE bool
 nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt)
 {
    if (mt->base.domain == NOUVEAU_BO_VRAM)
-      return FALSE;
+      return false;
    if (mt->base.base.usage != PIPE_USAGE_STAGING)
-      return FALSE;
+      return false;
    return !nouveau_bo_memtype(mt->base.bo);
 }
 
-static INLINE boolean
+static INLINE bool
 nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
 {
    if (!mt->base.mm) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 8cf2584b0ce..8458ff07b27 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -61,8 +61,8 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
     so->num_elements = num_elements;
     so->instance_elts = 0;
     so->instance_bufs = 0;
-    so->shared_slots = FALSE;
-    so->need_conversion = FALSE;
+    so->shared_slots = false;
+    so->need_conversion = false;
 
     memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
 
@@ -93,7 +93,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
                 return NULL;
             }
             so->element[i].state = nvc0_format_table[fmt].vtx;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
         size = util_format_get_blocksize(fmt);
 
@@ -141,7 +141,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
 
     if (so->instance_elts || src_offset_max >= (1 << 14))
        return so;
-    so->shared_slots = TRUE;
+    so->shared_slots = true;
 
     for (i = 0; i < num_elements; ++i) {
        const unsigned b = elements[i].vertex_buffer_index;
@@ -265,7 +265,7 @@ nvc0_update_user_vbufs(struct nvc0_context *nvc0)
       PUSH_DATAh(push, address[b] + ve->src_offset);
       PUSH_DATA (push, address[b] + ve->src_offset);
    }
-   nvc0->base.vbo_dirty = TRUE;
+   nvc0->base.vbo_dirty = true;
 }
 
 static void
@@ -419,7 +419,7 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
    uint32_t const_vbos;
    unsigned i;
    uint8_t vbo_mode;
-   boolean update_vertex;
+   bool update_vertex;
 
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 
@@ -559,7 +559,7 @@ nvc0_draw_vbo_kick_notify(struct nouveau_pushbuf *push)
 {
    struct nvc0_screen *screen = push->user_priv;
 
-   nouveau_fence_update(&screen->base, TRUE);
+   nouveau_fence_update(&screen->base, true);
 
    NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
 }
@@ -695,7 +695,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten,
+nvc0_draw_elements(struct nvc0_context *nvc0, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -836,7 +836,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 }
 
 static INLINE void
-nvc0_update_prim_restart(struct nvc0_context *nvc0, boolean en, uint32_t index)
+nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 
@@ -910,13 +910,13 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             continue;
 
          if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = TRUE;
+            nvc0->cb_dirty = true;
       }
    }
 
    if (nvc0->cb_dirty) {
       IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
-      nvc0->cb_dirty = FALSE;
+      nvc0->cb_dirty = false;
    }
 
    if (nvc0->state.vbo_mode) {
@@ -940,19 +940,19 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nvc0->vtxbuf[i].buffer)
          continue;
       if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = TRUE;
+         nvc0->base.vbo_dirty = true;
    }
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nvc0->base.vbo_dirty = TRUE;
+      nvc0->base.vbo_dirty = true;
 
    nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);
 
    if (nvc0->base.vbo_dirty) {
       if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
-      nvc0->base.vbo_dirty = FALSE;
+      nvc0->base.vbo_dirty = false;
    }
 
    if (unlikely(info->indirect)) {
@@ -962,10 +962,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nvc0_draw_stream_output(nvc0, info);
    } else
    if (info->indexed) {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart && info->restart_index > 65535)
-         shorten = FALSE;
+         shorten = false;
 
       nvc0_draw_elements(nvc0, shorten,
                          info->mode, info->start, info->count,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index f180087161d..ff9a66e63c2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -21,12 +21,12 @@ struct push_context {
    uint32_t restart_index;
    uint32_t instance_id;
 
-   boolean prim_restart;
-   boolean need_vertex_id;
+   bool prim_restart;
+   bool need_vertex_id;
 
    struct {
-      boolean enabled;
-      boolean value;
+      bool enabled;
+      bool value;
       unsigned stride;
       const uint8_t *data;
    } edgeflag;
@@ -47,7 +47,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
    ctx->need_vertex_id =
       nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32);
 
-   ctx->edgeflag.value = TRUE;
+   ctx->edgeflag.value = true;
    ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS;
 
    /* silence warnings */
@@ -136,14 +136,14 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
    return i;
 }
 
-static INLINE boolean
+static INLINE bool
 ef_value(const struct push_context *ctx, uint32_t index)
 {
    float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
-   return *pf ? TRUE : FALSE;
+   return *pf ? true : false;
 }
 
-static INLINE boolean
+static INLINE bool
 ef_toggle(struct push_context *ctx)
 {
    ctx->edgeflag.value = !ctx->edgeflag.value;
@@ -483,7 +483,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          struct pipe_context *pipe = &nvc0->base.pipe;
          struct nvc0_so_target *targ;
          targ = nvc0_so_target(info->count_from_stream_output);
-         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count);
          vert_count /= targ->stride;
       }
       ctx.idxbuf = NULL; /* shut up warnings */
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index fce02a7cc57..2933d9b24db 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -250,7 +250,7 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
 static void
 nve4_compute_validate_samplers(struct nvc0_context *nvc0)
 {
-   boolean need_flush = nve4_validate_tsc(nvc0, 5);
+   bool need_flush = nve4_validate_tsc(nvc0, 5);
    if (need_flush) {
       BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1);
       PUSH_DATA (nvc0->base.pushbuf, 0);
@@ -299,11 +299,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 }
 
 
-static boolean
+static bool
 nve4_compute_state_validate(struct nvc0_context *nvc0)
 {
    if (!nvc0_compute_validate_program(nvc0))
-      return FALSE;
+      return false;
    if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES)
       nve4_compute_validate_textures(nvc0);
    if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS)
@@ -316,15 +316,15 @@ nve4_compute_state_validate(struct nvc0_context *nvc0)
       nvc0_validate_global_residents(nvc0,
                                      nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
 
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false);
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
    if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return FALSE;
+      return false;
    if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 
-   return TRUE;
+   return true;
 }
 
 
@@ -505,7 +505,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
@@ -575,18 +575,18 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 {
    const uint32_t *data = (const uint32_t *)desc;
    unsigned i;
-   boolean zero = FALSE;
+   bool zero = false;
 
    debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
 
    for (i = 0; i < sizeof(*desc); i += 4) {
       if (data[i / 4]) {
          debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
-         zero = FALSE;
+         zero = false;
       } else
       if (!zero) {
          debug_printf("...\n");
-         zero = TRUE;
+         zero = true;
       }
    }
 
@@ -606,7 +606,7 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
    for (i = 0; i < 8; ++i) {
       uint64_t address;
       uint32_t size = desc->cb[i].size;
-      boolean valid = !!(desc->cb_mask & (1 << i));
+      bool valid = !!(desc->cb_mask & (1 << i));
 
       address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
 
diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index 5a4c256539d..c6603e38a00 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -17,7 +17,7 @@ static struct util_hash_table *fd_tab = NULL;
 
 pipe_static_mutex(nouveau_screen_mutex);
 
-boolean nouveau_drm_screen_unref(struct nouveau_screen *screen)
+bool nouveau_drm_screen_unref(struct nouveau_screen *screen)
 {
 	int ret;
 	if (screen->refcount == -1)

From 2f11e92cef51c88a09bc778e2ceca4ab50cf0017 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sat, 18 Jul 2015 01:22:00 -0700
Subject: [PATCH 0395/1208] mesa: Rename _mesa_lookup_enum_by_nr() to
 _mesa_enum_to_string().

Generated by sed; no manual changes.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mapi/glapi/gen/gl_enums.py                |   2 +-
 src/mesa/drivers/common/meta_blit.c           |   2 +-
 .../drivers/common/meta_generate_mipmap.c     |   2 +-
 src/mesa/drivers/dri/i915/i830_state.c        |  20 +--
 src/mesa/drivers/dri/i915/intel_fbo.c         |   2 +-
 src/mesa/drivers/dri/i915/intel_mipmap_tree.c |   2 +-
 src/mesa/drivers/dri/i915/intel_render.c      |   2 +-
 src/mesa/drivers/dri/i915/intel_tex_image.c   |   2 +-
 .../drivers/dri/i915/intel_tex_subimage.c     |   2 +-
 src/mesa/drivers/dri/i915/intel_tris.c        |   4 +-
 src/mesa/drivers/dri/i965/brw_draw.c          |   8 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c   |   2 +-
 src/mesa/drivers/dri/i965/gen6_cc.c           |   4 +-
 src/mesa/drivers/dri/i965/intel_fbo.c         |   2 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c |   2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c   |   4 +-
 .../drivers/dri/i965/intel_tex_subimage.c     |   4 +-
 src/mesa/drivers/dri/r200/r200_state.c        |   2 +-
 src/mesa/drivers/dri/r200/r200_tex.c          |  18 +--
 src/mesa/drivers/dri/radeon/radeon_common.c   |   2 +-
 src/mesa/drivers/dri/radeon/radeon_fbo.c      |   2 +-
 .../drivers/dri/radeon/radeon_mipmap_tree.c   |   2 +-
 .../drivers/dri/radeon/radeon_pixel_read.c    |   2 +-
 src/mesa/drivers/dri/radeon/radeon_state.c    |   2 +-
 src/mesa/drivers/dri/radeon/radeon_swtcl.c    |   2 +-
 src/mesa/drivers/dri/radeon/radeon_tex.c      |   6 +-
 src/mesa/drivers/dri/radeon/radeon_texture.c  |   4 +-
 src/mesa/main/api_validate.c                  |   2 +-
 src/mesa/main/atifragshader.c                 |  30 ++---
 src/mesa/main/attrib.c                        |   2 +-
 src/mesa/main/blend.c                         |  34 +++---
 src/mesa/main/blit.c                          |   8 +-
 src/mesa/main/bufferobj.c                     |  14 +--
 src/mesa/main/buffers.c                       |  22 ++--
 src/mesa/main/clear.c                         |   8 +-
 src/mesa/main/condrender.c                    |   4 +-
 src/mesa/main/copyimage.c                     |   8 +-
 src/mesa/main/debug.c                         |   2 +-
 src/mesa/main/depth.c                         |   2 +-
 src/mesa/main/dlist.c                         |  14 +--
 src/mesa/main/drawpix.c                       |  18 +--
 src/mesa/main/enable.c                        |  12 +-
 src/mesa/main/enums.h                         |   2 +-
 src/mesa/main/errors.c                        |   4 +-
 src/mesa/main/fbobject.c                      |  72 +++++------
 src/mesa/main/feedback.c                      |   2 +-
 src/mesa/main/formatquery.c                   |   8 +-
 src/mesa/main/framebuffer.c                   |   2 +-
 src/mesa/main/genmipmap.c                     |   2 +-
 src/mesa/main/get.c                           |   8 +-
 src/mesa/main/getstring.c                     |   4 +-
 src/mesa/main/glformats.c                     |   6 +-
 src/mesa/main/hint.c                          |   4 +-
 src/mesa/main/light.c                         |   6 +-
 src/mesa/main/matrix.c                        |   8 +-
 src/mesa/main/objectlabel.c                   |   2 +-
 src/mesa/main/pipelineobj.c                   |   2 +-
 src/mesa/main/polygon.c                       |   8 +-
 src/mesa/main/program_resource.c              |  20 +--
 src/mesa/main/queryobj.c                      |  28 ++---
 src/mesa/main/readpix.c                       |  12 +-
 src/mesa/main/samplerobj.c                    |  20 +--
 src/mesa/main/shader_query.cpp                |  14 +--
 src/mesa/main/shaderapi.c                     |   8 +-
 src/mesa/main/shaderimage.c                   |   2 +-
 src/mesa/main/tests/enum_strings.cpp          |   4 +-
 src/mesa/main/texenv.c                        |  10 +-
 src/mesa/main/texformat.c                     |   2 +-
 src/mesa/main/texgen.c                        |   6 +-
 src/mesa/main/texgetimage.c                   |   4 +-
 src/mesa/main/teximage.c                      | 114 +++++++++---------
 src/mesa/main/texobj.c                        |   6 +-
 src/mesa/main/texparam.c                      |  24 ++--
 src/mesa/main/texstate.c                      |  36 +++---
 src/mesa/main/texstate.h                      |   2 +-
 src/mesa/main/texstorage.c                    |  16 +--
 src/mesa/main/textureview.c                   |  10 +-
 src/mesa/main/uniforms.c                      |   2 +-
 src/mesa/main/varray.c                        |   6 +-
 src/mesa/main/viewport.c                      |   4 +-
 src/mesa/tnl/t_draw.c                         |   2 +-
 src/mesa/tnl/t_vb_render.c                    |   2 +-
 src/mesa/tnl/t_vertex_sse.c                   |   2 +-
 src/mesa/tnl_dd/t_dd_dmatmp.h                 |   2 +-
 src/mesa/tnl_dd/t_dd_unfilled.h               |   2 +-
 src/mesa/vbo/vbo_exec_array.c                 |  58 ++++-----
 86 files changed, 423 insertions(+), 423 deletions(-)

diff --git a/src/mapi/glapi/gen/gl_enums.py b/src/mapi/glapi/gen/gl_enums.py
index 955f27d0818..a2d6c7ca2fd 100644
--- a/src/mapi/glapi/gen/gl_enums.py
+++ b/src/mapi/glapi/gen/gl_enums.py
@@ -78,7 +78,7 @@ static int compar_nr( const int *a, enum_elt *b )
 
 static char token_tmp[20];
 
-const char *_mesa_lookup_enum_by_nr( int nr )
+const char *_mesa_enum_to_string( int nr )
 {
    enum_elt *elt;
 
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 9cace2b245a..317a304bebf 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -312,7 +312,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
       break;
    default:
       _mesa_problem(ctx, "Unkown texture target %s\n",
-                    _mesa_lookup_enum_by_nr(target));
+                    _mesa_enum_to_string(target));
       shader_index = BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_RESOLVE;
    }
 
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index c1b6d3c1f86..f764ebad718 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -66,7 +66,7 @@ fallback_required(struct gl_context *ctx, GLenum target,
    if (target == GL_TEXTURE_3D) {
       _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
                        "glGenerateMipmap() to %s target\n",
-                       _mesa_lookup_enum_by_nr(target));
+                       _mesa_enum_to_string(target));
       return true;
    }
 
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index ea54e2b25b1..906e942b020 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -57,7 +57,7 @@ i830StencilFuncSeparate(struct gl_context * ctx, GLenum face, GLenum func, GLint
    mask = mask & 0xff;
 
    DBG("%s : func: %s, ref : 0x%x, mask: 0x%x\n", __func__,
-       _mesa_lookup_enum_by_nr(func), ref, mask);
+       _mesa_enum_to_string(func), ref, mask);
 
 
    I830_STATECHANGE(i830, I830_UPLOAD_CTX);
@@ -95,9 +95,9 @@ i830StencilOpSeparate(struct gl_context * ctx, GLenum face, GLenum fail, GLenum
    int fop, dfop, dpop;
 
    DBG("%s: fail : %s, zfail: %s, zpass : %s\n", __func__,
-       _mesa_lookup_enum_by_nr(fail),
-       _mesa_lookup_enum_by_nr(zfail), 
-       _mesa_lookup_enum_by_nr(zpass));
+       _mesa_enum_to_string(fail),
+       _mesa_enum_to_string(zfail), 
+       _mesa_enum_to_string(zpass));
 
    fop = 0;
    dfop = 0;
@@ -389,8 +389,8 @@ static void
 i830BlendEquationSeparate(struct gl_context * ctx, GLenum modeRGB, GLenum modeA)
 {
    DBG("%s -> %s, %s\n", __func__,
-       _mesa_lookup_enum_by_nr(modeRGB),
-       _mesa_lookup_enum_by_nr(modeA));
+       _mesa_enum_to_string(modeRGB),
+       _mesa_enum_to_string(modeA));
 
    (void) modeRGB;
    (void) modeA;
@@ -403,10 +403,10 @@ i830BlendFuncSeparate(struct gl_context * ctx, GLenum sfactorRGB,
                       GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA)
 {
    DBG("%s -> RGB(%s, %s) A(%s, %s)\n", __func__,
-       _mesa_lookup_enum_by_nr(sfactorRGB),
-       _mesa_lookup_enum_by_nr(dfactorRGB),
-       _mesa_lookup_enum_by_nr(sfactorA),
-       _mesa_lookup_enum_by_nr(dfactorA));
+       _mesa_enum_to_string(sfactorRGB),
+       _mesa_enum_to_string(dfactorRGB),
+       _mesa_enum_to_string(sfactorA),
+       _mesa_enum_to_string(dfactorA));
 
    (void) sfactorRGB;
    (void) dfactorRGB;
diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c
index a5d5c5832fb..67013666377 100644
--- a/src/mesa/drivers/dri/i915/intel_fbo.c
+++ b/src/mesa/drivers/dri/i915/intel_fbo.c
@@ -216,7 +216,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend
    intel_miptree_release(&irb->mt);
 
    DBG("%s: %s: %s (%dx%d)\n", __func__,
-       _mesa_lookup_enum_by_nr(internalFormat),
+       _mesa_enum_to_string(internalFormat),
        _mesa_get_format_name(rb->Format), width, height);
 
    if (width == 0 || height == 0)
diff --git a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
index e56b9859377..1aa06c18f15 100644
--- a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
@@ -81,7 +81,7 @@ intel_miptree_create_layout(struct intel_context *intel,
       return NULL;
 
    DBG("%s target %s format %s level %d..%d <-- %p\n", __func__,
-       _mesa_lookup_enum_by_nr(target),
+       _mesa_enum_to_string(target),
        _mesa_get_format_name(format),
        first_level, last_level, mt);
 
diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c
index 0b0d48e1663..5962dad7d11 100644
--- a/src/mesa/drivers/dri/i915/intel_render.c
+++ b/src/mesa/drivers/dri/i915/intel_render.c
@@ -113,7 +113,7 @@ static void
 intelDmaPrimitive(struct intel_context *intel, GLenum prim)
 {
    if (0)
-      fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim));
+      fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim));
    INTEL_FIREVERTICES(intel);
    intel->vtbl.reduced_primitive_state(intel, reduced_prim[prim]);
    intel_set_prim(intel, hw_prim[prim]);
diff --git a/src/mesa/drivers/dri/i915/intel_tex_image.c b/src/mesa/drivers/dri/i915/intel_tex_image.c
index 01de966a134..0a213e9f614 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_image.c
@@ -189,7 +189,7 @@ intelTexImage(struct gl_context * ctx,
               const struct gl_pixelstore_attrib *unpack)
 {
    DBG("%s target %s level %d %dx%dx%d\n", __func__,
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
+       _mesa_enum_to_string(texImage->TexObject->Target),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    /* Attempt to use the blitter for PBO image uploads.
diff --git a/src/mesa/drivers/dri/i915/intel_tex_subimage.c b/src/mesa/drivers/dri/i915/intel_tex_subimage.c
index 2e02d50f13f..f11ef2ea329 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_subimage.c
@@ -72,7 +72,7 @@ intel_blit_texsubimage(struct gl_context * ctx,
 
    DBG("BLT subimage %s target %s level %d offset %d,%d %dx%d\n",
        __func__,
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
+       _mesa_enum_to_string(texImage->TexObject->Target),
        texImage->Level, xoffset, yoffset, width, height);
 
    pixels = _mesa_validate_pbo_teximage(ctx, 2, width, height, 1,
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index 144f0fc911a..ae62a800fb7 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -1134,7 +1134,7 @@ intelRasterPrimitive(struct gl_context * ctx, GLenum rprim, GLuint hwprim)
 
    if (0)
       fprintf(stderr, "%s %s %x\n", __func__,
-              _mesa_lookup_enum_by_nr(rprim), hwprim);
+              _mesa_enum_to_string(rprim), hwprim);
 
    intel->vtbl.reduced_primitive_state(intel, rprim);
 
@@ -1158,7 +1158,7 @@ intelRenderPrimitive(struct gl_context * ctx, GLenum prim)
                          ctx->Polygon.BackMode != GL_FILL);
 
    if (0)
-      fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim));
+      fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim));
 
    /* Let some clipping routines know which primitive they're dealing
     * with.
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index ec13473798c..b5ef276edfc 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -110,7 +110,7 @@ static void brw_set_prim(struct brw_context *brw,
    struct gl_context *ctx = &brw->ctx;
    uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
 
-   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
+   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
    /* Slight optimization to avoid the GS program when not needed:
     */
@@ -143,7 +143,7 @@ static void gen6_set_prim(struct brw_context *brw,
 {
    uint32_t hw_prim;
 
-   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
+   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
    hw_prim = get_hw_prim_for_gl_prim(prim->mode);
 
@@ -182,7 +182,7 @@ static void brw_emit_prim(struct brw_context *brw,
    int indirect_flag;
    int predicate_enable;
 
-   DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
+   DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
        prim->start, prim->count);
 
    int start_vertex_location = prim->start;
@@ -582,7 +582,7 @@ void brw_draw_prims( struct gl_context *ctx,
     */
    if (ctx->RenderMode != GL_RENDER) {
       perf_debug("%s render mode not supported in hardware\n",
-                 _mesa_lookup_enum_by_nr(ctx->RenderMode));
+                 _mesa_enum_to_string(ctx->RenderMode));
       _swsetup_Wakeup(ctx);
       _tnl_wakeup(ctx);
       _tnl_draw_prims(ctx, prims, nr_prims, ib,
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index c95f0c37f89..c6dd69bbd29 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -230,7 +230,7 @@ brw_get_vertex_surface_type(struct brw_context *brw,
 
    if (unlikely(INTEL_DEBUG & DEBUG_VERTS))
       fprintf(stderr, "type %s size %d normalized %d\n",
-              _mesa_lookup_enum_by_nr(glarray->Type),
+              _mesa_enum_to_string(glarray->Type),
               glarray->Size, glarray->Normalized);
 
    if (glarray->Integer) {
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 2b76e241d1a..3bab8f46ae8 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -97,8 +97,8 @@ gen6_upload_blend_state(struct brw_context *brw)
                    rb_type != GL_UNSIGNED_NORMALIZED &&
                    rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
                    "renderbuffer\n",
-                   _mesa_lookup_enum_by_nr(ctx->Color.LogicOp),
-                   _mesa_lookup_enum_by_nr(rb_type));
+                   _mesa_enum_to_string(ctx->Color.LogicOp),
+                   _mesa_enum_to_string(rb_type));
 	 if (rb_type == GL_UNSIGNED_NORMALIZED) {
 	    blend[b].blend1.logic_op_enable = 1;
 	    blend[b].blend1.logic_op_func =
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 26f895bf904..87ba624b70d 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -310,7 +310,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend
    intel_miptree_release(&irb->mt);
 
    DBG("%s: %s: %s (%dx%d)\n", __func__,
-       _mesa_lookup_enum_by_nr(internalFormat),
+       _mesa_enum_to_string(internalFormat),
        _mesa_get_format_name(rb->Format), width, height);
 
    if (width == 0 || height == 0)
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 3f22a87a742..5e7859cf990 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -279,7 +279,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       return NULL;
 
    DBG("%s target %s format %s level %d..%d slices %d <-- %p\n", __func__,
-       _mesa_lookup_enum_by_nr(target),
+       _mesa_enum_to_string(target),
        _mesa_get_format_name(format),
        first_level, last_level, depth0, mt);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 226aaeb4d54..536cc406846 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -97,8 +97,8 @@ intelTexImage(struct gl_context * ctx,
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
-       _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type),
+       _mesa_enum_to_string(texImage->TexObject->Target),
+       _mesa_enum_to_string(format), _mesa_enum_to_string(type),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    /* Allocate storage for texture data. */
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 7507f7669a0..31e511f0b7b 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -206,8 +206,8 @@ intelTexSubImage(struct gl_context * ctx,
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
-       _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type),
+       _mesa_enum_to_string(texImage->TexObject->Target),
+       _mesa_enum_to_string(format), _mesa_enum_to_string(type),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage,
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 6fe70b5c9d0..2023ba09449 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -1669,7 +1669,7 @@ static void r200Enable( struct gl_context *ctx, GLenum cap, GLboolean state )
 
    if ( R200_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( cap ),
+	       _mesa_enum_to_string( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
 
    switch ( cap ) {
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 083a1840d9e..feee0b2ba3f 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -68,9 +68,9 @@ static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenu
    radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s(tex %p) sw %s, tw %s, rw %s\n",
 		__func__, t,
-		_mesa_lookup_enum_by_nr(swrap),
-		_mesa_lookup_enum_by_nr(twrap),
-		_mesa_lookup_enum_by_nr(rwrap));
+		_mesa_enum_to_string(swrap),
+		_mesa_enum_to_string(twrap),
+		_mesa_enum_to_string(rwrap));
 
    t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D);
 
@@ -225,8 +225,8 @@ static void r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
    radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 	"%s(tex %p) minf %s, maxf %s, anisotropy %d.\n",
 	__func__, t,
-	_mesa_lookup_enum_by_nr(minf),
-	_mesa_lookup_enum_by_nr(magf),
+	_mesa_enum_to_string(minf),
+	_mesa_enum_to_string(magf),
 	anisotropy);
 
    if ( anisotropy == R200_MAX_ANISO_1_TO_1 ) {
@@ -302,7 +302,7 @@ static void r200TexEnv( struct gl_context *ctx, GLenum target,
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE, "%s( %s )\n",
-	       __func__, _mesa_lookup_enum_by_nr( pname ) );
+	       __func__, _mesa_enum_to_string( pname ) );
 
    /* This is incorrect: Need to maintain this data for each of
     * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch
@@ -384,7 +384,7 @@ static void r200TexParameter( struct gl_context *ctx,
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE,
 		"%s(%p, tex %p)  pname %s\n",
 		__func__, ctx, texObj,
-	       _mesa_lookup_enum_by_nr( pname ) );
+	       _mesa_enum_to_string( pname ) );
 
    switch ( pname ) {
    case GL_TEXTURE_MIN_FILTER:
@@ -415,7 +415,7 @@ static void r200DeleteTexture(struct gl_context * ctx, struct gl_texture_object
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_NORMAL,
            "%s( %p (target = %s) )\n", __func__,
 	   (void *)texObj,
-	   _mesa_lookup_enum_by_nr(texObj->Target));
+	   _mesa_enum_to_string(texObj->Target));
 
    if (rmesa) {
       int i;
@@ -473,7 +473,7 @@ static struct gl_texture_object *r200NewTextureObject(struct gl_context * ctx,
    radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_NORMAL,
            "%s(%p) target %s, new texture %p.\n",
 	   __func__, ctx,
-	   _mesa_lookup_enum_by_nr(target), t);
+	   _mesa_enum_to_string(target), t);
 
    _mesa_initialize_texture_object(ctx, &t->base, name, target);
    t->base.Sampler.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index d834d9bde3f..fde89214ed2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -343,7 +343,7 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
 {
 	if (RADEON_DEBUG & RADEON_DRI)
 		fprintf(stderr, "%s %s\n", __func__,
-			_mesa_lookup_enum_by_nr( mode ));
+			_mesa_enum_to_string( mode ));
 
 	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index ef62d097bae..5e4aaca1e0e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -700,7 +700,7 @@ radeon_bind_framebuffer(struct gl_context * ctx, GLenum target,
   radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s(%p, fb %p, target %s) \n",
 		__func__, ctx, fb,
-		_mesa_lookup_enum_by_nr(target));
+		_mesa_enum_to_string(target));
 
    if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
       radeon_draw_buffer(ctx, fb);
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index 28591cad895..c71766d0a3e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -276,7 +276,7 @@ static void calculate_min_max_lod(struct gl_sampler_object *samp, struct gl_text
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 			"%s(%p) target %s, min %d, max %d.\n",
 			__func__, tObj,
-			_mesa_lookup_enum_by_nr(tObj->Target),
+			_mesa_enum_to_string(tObj->Target),
 			minLod, maxLod);
 
 	/* save these values */
diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
index 6998444fb66..e115b749da5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
+++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
@@ -212,7 +212,7 @@ radeonReadPixels(struct gl_context * ctx,
      */
     radeon_print(RADEON_FALLBACKS, RADEON_NORMAL,
                  "Falling back to sw for ReadPixels (format %s, type %s)\n",
-                 _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type));
+                 _mesa_enum_to_string(format), _mesa_enum_to_string(type));
 
     if (ctx->NewState)
         _mesa_update_state(ctx);
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index cba3d9c9689..156afb6f6ac 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -1452,7 +1452,7 @@ static void radeonEnable( struct gl_context *ctx, GLenum cap, GLboolean state )
 
    if ( RADEON_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( cap ),
+	       _mesa_enum_to_string( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
 
    switch ( cap ) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index 8a1fbab39f8..2fbd353297b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -442,7 +442,7 @@ static GLboolean radeon_run_render( struct gl_context *ctx,
 
       radeon_print(RADEON_SWRENDER, RADEON_NORMAL,
 	  "radeon_render.c: prim %s %d..%d\n",
-		 _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+		 _mesa_enum_to_string(prim & PRIM_MODE_MASK), 
 		 start, start+length);
 
       if (length)
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index 353fdb00ec8..0955a135de8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -263,7 +263,7 @@ static void radeonTexEnv( struct gl_context *ctx, GLenum target,
 
    if ( RADEON_DEBUG & RADEON_STATE ) {
       fprintf( stderr, "%s( %s )\n",
-	       __func__, _mesa_lookup_enum_by_nr( pname ) );
+	       __func__, _mesa_enum_to_string( pname ) );
    }
 
    switch ( pname ) {
@@ -335,7 +335,7 @@ static void radeonTexParameter( struct gl_context *ctx,
    radeonTexObj* t = radeon_tex_obj(texObj);
 
    radeon_print(RADEON_TEXTURE, RADEON_VERBOSE, "%s( %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( pname ) );
+	       _mesa_enum_to_string( pname ) );
 
    switch ( pname ) {
    case GL_TEXTURE_BASE_LEVEL:
@@ -359,7 +359,7 @@ static void radeonDeleteTexture( struct gl_context *ctx,
 
    radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
 	 "%s( %p (target = %s) )\n", __func__, (void *)texObj,
-	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+	       _mesa_enum_to_string( texObj->Target ) );
 
    if ( rmesa ) {
      radeon_firevertices(&rmesa->radeon);
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index d05c870128c..4794ddae069 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -279,8 +279,8 @@ mesa_format radeonChooseTextureFormat(struct gl_context * ctx,
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s InternalFormat=%s(%d) type=%s format=%s\n",
 		__func__,
-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+		_mesa_enum_to_string(internalFormat), internalFormat,
+		_mesa_enum_to_string(type), _mesa_enum_to_string(format));
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 			"%s do32bpt=%d force16bpt=%d\n",
 			__func__, do32bpt, force16bpt);
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index 9c2e29e6472..21ae07b9d50 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -291,7 +291,7 @@ valid_elements_type(struct gl_context *ctx, GLenum type, const char *name)
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)", name,
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(type));
       return false;
    }
 }
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 9fc35520a38..935ba05b7cc 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -132,21 +132,21 @@ static void debug_op(GLint optype, GLuint arg_count, GLenum op, GLuint dst,
 
   op_name = atifs_ops[(arg_count-1)+(optype?3:0)];
   
-  fprintf(stderr, "%s(%s, %s", op_name, _mesa_lookup_enum_by_nr(op),
-	      _mesa_lookup_enum_by_nr(dst));
+  fprintf(stderr, "%s(%s, %s", op_name, _mesa_enum_to_string(op),
+	      _mesa_enum_to_string(dst));
   if (!optype)
     fprintf(stderr, ", %d", dstMask);
   
   fprintf(stderr, ", %s", create_dst_mod_str(dstMod));
   
-  fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg1),
-	      _mesa_lookup_enum_by_nr(arg1Rep), arg1Mod);
+  fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg1),
+	      _mesa_enum_to_string(arg1Rep), arg1Mod);
   if (arg_count>1)
-    fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg2),
-	      _mesa_lookup_enum_by_nr(arg2Rep), arg2Mod);
+    fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg2),
+	      _mesa_enum_to_string(arg2Rep), arg2Mod);
   if (arg_count>2)
-    fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg3),
-	      _mesa_lookup_enum_by_nr(arg3Rep), arg3Mod);
+    fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg3),
+	      _mesa_enum_to_string(arg3Rep), arg3Mod);
 
   fprintf(stderr,")\n");
 
@@ -383,7 +383,7 @@ _mesa_EndFragmentShaderATI(void)
    for (j = 0; j < MAX_NUM_PASSES_ATI; j++) {
       for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) {
 	 GLuint op = curProg->SetupInst[j][i].Opcode;
-	 const char *op_enum = op > 5 ? _mesa_lookup_enum_by_nr(op) : "0";
+	 const char *op_enum = op > 5 ? _mesa_enum_to_string(op) : "0";
 	 GLuint src = curProg->SetupInst[j][i].src;
 	 GLuint swizzle = curProg->SetupInst[j][i].swizzle;
 	 fprintf(stderr, "%2d %04X %s %d %04X\n", i, op, op_enum, src,
@@ -392,8 +392,8 @@ _mesa_EndFragmentShaderATI(void)
       for (i = 0; i < curProg->numArithInstr[j]; i++) {
 	 GLuint op0 = curProg->Instructions[j][i].Opcode[0];
 	 GLuint op1 = curProg->Instructions[j][i].Opcode[1];
-	 const char *op0_enum = op0 > 5 ? _mesa_lookup_enum_by_nr(op0) : "0";
-	 const char *op1_enum = op1 > 5 ? _mesa_lookup_enum_by_nr(op1) : "0";
+	 const char *op0_enum = op0 > 5 ? _mesa_enum_to_string(op0) : "0";
+	 const char *op1_enum = op1 > 5 ? _mesa_enum_to_string(op1) : "0";
 	 GLuint count0 = curProg->Instructions[j][i].ArgCount[0];
 	 GLuint count1 = curProg->Instructions[j][i].ArgCount[1];
 	 fprintf(stderr, "%2d %04X %s %d %04X %s %d\n", i, op0, op0_enum, count0,
@@ -477,8 +477,8 @@ _mesa_PassTexCoordATI(GLuint dst, GLuint coord, GLenum swizzle)
 
 #if MESA_DEBUG_ATI_FS
    _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__,
-	       _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(coord),
-	       _mesa_lookup_enum_by_nr(swizzle));
+	       _mesa_enum_to_string(dst), _mesa_enum_to_string(coord),
+	       _mesa_enum_to_string(swizzle));
 #endif
 }
 
@@ -550,8 +550,8 @@ _mesa_SampleMapATI(GLuint dst, GLuint interp, GLenum swizzle)
 
 #if MESA_DEBUG_ATI_FS
    _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__,
-	       _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(interp),
-	       _mesa_lookup_enum_by_nr(swizzle));
+	       _mesa_enum_to_string(dst), _mesa_enum_to_string(interp),
+	       _mesa_enum_to_string(swizzle));
 #endif
 }
 
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index 53626e38be9..08f13178f84 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -937,7 +937,7 @@ _mesa_PopAttrib(void)
 
       if (MESA_VERBOSE & VERBOSE_API) {
          _mesa_debug(ctx, "glPopAttrib %s\n",
-                     _mesa_lookup_enum_by_nr(attr->kind));
+                     _mesa_enum_to_string(attr->kind));
       }
 
       switch (attr->kind) {
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index d36530541ec..99ab3c354ab 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -128,28 +128,28 @@ validate_blend_factors(struct gl_context *ctx, const char *func,
    if (!legal_src_factor(ctx, sfactorRGB)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(sfactorRGB = %s)", func,
-                  _mesa_lookup_enum_by_nr(sfactorRGB));
+                  _mesa_enum_to_string(sfactorRGB));
       return GL_FALSE;
    }
 
    if (!legal_dst_factor(ctx, dfactorRGB)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(dfactorRGB = %s)", func,
-                  _mesa_lookup_enum_by_nr(dfactorRGB));
+                  _mesa_enum_to_string(dfactorRGB));
       return GL_FALSE;
    }
 
    if (sfactorA != sfactorRGB && !legal_src_factor(ctx, sfactorA)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(sfactorA = %s)", func,
-                  _mesa_lookup_enum_by_nr(sfactorA));
+                  _mesa_enum_to_string(sfactorA));
       return GL_FALSE;
    }
 
    if (dfactorA != dfactorRGB && !legal_dst_factor(ctx, dfactorA)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(dfactorA = %s)", func,
-                  _mesa_lookup_enum_by_nr(dfactorA));
+                  _mesa_enum_to_string(dfactorA));
       return GL_FALSE;
    }
 
@@ -208,10 +208,10 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n",
-                  _mesa_lookup_enum_by_nr(sfactorRGB),
-                  _mesa_lookup_enum_by_nr(dfactorRGB),
-                  _mesa_lookup_enum_by_nr(sfactorA),
-                  _mesa_lookup_enum_by_nr(dfactorA));
+                  _mesa_enum_to_string(sfactorRGB),
+                  _mesa_enum_to_string(dfactorRGB),
+                  _mesa_enum_to_string(sfactorA),
+                  _mesa_enum_to_string(dfactorA));
 
    if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
                                sfactorRGB, dfactorRGB,
@@ -342,7 +342,7 @@ _mesa_BlendEquation( GLenum mode )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquation(%s)\n",
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
 
    if (!legal_blend_equation(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation");
@@ -385,7 +385,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationi(%u, %s)\n",
-                  buf, _mesa_lookup_enum_by_nr(mode));
+                  buf, _mesa_enum_to_string(mode));
 
    if (buf >= ctx->Const.MaxDrawBuffers) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)",
@@ -421,8 +421,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n",
-                  _mesa_lookup_enum_by_nr(modeRGB),
-                  _mesa_lookup_enum_by_nr(modeA));
+                  _mesa_enum_to_string(modeRGB),
+                  _mesa_enum_to_string(modeA));
 
    if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -476,8 +476,8 @@ _mesa_BlendEquationSeparateiARB(GLuint buf, GLenum modeRGB, GLenum modeA)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationSeparatei(%u, %s %s)\n", buf,
-                  _mesa_lookup_enum_by_nr(modeRGB),
-                  _mesa_lookup_enum_by_nr(modeA));
+                  _mesa_enum_to_string(modeRGB),
+                  _mesa_enum_to_string(modeA));
 
    if (buf >= ctx->Const.MaxDrawBuffers) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationSeparatei(buffer=%u)",
@@ -567,7 +567,7 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glAlphaFunc(%s, %f)\n",
-                  _mesa_lookup_enum_by_nr(func), ref);
+                  _mesa_enum_to_string(func), ref);
 
    switch (func) {
    case GL_NEVER:
@@ -613,7 +613,7 @@ _mesa_LogicOp( GLenum opcode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_lookup_enum_by_nr(opcode));
+      _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_enum_to_string(opcode));
 
    switch (opcode) {
       case GL_CLEAR:
@@ -790,7 +790,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glClampColor(%s)",
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
 }
 
 static GLboolean
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index 4765198d63a..a32f1a42aea 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -212,7 +212,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
 
    if (!is_valid_blit_filter(ctx, filter)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
-                  _mesa_lookup_enum_by_nr(filter));
+                  _mesa_enum_to_string(filter));
       return;
    }
 
@@ -220,7 +220,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
         filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
         (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
-                  _mesa_lookup_enum_by_nr(filter));
+                  _mesa_enum_to_string(filter));
       return;
    }
 
@@ -548,7 +548,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                   " %d, %d, %d, %d, 0x%x, %s)\n",
                   srcX0, srcY0, srcX1, srcY1,
                   dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
+                  mask, _mesa_enum_to_string(filter));
 
    _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
                           srcX0, srcY0, srcX1, srcY1,
@@ -573,7 +573,7 @@ _mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
                   readFramebuffer, drawFramebuffer,
                   srcX0, srcY0, srcX1, srcY1,
                   dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
+                  mask, _mesa_enum_to_string(filter));
 
    /*
     * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014,
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 4e25a72cbdb..9425b095e81 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -1182,7 +1182,7 @@ _mesa_BindBuffer(GLenum target, GLuint buffer)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBindBuffer(%s, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), buffer);
+                  _mesa_enum_to_string(target), buffer);
 
    bind_buffer_object(ctx, target, buffer);
 }
@@ -1535,9 +1535,9 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n",
                   func,
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   (long int) size, data,
-                  _mesa_lookup_enum_by_nr(usage));
+                  _mesa_enum_to_string(usage));
 
    if (size < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func);
@@ -1570,7 +1570,7 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
 
    if (!valid_usage) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid usage: %s)", func,
-                  _mesa_lookup_enum_by_nr(usage));
+                  _mesa_enum_to_string(usage));
       return;
    }
 
@@ -2025,7 +2025,7 @@ get_buffer_parameter(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname: %s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return false;
 }
 
@@ -4367,7 +4367,7 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       break;
    }
 }
@@ -4393,7 +4393,7 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       break;
    }
 }
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 0536266d756..93588a2ee18 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -251,7 +251,7 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API) {
-      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer));
    }
 
    if (buffer == GL_NONE) {
@@ -264,14 +264,14 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
       if (destMask == BAD_MASK) {
          /* totally bogus buffer */
          _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
       destMask &= supportedMask;
       if (destMask == 0x0) {
          /* none of the named color buffers exist! */
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)",
-                     caller, _mesa_lookup_enum_by_nr(buffer));
+                     caller, _mesa_enum_to_string(buffer));
          return;
       }
    }
@@ -411,7 +411,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
           */
          if (destMask[output] == BAD_MASK) {
             _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -427,7 +427,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
           */
          if (_mesa_bitcount(destMask[output]) > 1) {
             _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -445,7 +445,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
          if (destMask[output] == 0) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(unsupported buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -459,7 +459,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
              buffers[output] != GL_COLOR_ATTACHMENT0 + output) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(unsupported buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -471,7 +471,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
          if (destMask[output] & usedBufferMask) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(duplicated buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -700,7 +700,7 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer));
 
    if (buffer == GL_NONE) {
       /* This is legal--it means that no buffer should be bound for reading. */
@@ -712,14 +712,14 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
       supportedMask = supported_buffer_bitmask(ctx, fb);
       if (((1 << srcBuffer) & supportedMask) == 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
    }
diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index 426caea4709..8284dcab528 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -395,7 +395,7 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferiv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -485,7 +485,7 @@ _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferuiv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -596,7 +596,7 @@ _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -636,7 +636,7 @@ _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
 
    if (buffer != GL_DEPTH_STENCIL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfi(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 
diff --git a/src/mesa/main/condrender.c b/src/mesa/main/condrender.c
index 77e4b95ee8f..46c6036d2a5 100644
--- a/src/mesa/main/condrender.c
+++ b/src/mesa/main/condrender.c
@@ -87,7 +87,7 @@ _mesa_BeginConditionalRender(GLuint queryId, GLenum mode)
       /* fallthrough - invalid */
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBeginConditionalRender(mode=%s)",
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
       return;
    }
 
@@ -184,7 +184,7 @@ _mesa_check_conditional_render(struct gl_context *ctx)
    default:
       _mesa_problem(ctx, "Bad cond render mode %s in "
                     " _mesa_check_conditional_render()",
-                    _mesa_lookup_enum_by_nr(ctx->Query.CondRenderMode));
+                    _mesa_enum_to_string(ctx->Query.CondRenderMode));
       return GL_TRUE;
    }
 }
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index e8732c6175b..05bc50dd2c6 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -93,7 +93,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCopyImageSubData(%sTarget = %s)", dbg_prefix,
-                  _mesa_lookup_enum_by_nr(*target));
+                  _mesa_enum_to_string(*target));
       return false;
    }
 
@@ -159,7 +159,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
       if ((*tex_obj)->Target != *target) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyImageSubData(%sTarget = %s)", dbg_prefix,
-                     _mesa_lookup_enum_by_nr(*target));
+                     _mesa_enum_to_string(*target));
          return false;
       }
 
@@ -416,9 +416,9 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
       _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, "
                                           "%u, %s, %d, %d, %d, %d, "
                                           "%d, %d, %d)\n",
-                  srcName, _mesa_lookup_enum_by_nr(srcTarget), srcLevel,
+                  srcName, _mesa_enum_to_string(srcTarget), srcLevel,
                   srcX, srcY, srcZ,
-                  dstName, _mesa_lookup_enum_by_nr(dstTarget), dstLevel,
+                  dstName, _mesa_enum_to_string(dstTarget), dstLevel,
                   dstX, dstY, dstZ,
                   srcWidth, srcHeight, srcWidth);
 
diff --git a/src/mesa/main/debug.c b/src/mesa/main/debug.c
index c93e84a04d0..3090a007707 100644
--- a/src/mesa/main/debug.c
+++ b/src/mesa/main/debug.c
@@ -411,7 +411,7 @@ dump_renderbuffer(const struct gl_renderbuffer *rb, GLboolean writeImage)
 {
    printf("Renderbuffer %u: %u x %u  IntFormat = %s\n",
 	  rb->Name, rb->Width, rb->Height,
-	  _mesa_lookup_enum_by_nr(rb->InternalFormat));
+	  _mesa_enum_to_string(rb->InternalFormat));
    if (writeImage) {
       _mesa_write_renderbuffer_image(rb);
    }
diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c
index bb4591cf152..c3534407599 100644
--- a/src/mesa/main/depth.c
+++ b/src/mesa/main/depth.c
@@ -63,7 +63,7 @@ _mesa_DepthFunc( GLenum func )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_lookup_enum_by_nr(func));
+      _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_enum_to_string(func));
 
    if (ctx->Depth.Func == func)
       return;
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index aafe486fb60..5554738d1a3 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -9000,7 +9000,7 @@ _mesa_NewList(GLuint name, GLenum mode)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glNewList %u %s\n", name,
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
 
    if (name == 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glNewList");
@@ -9688,7 +9688,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx)
 static const char *
 enum_string(GLenum k)
 {
-   return _mesa_lookup_enum_by_nr(k);
+   return _mesa_enum_to_string(k);
 }
 
 
@@ -9827,19 +9827,19 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
             break;
          case OPCODE_BIND_TEXTURE:
             fprintf(f, "BindTexture %s %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui), n[2].ui);
+                         _mesa_enum_to_string(n[1].ui), n[2].ui);
             break;
          case OPCODE_SHADE_MODEL:
-            fprintf(f, "ShadeModel %s\n", _mesa_lookup_enum_by_nr(n[1].ui));
+            fprintf(f, "ShadeModel %s\n", _mesa_enum_to_string(n[1].ui));
             break;
          case OPCODE_MAP1:
             fprintf(f, "Map1 %s %.3f %.3f %d %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui),
+                         _mesa_enum_to_string(n[1].ui),
                          n[2].f, n[3].f, n[4].i, n[5].i);
             break;
          case OPCODE_MAP2:
             fprintf(f, "Map2 %s %.3f %.3f %.3f %.3f %d %d %d %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui),
+                         _mesa_enum_to_string(n[1].ui),
                          n[2].f, n[3].f, n[4].f, n[5].f,
                          n[6].i, n[7].i, n[8].i, n[9].i);
             break;
@@ -9918,7 +9918,7 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
 
          case OPCODE_PROVOKING_VERTEX:
             fprintf(f, "ProvokingVertex %s\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui));
+                         _mesa_enum_to_string(n[1].ui));
             break;
 
             /*
diff --git a/src/mesa/main/drawpix.c b/src/mesa/main/drawpix.c
index 55035f214b3..720a082ce6d 100644
--- a/src/mesa/main/drawpix.c
+++ b/src/mesa/main/drawpix.c
@@ -53,10 +53,10 @@ _mesa_DrawPixels( GLsizei width, GLsizei height,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDrawPixels(%d, %d, %s, %s, %p) // to %s at %d, %d\n",
                   width, height,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type),
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type),
                   pixels,
-                  _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]),
+                  _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]),
                   IROUND(ctx->Current.RasterPos[0]),
                   IROUND(ctx->Current.RasterPos[1]));
 
@@ -96,8 +96,8 @@ _mesa_DrawPixels( GLsizei width, GLsizei height,
    err = _mesa_error_check_format_and_type(ctx, format, type);
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err, "glDrawPixels(invalid format %s and/or type %s)",
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       goto end;
    }
 
@@ -198,9 +198,9 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height,
       _mesa_debug(ctx,
                   "glCopyPixels(%d, %d, %d, %d, %s) // from %s to %s at %d, %d\n",
                   srcx, srcy, width, height,
-                  _mesa_lookup_enum_by_nr(type),
-                  _mesa_lookup_enum_by_nr(ctx->ReadBuffer->ColorReadBuffer),
-                  _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]),
+                  _mesa_enum_to_string(type),
+                  _mesa_enum_to_string(ctx->ReadBuffer->ColorReadBuffer),
+                  _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]),
                   IROUND(ctx->Current.RasterPos[0]),
                   IROUND(ctx->Current.RasterPos[1]));
 
@@ -218,7 +218,7 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height,
        type != GL_STENCIL &&
        type != GL_DEPTH_STENCIL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glCopyPixels(type=%s)",
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(type));
       return;
    }
 
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index 9008a386343..d0583edc019 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -146,7 +146,7 @@ client_state(struct gl_context *ctx, GLenum cap, GLboolean state)
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "gl%sClientState(%s)",
-               state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap));
+               state ? "Enable" : "Disable", _mesa_enum_to_string(cap));
 }
 
 
@@ -283,7 +283,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s %s (newstate is %x)\n",
                   state ? "glEnable" : "glDisable",
-                  _mesa_lookup_enum_by_nr(cap),
+                  _mesa_enum_to_string(cap),
                   ctx->NewState);
 
    switch (cap) {
@@ -1022,7 +1022,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "gl%s(%s)",
-               state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap));
+               state ? "Enable" : "Disable", _mesa_enum_to_string(cap));
 }
 
 
@@ -1101,7 +1101,7 @@ _mesa_set_enablei(struct gl_context *ctx, GLenum cap,
 invalid_enum_error:
     _mesa_error(ctx, GL_INVALID_ENUM, "%s(cap=%s)",
                 state ? "glEnablei" : "glDisablei",
-                _mesa_lookup_enum_by_nr(cap));
+                _mesa_enum_to_string(cap));
 }
 
 
@@ -1143,7 +1143,7 @@ _mesa_IsEnabledi( GLenum cap, GLuint index )
       return (ctx->Scissor.EnableFlags >> index) & 1;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabledIndexed(cap=%s)",
-                  _mesa_lookup_enum_by_nr(cap));
+                  _mesa_enum_to_string(cap));
       return GL_FALSE;
    }
 }
@@ -1623,6 +1623,6 @@ _mesa_IsEnabled( GLenum cap )
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabled(%s)",
-               _mesa_lookup_enum_by_nr(cap));
+               _mesa_enum_to_string(cap));
    return GL_FALSE;
 }
diff --git a/src/mesa/main/enums.h b/src/mesa/main/enums.h
index 66bdd53bbab..0e18cd407e9 100644
--- a/src/mesa/main/enums.h
+++ b/src/mesa/main/enums.h
@@ -42,7 +42,7 @@ extern "C" {
 #endif
 
 
-extern const char *_mesa_lookup_enum_by_nr( int nr );
+extern const char *_mesa_enum_to_string( int nr );
 
 /* Get the name of an enum given that it is a primitive type.  Avoids
  * GL_FALSE/GL_POINTS ambiguity and others.
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index b3406665d94..f720de316e4 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -1314,7 +1314,7 @@ flush_delayed_errors( struct gl_context *ctx )
    if (ctx->ErrorDebugCount) {
       _mesa_snprintf(s, MAX_DEBUG_MESSAGE_LENGTH, "%d similar %s errors", 
                      ctx->ErrorDebugCount,
-                     _mesa_lookup_enum_by_nr(ctx->ErrorValue));
+                     _mesa_enum_to_string(ctx->ErrorValue));
 
       output_if_debug("Mesa", s, GL_TRUE);
 
@@ -1503,7 +1503,7 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... )
       }
 
       len = _mesa_snprintf(s2, MAX_DEBUG_MESSAGE_LENGTH, "%s in %s",
-                           _mesa_lookup_enum_by_nr(error), s);
+                           _mesa_enum_to_string(error), s);
       if (len >= MAX_DEBUG_MESSAGE_LENGTH) {
          /* Same as above. */
          assert(0);
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index f8dcf122d99..f46554b761e 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2007,7 +2007,7 @@ renderbuffer_storage(struct gl_context *ctx, struct gl_renderbuffer *rb,
    baseFormat = _mesa_base_fbo_format(ctx, internalFormat);
    if (baseFormat == 0) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalFormat=%s)",
-                  func, _mesa_lookup_enum_by_nr(internalFormat));
+                  func, _mesa_enum_to_string(internalFormat));
       return;
    }
 
@@ -2095,12 +2095,12 @@ renderbuffer_storage_named(GLuint renderbuffer, GLenum internalFormat,
       if (samples == NO_SAMPLES)
          _mesa_debug(ctx, "%s(%u, %s, %d, %d)\n",
                      func, renderbuffer,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(internalFormat),
                      width, height);
       else
          _mesa_debug(ctx, "%s(%u, %s, %d, %d, %d)\n",
                      func, renderbuffer,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(internalFormat),
                      width, height, samples);
    }
 
@@ -2131,14 +2131,14 @@ renderbuffer_storage_target(GLenum target, GLenum internalFormat,
       if (samples == NO_SAMPLES)
          _mesa_debug(ctx, "%s(%s, %s, %d, %d)\n",
                      func,
-                     _mesa_lookup_enum_by_nr(target),
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target),
+                     _mesa_enum_to_string(internalFormat),
                      width, height);
       else
          _mesa_debug(ctx, "%s(%s, %s, %d, %d, %d)\n",
                      func,
-                     _mesa_lookup_enum_by_nr(target),
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target),
+                     _mesa_enum_to_string(internalFormat),
                      width, height, samples);
    }
 
@@ -2311,7 +2311,7 @@ get_render_buffer_parameteriv(struct gl_context *ctx,
       /* fallthrough */
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname=%s)", func,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 }
@@ -2694,13 +2694,13 @@ _mesa_CheckFramebufferStatus(GLenum target)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
 
    fb = get_framebuffer_target(ctx, target);
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCheckFramebufferStatus(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return 0;
    }
 
@@ -2732,7 +2732,7 @@ _mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target)
       default:
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCheckNamedFramebufferStatus(invalid target %s)",
-                     _mesa_lookup_enum_by_nr(target));
+                     _mesa_enum_to_string(target));
          return 0;
    }
 
@@ -2851,7 +2851,7 @@ check_layered_texture_target(struct gl_context *ctx, GLenum target,
 
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(invalid texture target %s)", caller,
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
    return false;
 }
 
@@ -2893,7 +2893,7 @@ check_texture_target(struct gl_context *ctx, GLenum target,
 
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(invalid texture target %s)", caller,
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
    return false;
 }
 
@@ -2962,7 +2962,7 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target,
    if (err) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(invalid textarget %s)",
-                  caller, _mesa_lookup_enum_by_nr(textarget));
+                  caller, _mesa_enum_to_string(textarget));
       return false;
    }
 
@@ -3074,7 +3074,7 @@ _mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3157,7 +3157,7 @@ framebuffer_texture_with_dims(int dims, GLenum target,
    fb = get_framebuffer_target(ctx, target);
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3225,7 +3225,7 @@ _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferTextureLayer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3319,7 +3319,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferTexture(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3400,7 +3400,7 @@ _mesa_framebuffer_renderbuffer(struct gl_context *ctx,
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(invalid attachment %s)", func,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3440,7 +3440,7 @@ _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferRenderbuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3539,7 +3539,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
           attachment != GL_DEPTH && attachment != GL_STENCIL) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid attachment %s)", caller,
-                     _mesa_lookup_enum_by_nr(attachment));
+                     _mesa_enum_to_string(attachment));
          return;
       }
       /* the default / window-system FBO */
@@ -3552,7 +3552,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
 
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3609,7 +3609,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3626,7 +3626,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3637,7 +3637,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
          goto invalid_pname_enum;
       } else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       } else if (att->Type == GL_TEXTURE) {
          if (att->Texture && (att->Texture->Target == GL_TEXTURE_3D ||
              att->Texture->Target == GL_TEXTURE_2D_ARRAY)) {
@@ -3659,7 +3659,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          if (ctx->Extensions.EXT_framebuffer_sRGB) {
@@ -3682,7 +3682,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          mesa_format format = att->Renderbuffer->Format;
@@ -3734,7 +3734,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else if (att->Texture) {
          const struct gl_texture_image *texImage =
@@ -3763,7 +3763,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
          *params = att->Layered;
       } else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       } else {
          goto invalid_pname_enum;
       }
@@ -3776,7 +3776,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
 
 invalid_pname_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname %s)", caller,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return;
 }
 
@@ -3792,7 +3792,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    if (!buffer) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetFramebufferAttachmentParameteriv(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4009,7 +4009,7 @@ invalidate_framebuffer_storage(struct gl_context *ctx,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", name,
-               _mesa_lookup_enum_by_nr(attachments[i]));
+               _mesa_enum_to_string(attachments[i]));
    return;
 }
 
@@ -4026,7 +4026,7 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glInvalidateSubFramebuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4076,7 +4076,7 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glInvalidateFramebuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4152,7 +4152,7 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
          "glDiscardFramebufferEXT(target %s)",
-         _mesa_lookup_enum_by_nr(target));
+         _mesa_enum_to_string(target));
       return;
    }
 
@@ -4189,5 +4189,5 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glDiscardFramebufferEXT(attachment %s)",
-              _mesa_lookup_enum_by_nr(attachments[i]));
+              _mesa_enum_to_string(attachments[i]));
 }
diff --git a/src/mesa/main/feedback.c b/src/mesa/main/feedback.c
index 6bc4294f9c7..699e2a855a3 100644
--- a/src/mesa/main/feedback.c
+++ b/src/mesa/main/feedback.c
@@ -415,7 +415,7 @@ _mesa_RenderMode( GLenum mode )
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glRenderMode %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glRenderMode %s\n", _mesa_enum_to_string(mode));
 
    FLUSH_VERTICES(ctx, _NEW_RENDERMODE);
 
diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 7741cabada1..fe96d5b29fb 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -80,7 +80,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -107,7 +107,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
        _mesa_base_fbo_format(ctx, internalformat) == 0) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(internalformat=%s)",
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -119,7 +119,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    if (bufSize < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glGetInternalformativ(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -168,7 +168,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(pname=%s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 77c04b8dab8..37e2c29c89c 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -938,7 +938,7 @@ _mesa_print_framebuffer(const struct gl_framebuffer *fb)
 
    fprintf(stderr, "Mesa Framebuffer %u at %p\n", fb->Name, (void *) fb);
    fprintf(stderr, "  Size: %u x %u  Status: %s\n", fb->Width, fb->Height,
-           _mesa_lookup_enum_by_nr(fb->_Status));
+           _mesa_enum_to_string(fb->_Status));
    fprintf(stderr, "  Attachments:\n");
 
    for (i = 0; i < BUFFER_COUNT; i++) {
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 9aef090194e..c18f9d5223f 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -83,7 +83,7 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx,
 
    if (error) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGenerate%sMipmap(target=%s)",
-                  suffix, _mesa_lookup_enum_by_nr(target));
+                  suffix, _mesa_enum_to_string(target));
       return;
    }
 
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 3d6d63916b3..ffafe51ba05 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -1161,7 +1161,7 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
 
    if (api_check && !api_found) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-                  _mesa_lookup_enum_by_nr(d->pname));
+                  _mesa_enum_to_string(d->pname));
       return GL_FALSE;
    }
 
@@ -1222,7 +1222,7 @@ find_value(const char *func, GLenum pname, void **p, union value *v)
        * any valid enum. */
       if (unlikely(idx == 0)) {
          _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
          return &error_value;
       }
 
@@ -2004,11 +2004,11 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
 
  invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return TYPE_INVALID;
  invalid_value:
    _mesa_error(ctx, GL_INVALID_VALUE, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return TYPE_INVALID;
 }
 
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 72d99ca4e22..9873fdbf1a4 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -208,7 +208,7 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params )
       return;
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_lookup_enum_by_nr(pname));
+      _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_enum_to_string(pname));
 
    switch (pname) {
       case GL_VERTEX_ARRAY_POINTER:
@@ -299,7 +299,7 @@ _mesa_GetError( void )
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_lookup_enum_by_nr(e));
+      _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_enum_to_string(e));
 
    ctx->ErrorValue = (GLenum) GL_NO_ERROR;
    ctx->ErrorDebugCount = 0;
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index ac69fabccaa..ae97079836e 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -186,7 +186,7 @@ get_map_idx(GLenum value)
       return IDX_RG;
    default:
       _mesa_problem(NULL, "Unexpected inFormat %s",
-                    _mesa_lookup_enum_by_nr(value));
+                    _mesa_enum_to_string(value));
       return 0;
    }
 }
@@ -216,8 +216,8 @@ _mesa_compute_component_mapping(GLenum inFormat, GLenum outFormat, GLubyte *map)
 
 #if 0
    printf("from %x/%s to %x/%s map %d %d %d %d %d %d\n",
-	  inFormat, _mesa_lookup_enum_by_nr(inFormat),
-	  outFormat, _mesa_lookup_enum_by_nr(outFormat),
+	  inFormat, _mesa_enum_to_string(inFormat),
+	  outFormat, _mesa_enum_to_string(outFormat),
 	  map[0],
 	  map[1],
 	  map[2],
diff --git a/src/mesa/main/hint.c b/src/mesa/main/hint.c
index 3e056ebaf13..984239a7276 100644
--- a/src/mesa/main/hint.c
+++ b/src/mesa/main/hint.c
@@ -40,8 +40,8 @@ _mesa_Hint( GLenum target, GLenum mode )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glHint %s %s\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(mode));
 
    if (mode != GL_NICEST && mode != GL_FASTEST && mode != GL_DONT_CARE) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glHint(mode)");
diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c
index 4021dbef922..dab21d1a634 100644
--- a/src/mesa/main/light.c
+++ b/src/mesa/main/light.c
@@ -42,7 +42,7 @@ _mesa_ShadeModel( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glShadeModel %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glShadeModel %s\n", _mesa_enum_to_string(mode));
 
    if (mode != GL_FLAT && mode != GL_SMOOTH) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glShadeModel");
@@ -723,8 +723,8 @@ _mesa_ColorMaterial( GLenum face, GLenum mode )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glColorMaterial %s %s\n",
-                  _mesa_lookup_enum_by_nr(face),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(face),
+                  _mesa_enum_to_string(mode));
 
    bitmask = _mesa_material_bitmask(ctx, face, mode, legal, "glColorMaterial");
    if (bitmask == 0)
diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c
index 80c8a248ce4..2b8016a4a72 100644
--- a/src/mesa/main/matrix.c
+++ b/src/mesa/main/matrix.c
@@ -229,7 +229,7 @@ _mesa_PushMatrix( void )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPushMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                  _mesa_enum_to_string(ctx->Transform.MatrixMode));
 
    if (stack->Depth + 1 >= stack->MaxDepth) {
       if (ctx->Transform.MatrixMode == GL_TEXTURE) {
@@ -239,7 +239,7 @@ _mesa_PushMatrix( void )
       }
       else {
          _mesa_error(ctx,  GL_STACK_OVERFLOW, "glPushMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                     _mesa_enum_to_string(ctx->Transform.MatrixMode));
       }
       return;
    }
@@ -270,7 +270,7 @@ _mesa_PopMatrix( void )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPopMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                  _mesa_enum_to_string(ctx->Transform.MatrixMode));
 
    if (stack->Depth == 0) {
       if (ctx->Transform.MatrixMode == GL_TEXTURE) {
@@ -280,7 +280,7 @@ _mesa_PopMatrix( void )
       }
       else {
          _mesa_error(ctx,  GL_STACK_UNDERFLOW, "glPopMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                     _mesa_enum_to_string(ctx->Transform.MatrixMode));
       }
       return;
    }
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index 5626054687b..1019f893ba8 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -234,7 +234,7 @@ get_label_pointer(struct gl_context *ctx, GLenum identifier, GLuint name,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(identifier = %s)",
-               caller, _mesa_lookup_enum_by_nr(identifier));
+               caller, _mesa_enum_to_string(identifier));
    return NULL;
 }
 
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 279ae2078fe..bdf343a6763 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -635,7 +635,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
    }
 
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramPipelineiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 /**
diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c
index a1f0aa02da1..c1c31660e12 100644
--- a/src/mesa/main/polygon.c
+++ b/src/mesa/main/polygon.c
@@ -56,7 +56,7 @@ _mesa_CullFace( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glCullFace %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glCullFace %s\n", _mesa_enum_to_string(mode));
 
    if (mode!=GL_FRONT && mode!=GL_BACK && mode!=GL_FRONT_AND_BACK) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glCullFace" );
@@ -91,7 +91,7 @@ _mesa_FrontFace( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode));
 
    if (mode!=GL_CW && mode!=GL_CCW) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glFrontFace" );
@@ -128,8 +128,8 @@ _mesa_PolygonMode( GLenum face, GLenum mode )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPolygonMode %s %s\n",
-                  _mesa_lookup_enum_by_nr(face),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(face),
+                  _mesa_enum_to_string(mode));
 
    if (mode!=GL_POINT && mode!=GL_LINE && mode!=GL_FILL) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glPolygonMode(mode)" );
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index d857b84e60d..e77bb03e9a1 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -81,7 +81,7 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
    /* Validate interface. */
    if (!supported_interface_enum(programInterface)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return;
    }
 
@@ -96,8 +96,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
       if (programInterface == GL_ATOMIC_COUNTER_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetProgramInterfaceiv(%s pname %s)",
-                     _mesa_lookup_enum_by_nr(programInterface),
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(programInterface),
+                     _mesa_enum_to_string(pname));
          return;
       }
       /* Name length consists of base name, 3 additional chars '[0]' if
@@ -138,15 +138,15 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
       default:
         _mesa_error(ctx, GL_INVALID_OPERATION,
                     "glGetProgramInterfaceiv(%s pname %s)",
-                    _mesa_lookup_enum_by_nr(programInterface),
-                    _mesa_lookup_enum_by_nr(pname));
+                    _mesa_enum_to_string(programInterface),
+                    _mesa_enum_to_string(pname));
       };
       break;
    case GL_MAX_NUM_COMPATIBLE_SUBROUTINES:
    default:
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glGetProgramInterfaceiv(pname %s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
    }
 }
 
@@ -234,7 +234,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    case GL_ATOMIC_COUNTER_BUFFER:
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
    }
 
    return GL_INVALID_INDEX;
@@ -262,7 +262,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
    if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
        !supported_interface_enum(programInterface)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return;
    }
 
@@ -380,7 +380,7 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)",
-                  _mesa_lookup_enum_by_nr(programInterface), name);
+                  _mesa_enum_to_string(programInterface), name);
    }
 
    return _mesa_program_resource_location(shProg, programInterface, name);
@@ -408,7 +408,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
    if (programInterface != GL_PROGRAM_OUTPUT) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetProgramResourceLocationIndex(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return -1;
    }
 
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 5ff1b953231..e5c6f9ae31e 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -295,7 +295,7 @@ _mesa_CreateQueries(GLenum target, GLsizei n, GLuint *ids)
       break;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glCreateQueries(invalid target = %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -390,7 +390,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBeginQueryIndexed(%s, %u, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), index, id);
+                  _mesa_enum_to_string(target), index, id);
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -412,7 +412,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id)
    if (*bindpt) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glBeginQuery{Indexed}(target=%s is active)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -496,7 +496,7 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glEndQueryIndexed(%s, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), index);
+                  _mesa_enum_to_string(target), index);
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -516,8 +516,8 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index)
    if (q && q->Target != target) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glEndQuery(target=%s with active query of target %s)",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(q->Target));
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(q->Target));
       return;
    }
 
@@ -553,7 +553,7 @@ _mesa_QueryCounter(GLuint id, GLenum target)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glQueryCounter(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
 
    /* error checking */
    if (target != GL_TIMESTAMP) {
@@ -628,9 +628,9 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname,
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryIndexediv(%s, %u, %s)\n",
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   index,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -712,7 +712,7 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname,
          default:
             _mesa_problem(ctx,
                           "Unknown target in glGetQueryIndexediv(target = %s)",
-                          _mesa_lookup_enum_by_nr(target));
+                          _mesa_enum_to_string(target));
             *params = 0;
             break;
          }
@@ -740,7 +740,7 @@ _mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectiv(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -794,7 +794,7 @@ _mesa_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectuiv(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -851,7 +851,7 @@ _mesa_GetQueryObjecti64v(GLuint id, GLenum pname, GLint64EXT *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjecti64v(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -894,7 +894,7 @@ _mesa_GetQueryObjectui64v(GLuint id, GLenum pname, GLuint64EXT *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectui64v(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index e256695480a..32e31ffae2f 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -950,8 +950,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glReadPixels(%d, %d, %s, %s, %p)\n",
                   width, height,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type),
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type),
                   pixels);
 
    if (width < 0 || height < 0) {
@@ -1008,8 +1008,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
 
       if (err != GL_NO_ERROR) {
          _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)",
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type));
+                     _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type));
          return;
       }
    }
@@ -1017,8 +1017,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
    err = _mesa_error_check_format_and_type(ctx, format, type);
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)",
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return;
    }
 
diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c
index a3aacc66aa3..d023fefabff 100644
--- a/src/mesa/main/samplerobj.c
+++ b/src/mesa/main/samplerobj.c
@@ -813,7 +813,7 @@ _mesa_SamplerParameteri(GLuint sampler, GLenum pname, GLint param)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteri(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteri(param=%d)\n",
@@ -906,7 +906,7 @@ _mesa_SamplerParameterf(GLuint sampler, GLenum pname, GLfloat param)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(param=%f)\n",
@@ -1006,7 +1006,7 @@ _mesa_SamplerParameteriv(GLuint sampler, GLenum pname, const GLint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(param=%d)\n",
@@ -1099,7 +1099,7 @@ _mesa_SamplerParameterfv(GLuint sampler, GLenum pname, const GLfloat *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(param=%f)\n",
@@ -1184,7 +1184,7 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(param=%d)\n",
@@ -1270,7 +1270,7 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(param=%u)\n",
@@ -1380,7 +1380,7 @@ _mesa_GetSamplerParameteriv(GLuint sampler, GLenum pname, GLint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameteriv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1466,7 +1466,7 @@ _mesa_GetSamplerParameterfv(GLuint sampler, GLenum pname, GLfloat *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterfv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1545,7 +1545,7 @@ _mesa_GetSamplerParameterIiv(GLuint sampler, GLenum pname, GLint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1624,7 +1624,7 @@ _mesa_GetSamplerParameterIuiv(GLuint sampler, GLenum pname, GLuint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIuiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index a6246a39aad..d562a90d2b2 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -925,8 +925,8 @@ get_buffer_property(struct gl_shader_program *shProg,
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
 
    return 0;
 }
@@ -1057,14 +1057,14 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
    return 0;
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
    return 0;
 }
 
@@ -1086,7 +1086,7 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
    if (!res || bufSize < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glGetProgramResourceiv(%s index %d bufSize %d)",
-                  _mesa_lookup_enum_by_nr(programInterface), index, bufSize);
+                  _mesa_enum_to_string(programInterface), index, bufSize);
       return;
    }
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a4296adf799..3365c7a6727 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -716,7 +716,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    }
 
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1172,7 +1172,7 @@ _mesa_CreateShader(GLenum type)
 {
    GET_CURRENT_CONTEXT(ctx);
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glCreateShader %s\n", _mesa_lookup_enum_by_nr(type));
+      _mesa_debug(ctx, "glCreateShader %s\n", _mesa_enum_to_string(type));
    return create_shader(ctx, type);
 }
 
@@ -1857,7 +1857,7 @@ _mesa_ProgramParameteri(GLuint program, GLenum pname, GLint value)
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glProgramParameteri(pname=%s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 
@@ -1865,7 +1865,7 @@ invalid_value:
    _mesa_error(ctx, GL_INVALID_VALUE,
                "glProgramParameteri(pname=%s, value=%d): "
                "value must be 0 or 1.",
-               _mesa_lookup_enum_by_nr(pname),
+               _mesa_enum_to_string(pname),
                value);
 }
 
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index 80b77275f93..a348cdb0405 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -610,7 +610,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
                         "glBindImageTextures(the internal format %s of "
                         "the level zero texture image of textures[%d]=%u "
                         "is not supported)",
-                        _mesa_lookup_enum_by_nr(tex_format),
+                        _mesa_enum_to_string(tex_format),
                         i, texture);
             continue;
          }
diff --git a/src/mesa/main/tests/enum_strings.cpp b/src/mesa/main/tests/enum_strings.cpp
index dc5fe751a86..84c11954f44 100644
--- a/src/mesa/main/tests/enum_strings.cpp
+++ b/src/mesa/main/tests/enum_strings.cpp
@@ -39,13 +39,13 @@ TEST(EnumStrings, LookUpByNumber)
 {
    for (unsigned i = 0; everything[i].name != NULL; i++) {
       EXPECT_STREQ(everything[i].name,
-		   _mesa_lookup_enum_by_nr(everything[i].value));
+		   _mesa_enum_to_string(everything[i].value));
    }
 }
 
 TEST(EnumStrings, LookUpUnknownNumber)
 {
-   EXPECT_STRCASEEQ("0xEEEE", _mesa_lookup_enum_by_nr(0xEEEE));
+   EXPECT_STRCASEEQ("0xEEEE", _mesa_enum_to_string(0xEEEE));
 }
 
 /* Please type the name and the value.  This makes it easier to detect
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index 3edafc0f776..091922161c5 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -42,7 +42,7 @@
 
 
 #define TE_ERROR(errCode, msg, value)				\
-   _mesa_error(ctx, errCode, msg, _mesa_lookup_enum_by_nr(value));
+   _mesa_error(ctx, errCode, msg, _mesa_enum_to_string(value));
 
 
 /** Set texture env mode */
@@ -482,16 +482,16 @@ _mesa_TexEnvfv( GLenum target, GLenum pname, const GLfloat *param )
    }
    else {
       _mesa_error(ctx, GL_INVALID_ENUM, "glTexEnv(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
    if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexEnv %s %s %.1f(%s) ...\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(pname),
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(pname),
                   *param,
-                  _mesa_lookup_enum_by_nr((GLenum) iparam0));
+                  _mesa_enum_to_string((GLenum) iparam0));
 
    /* Tell device driver about the new texture environment */
    if (ctx->Driver.TexEnv) {
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index 3c4baca7026..f4d17e1bdb5 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -847,7 +847,7 @@ _mesa_choose_tex_format(struct gl_context *ctx, GLenum target,
    }
 
    _mesa_problem(ctx, "unexpected format %s in _mesa_choose_tex_format()",
-                 _mesa_lookup_enum_by_nr(internalFormat));
+                 _mesa_enum_to_string(internalFormat));
    return MESA_FORMAT_NONE;
 }
 
diff --git a/src/mesa/main/texgen.c b/src/mesa/main/texgen.c
index 41e428b69e7..24ba295746a 100644
--- a/src/mesa/main/texgen.c
+++ b/src/mesa/main/texgen.c
@@ -76,10 +76,10 @@ _mesa_TexGenfv( GLenum coord, GLenum pname, const GLfloat *params )
 
    if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexGen %s %s %.1f(%s)...\n",
-                  _mesa_lookup_enum_by_nr(coord),
-                  _mesa_lookup_enum_by_nr(pname),
+                  _mesa_enum_to_string(coord),
+                  _mesa_enum_to_string(pname),
                   *params,
-		  _mesa_lookup_enum_by_nr((GLenum) (GLint) *params));
+		  _mesa_enum_to_string((GLenum) (GLint) *params));
 
    if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glTexGen(current unit)");
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 92b4d6795c6..de6db44ac26 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1129,7 +1129,7 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
     */
    if (!legal_getteximage_target(ctx, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetTextureImage(target=%s)",
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -1191,7 +1191,7 @@ getcompressedteximage_error_check(struct gl_context *ctx,
    if (!legal_getteximage_target(ctx, target, dsa)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetCompressedTex%sImage(target=%s)", suffix,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index a70b9da7e91..842b50d8b40 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2169,8 +2169,8 @@ texture_error_check( struct gl_context *ctx,
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "glTexImage%dD(format = %s, internalFormat = %s)",
                         dimensions,
-                        _mesa_lookup_enum_by_nr(format),
-                        _mesa_lookup_enum_by_nr(internalFormat));
+                        _mesa_enum_to_string(format),
+                        _mesa_enum_to_string(internalFormat));
             return GL_TRUE;
          }
 
@@ -2180,9 +2180,9 @@ texture_error_check( struct gl_context *ctx,
          _mesa_error(ctx, err,
                      "glTexImage%dD(format = %s, type = %s, internalFormat = %s)",
                      dimensions,
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type),
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type),
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2191,7 +2191,7 @@ texture_error_check( struct gl_context *ctx,
    if (_mesa_base_tex_format(ctx, internalFormat) < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glTexImage%dD(internalFormat=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat));
+                  dimensions, _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
@@ -2200,8 +2200,8 @@ texture_error_check( struct gl_context *ctx,
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err,
                   "glTexImage%dD(incompatible format = %s, type = %s)",
-                  dimensions, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  dimensions, _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return GL_TRUE;
    }
 
@@ -2216,8 +2216,8 @@ texture_error_check( struct gl_context *ctx,
    if (!texture_formats_agree(internalFormat, format)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glTexImage%dD(incompatible internalFormat = %s, format = %s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat),
-                  _mesa_lookup_enum_by_nr(format));
+                  dimensions, _mesa_enum_to_string(internalFormat),
+                  _mesa_enum_to_string(format));
       return GL_TRUE;
    }
 
@@ -2332,7 +2332,7 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions,
    if (!_mesa_is_compressed_format(ctx, internalFormat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCompressedTexImage%dD(internalFormat=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat));
+                  dimensions, _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
@@ -2490,7 +2490,7 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
    /* check target (proxies not allowed) */
    if (!legal_texsubimage_target(ctx, dimensions, target, dsa)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)",
-                  callerName, _mesa_lookup_enum_by_nr(target));
+                  callerName, _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -2509,8 +2509,8 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       err = _mesa_es_error_check_format_and_type(format, type, dimensions);
       if (err != GL_NO_ERROR) {
          _mesa_error(ctx, err, "%s(format = %s, type = %s)",
-                     callerName, _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type));
+                     callerName, _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type));
          return GL_TRUE;
       }
    }
@@ -2519,8 +2519,8 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err,
                   "%s(incompatible format = %s, type = %s)",
-                  callerName, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  callerName, _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return GL_TRUE;
    }
 
@@ -2598,7 +2598,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
    /* check target */
    if (!legal_texsubimage_target(ctx, dimensions, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%uD(target=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(target));
+                  dimensions, _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -2658,7 +2658,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       default:
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2667,7 +2667,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
    if (baseFormat < 0) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                  _mesa_lookup_enum_by_nr(internalFormat));
+                  _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
@@ -2677,7 +2677,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       if (rb_base_format < 0) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2704,7 +2704,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       if (!valid) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2746,7 +2746,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       if(_mesa_is_enum_format_snorm(internalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -3111,8 +3111,8 @@ _mesa_choose_texture_format(struct gl_context *ctx,
                        "DXT compression requested (%s), "
                        "but libtxc_dxtn library not installed.  Using %s "
                        "instead.",
-                       _mesa_lookup_enum_by_nr(before),
-                       _mesa_lookup_enum_by_nr(internalFormat));
+                       _mesa_enum_to_string(before),
+                       _mesa_enum_to_string(internalFormat));
       }
    }
 
@@ -3199,18 +3199,18 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
          _mesa_debug(ctx,
                      "glCompressedTexImage%uD %s %d %s %d %d %d %d %p\n",
                      dims,
-                     _mesa_lookup_enum_by_nr(target), level,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target), level,
+                     _mesa_enum_to_string(internalFormat),
                      width, height, depth, border, pixels);
       else
          _mesa_debug(ctx,
                      "glTexImage%uD %s %d %s %d %d %d %d %s %s %p\n",
                      dims,
-                     _mesa_lookup_enum_by_nr(target), level,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target), level,
+                     _mesa_enum_to_string(internalFormat),
                      width, height, depth, border,
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type), pixels);
+                     _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type), pixels);
    }
 
    internalFormat = override_internal_format(internalFormat, width, height);
@@ -3218,7 +3218,7 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
    /* target error checking */
    if (!legal_teximage_target(ctx, dims, target)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s%uD(target=%s)",
-                  func, dims, _mesa_lookup_enum_by_nr(target));
+                  func, dims, _mesa_enum_to_string(target));
       return;
    }
 
@@ -3330,7 +3330,7 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
          _mesa_error(ctx, GL_OUT_OF_MEMORY,
                      "glTexImage%uD(image too large: %d x %d x %d, %s format)",
                      dims, width, height, depth,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return;
       }
 
@@ -3527,7 +3527,7 @@ _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims,
    if (!legal_texsubimage_target(ctx, dims, target, dsa)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sSubImage%uD(target=%s)",
                   dsa ? "ture" : "",
-                  dims, _mesa_lookup_enum_by_nr(target));
+                  dims, _mesa_enum_to_string(target));
       return;
    }
 
@@ -3597,10 +3597,10 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexSubImage%uD %s %d %d %d %d %d %d %d %s %s %p\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), level,
+                  _mesa_enum_to_string(target), level,
                   xoffset, yoffset, zoffset, width, height, depth,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type), pixels);
 
    _mesa_texture_sub_image(ctx, dims, texObj, texImage, target, level,
                            xoffset, yoffset, zoffset, width, height, depth,
@@ -3629,8 +3629,8 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
                   "glTextureSubImage%uD %d %d %d %d %d %d %d %d %s %s %p\n",
                   dims, texture, level,
                   xoffset, yoffset, zoffset, width, height, depth,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type), pixels);
 
    /* Get the texture object by Name. */
    texObj = _mesa_lookup_texture(ctx, texture);
@@ -3891,8 +3891,8 @@ copyteximage(struct gl_context *ctx, GLuint dims,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glCopyTexImage%uD %s %d %s %d %d %d %d %d\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat),
+                  _mesa_enum_to_string(target), level,
+                  _mesa_enum_to_string(internalFormat),
                   x, y, width, height, border);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
@@ -4051,7 +4051,7 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "%s %s %d %d %d %d %d %d %d %d\n", caller,
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   level, xoffset, yoffset, zoffset, x, y, width, height);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
@@ -4113,7 +4113,7 @@ _mesa_CopyTexSubImage1D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 1, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4141,7 +4141,7 @@ _mesa_CopyTexSubImage2D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 2, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4170,7 +4170,7 @@ _mesa_CopyTexSubImage3D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 3, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4198,7 +4198,7 @@ _mesa_CopyTextureSubImage1D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 1, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4222,7 +4222,7 @@ _mesa_CopyTextureSubImage2D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 2, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4249,7 +4249,7 @@ _mesa_CopyTextureSubImage3D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 3, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4296,8 +4296,8 @@ check_clear_tex_image(struct gl_context *ctx,
       _mesa_error(ctx, err,
                   "%s(incompatible format = %s, type = %s)",
                   function,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return false;
    }
 
@@ -4306,8 +4306,8 @@ check_clear_tex_image(struct gl_context *ctx,
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(incompatible internalFormat = %s, format = %s)",
                   function,
-                  _mesa_lookup_enum_by_nr(internalFormat),
-                  _mesa_lookup_enum_by_nr(format));
+                  _mesa_enum_to_string(internalFormat),
+                  _mesa_enum_to_string(format));
       return false;
    }
 
@@ -4549,7 +4549,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
 
    if (dsa && target == GL_TEXTURE_RECTANGLE) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -4604,8 +4604,8 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
          if (invalidformat) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(invalid target %s for format %s)", caller,
-                        _mesa_lookup_enum_by_nr(target),
-                        _mesa_lookup_enum_by_nr(format));
+                        _mesa_enum_to_string(target),
+                        _mesa_enum_to_string(format));
             return GL_TRUE;
          }
       }
@@ -4620,7 +4620,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
 
    if (!targetOK) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -5604,14 +5604,14 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    if (immutable && !_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
             "%s(internalformat=%s not legal for immutable-format)",
-            func, _mesa_lookup_enum_by_nr(internalformat));
+            func, _mesa_enum_to_string(internalformat));
       return;
    }
 
    if (!is_renderable_texture_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
             "%s(internalformat=%s)",
-            func, _mesa_lookup_enum_by_nr(internalformat));
+            func, _mesa_enum_to_string(internalformat));
       return;
    }
 
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index c563f1e7434..094c3a61a71 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -1255,7 +1255,7 @@ create_textures(struct gl_context *ctx, GLenum target,
          if (targetIndex < 0) { /* Bad Target */
             mtx_unlock(&ctx->Shared->Mutex);
             _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)",
-                        func, _mesa_lookup_enum_by_nr(texObj->Target));
+                        func, _mesa_enum_to_string(texObj->Target));
             return;
          }
          assert(targetIndex < NUM_TEXTURE_TARGETS);
@@ -1642,7 +1642,7 @@ _mesa_BindTexture( GLenum target, GLuint texName )
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTexture %s %d\n",
-                  _mesa_lookup_enum_by_nr(target), (GLint) texName);
+                  _mesa_enum_to_string(target), (GLint) texName);
 
    targetIndex = _mesa_tex_target_to_index(ctx, target);
    if (targetIndex < 0) {
@@ -1806,7 +1806,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture)
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTextureUnit %s %d\n",
-                  _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit), (GLint) texture);
+                  _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture);
 
    /* Section 8.1 (Texture Objects) of the OpenGL 4.5 core profile spec
     * (20141030) says:
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index d74134f41b1..efdeed529fa 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -381,7 +381,7 @@ set_tex_parameteri(struct gl_context *ctx,
       if (texObj->Target == GL_TEXTURE_RECTANGLE_ARB && params[0] != 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glTex%sParameter(target=%s, param=%d)", suffix,
-                     _mesa_lookup_enum_by_nr(texObj->Target), params[0]);
+                     _mesa_enum_to_string(texObj->Target), params[0]);
          return GL_FALSE;
       }
       incomplete(ctx, texObj);
@@ -610,22 +610,22 @@ set_tex_parameteri(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_param:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(param=%s)",
-               suffix, _mesa_lookup_enum_by_nr(params[0]));
+               suffix, _mesa_enum_to_string(params[0]));
    return GL_FALSE;
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 }
 
@@ -745,12 +745,12 @@ set_tex_parameterf(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 }
 
@@ -1395,7 +1395,7 @@ get_tex_level_parameter_image(struct gl_context *ctx,
     else {
        _mesa_error(ctx, GL_INVALID_OPERATION,
                    "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-                   _mesa_lookup_enum_by_nr(pname));
+                   _mesa_enum_to_string(pname));
     }
          break;
       case GL_TEXTURE_COMPRESSED:
@@ -1444,7 +1444,7 @@ get_tex_level_parameter_image(struct gl_context *ctx,
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1528,7 +1528,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          /* Always illegal for GL_TEXTURE_BUFFER */
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
          break;
 
       /* GL_ARB_texture_float */
@@ -1557,7 +1557,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1586,7 +1586,7 @@ get_tex_level_parameteriv(struct gl_context *ctx,
    if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetTex%sLevelParameter[if]v(target=%s)", suffix,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 1af9d47f030..1e75e0a2ff1 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -123,21 +123,21 @@ _mesa_print_texunit_state( struct gl_context *ctx, GLuint unit )
 {
    const struct gl_texture_unit *texUnit = ctx->Texture.Unit + unit;
    printf("Texture Unit %d\n", unit);
-   printf("  GL_TEXTURE_ENV_MODE = %s\n", _mesa_lookup_enum_by_nr(texUnit->EnvMode));
-   printf("  GL_COMBINE_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeRGB));
-   printf("  GL_COMBINE_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeA));
-   printf("  GL_SOURCE0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[0]));
-   printf("  GL_SOURCE1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[1]));
-   printf("  GL_SOURCE2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[2]));
-   printf("  GL_SOURCE0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[0]));
-   printf("  GL_SOURCE1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[1]));
-   printf("  GL_SOURCE2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[2]));
-   printf("  GL_OPERAND0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[0]));
-   printf("  GL_OPERAND1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[1]));
-   printf("  GL_OPERAND2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[2]));
-   printf("  GL_OPERAND0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[0]));
-   printf("  GL_OPERAND1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[1]));
-   printf("  GL_OPERAND2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[2]));
+   printf("  GL_TEXTURE_ENV_MODE = %s\n", _mesa_enum_to_string(texUnit->EnvMode));
+   printf("  GL_COMBINE_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeRGB));
+   printf("  GL_COMBINE_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeA));
+   printf("  GL_SOURCE0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[0]));
+   printf("  GL_SOURCE1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[1]));
+   printf("  GL_SOURCE2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[2]));
+   printf("  GL_SOURCE0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[0]));
+   printf("  GL_SOURCE1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[1]));
+   printf("  GL_SOURCE2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[2]));
+   printf("  GL_OPERAND0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[0]));
+   printf("  GL_OPERAND1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[1]));
+   printf("  GL_OPERAND2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[2]));
+   printf("  GL_OPERAND0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[0]));
+   printf("  GL_OPERAND1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[1]));
+   printf("  GL_OPERAND2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[2]));
    printf("  GL_RGB_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftRGB);
    printf("  GL_ALPHA_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftA);
    printf("  GL_TEXTURE_ENV_COLOR = (%f, %f, %f, %f)\n", texUnit->EnvColor[0], texUnit->EnvColor[1], texUnit->EnvColor[2], texUnit->EnvColor[3]);
@@ -295,11 +295,11 @@ _mesa_ActiveTexture(GLenum texture)
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glActiveTexture %s\n",
-                  _mesa_lookup_enum_by_nr(texture));
+                  _mesa_enum_to_string(texture));
 
    if (texUnit >= k) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)",
-                  _mesa_lookup_enum_by_nr(texture));
+                  _mesa_enum_to_string(texture));
       return;
    }
 
@@ -325,7 +325,7 @@ _mesa_ClientActiveTexture(GLenum texture)
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glClientActiveTexture %s\n",
-                  _mesa_lookup_enum_by_nr(texture));
+                  _mesa_enum_to_string(texture));
 
    if (texUnit >= ctx->Const.MaxTextureCoordUnits) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)");
diff --git a/src/mesa/main/texstate.h b/src/mesa/main/texstate.h
index 662435b47cc..bee8c9c3316 100644
--- a/src/mesa/main/texstate.h
+++ b/src/mesa/main/texstate.h
@@ -77,7 +77,7 @@ _mesa_get_tex_unit_err(struct gl_context *ctx, GLuint unit, const char *func)
     *     implementation."
     */
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unit=%s)", func,
-               _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit));
+               _mesa_enum_to_string(GL_TEXTURE0+unit));
    return NULL;
 }
 
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index aa8fa3e6343..4a2cc6065df 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -308,7 +308,7 @@ tex_storage_error_check(struct gl_context *ctx,
       _mesa_error(ctx, _mesa_is_desktop_gl(ctx)?
                   GL_INVALID_ENUM : GL_INVALID_OPERATION,
                   "glTex%sStorage%dD(internalformat = %s)", suffix, dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return GL_TRUE;
    }
 
@@ -465,21 +465,21 @@ texstorage(GLuint dims, GLenum target, GLsizei levels, GLenum internalformat,
    if (!legal_texobj_target(ctx, dims, target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTexStorage%uD(illegal target=%s)",
-                  dims, _mesa_lookup_enum_by_nr(target));
+                  dims, _mesa_enum_to_string(target));
       return;
    }
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexStorage%uD %s %d %s %d %d %d\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), levels,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  _mesa_enum_to_string(target), levels,
+                  _mesa_enum_to_string(internalformat),
                   width, height, depth);
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTexStorage%uD(internalformat = %s)", dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -505,14 +505,14 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTextureStorage%uD %d %d %s %d %d %d\n",
                   dims, texture, levels,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  _mesa_enum_to_string(internalformat),
                   width, height, depth);
 
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTextureStorage%uD(internalformat = %s)", dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -530,7 +530,7 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels,
    if (!legal_texobj_target(ctx, dims, texObj->Target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTextureStorage%uD(illegal target=%s)",
-                  dims, _mesa_lookup_enum_by_nr(texObj->Target));
+                  dims, _mesa_enum_to_string(texObj->Target));
       return;
    }
 
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 6b0aed4ea1a..5a3282a40c1 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -313,7 +313,7 @@ target_valid(struct gl_context *ctx, GLenum origTarget, GLenum newTarget)
    }
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "glTextureView(illegal target=%s)",
-               _mesa_lookup_enum_by_nr(newTarget));
+               _mesa_enum_to_string(newTarget));
    return false;
 }
 #undef RETURN_IF_SUPPORTED
@@ -435,8 +435,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTextureView %d %s %d %s %d %d %d %d\n",
-                  texture, _mesa_lookup_enum_by_nr(target), origtexture,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  texture, _mesa_enum_to_string(target), origtexture,
+                  _mesa_enum_to_string(internalformat),
                   minlevel, numlevels, minlayer, numlayers);
 
    if (origtexture == 0) {
@@ -523,8 +523,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                                    internalformat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
           "glTextureView(internalformat %s not compatible with origtexture %s)",
-          _mesa_lookup_enum_by_nr(internalformat),
-          _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
+          _mesa_enum_to_string(internalformat),
+          _mesa_enum_to_string(origTexObj->Image[0][0]->InternalFormat));
       return;
    }
 
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 5548d1d026f..a577a2569f2 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1113,7 +1113,7 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(pname 0x%x (%s))", caller, pname,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 }
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index ebdd9eaf02e..3bab9850588 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -300,7 +300,7 @@ update_array_format(struct gl_context *ctx,
    typeBit = type_to_bit(ctx, type);
    if (typeBit == 0x0 || (typeBit & legalTypesMask) == 0x0) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)",
-                  func, _mesa_lookup_enum_by_nr(type));
+                  func, _mesa_enum_to_string(type));
       return false;
    }
 
@@ -333,7 +333,7 @@ update_array_format(struct gl_context *ctx,
 
       if (bgra_error) {
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(size=GL_BGRA and type=%s)",
-                     func, _mesa_lookup_enum_by_nr(type));
+                     func, _mesa_enum_to_string(type));
          return false;
       }
 
@@ -2310,7 +2310,7 @@ print_array(const char *name, GLint index, const struct gl_client_array *array)
    else
       fprintf(stderr, "  %s: ", name);
    fprintf(stderr, "Ptr=%p, Type=%s, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
-           array->Ptr, _mesa_lookup_enum_by_nr(array->Type), array->Size,
+           array->Ptr, _mesa_enum_to_string(array->Type), array->Size,
            array->_ElementSize, array->StrideB, array->BufferObj->Name,
            (unsigned long) array->BufferObj->Size);
 }
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index b27063031c4..9917f2de299 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -391,8 +391,8 @@ _mesa_ClipControl(GLenum origin, GLenum depth)
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glClipControl(%s, %s)\n",
-	          _mesa_lookup_enum_by_nr(origin),
-                  _mesa_lookup_enum_by_nr(depth));
+	          _mesa_enum_to_string(origin),
+                  _mesa_enum_to_string(depth));
 
    ASSERT_OUTSIDE_BEGIN_END(ctx);
 
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index 6adf1dce676..d5dcb5e8a7d 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -451,7 +451,7 @@ void _tnl_draw_prims(struct gl_context *ctx,
       printf("%s %d..%d\n", __func__, min_index, max_index);
       for (i = 0; i < nr_prims; i++)
 	 printf("prim %d: %s start %d count %d\n", i, 
-		_mesa_lookup_enum_by_nr(prim[i].mode),
+		_mesa_enum_to_string(prim[i].mode),
 		prim[i].start,
 		prim[i].count);
    }
diff --git a/src/mesa/tnl/t_vb_render.c b/src/mesa/tnl/t_vb_render.c
index 4960ac0969e..03e8fcfa196 100644
--- a/src/mesa/tnl/t_vb_render.c
+++ b/src/mesa/tnl/t_vb_render.c
@@ -315,7 +315,7 @@ static GLboolean run_render( struct gl_context *ctx,
 
 	 if (MESA_VERBOSE & VERBOSE_PRIMS) 
 	    _mesa_debug(NULL, "MESA prim %s %d..%d\n", 
-			_mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+			_mesa_enum_to_string(prim & PRIM_MODE_MASK), 
 			start, start+length);
 
 	 if (length)
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index 30dc1a72080..14e7812ec78 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -592,7 +592,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	    break;
 	 case GL_UNSIGNED_SHORT:
 	 default:
-	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+	    printf("unknown CHAN_TYPE %s\n", _mesa_enum_to_string(CHAN_TYPE));
 	    return GL_FALSE;
 	 }
 	 break;
diff --git a/src/mesa/tnl_dd/t_dd_dmatmp.h b/src/mesa/tnl_dd/t_dd_dmatmp.h
index 667e2a6e5d5..7be39541e43 100644
--- a/src/mesa/tnl_dd/t_dd_dmatmp.h
+++ b/src/mesa/tnl_dd/t_dd_dmatmp.h
@@ -1256,7 +1256,7 @@ static GLboolean TAG(validate_render)( struct gl_context *ctx,
       }
       
       if (!ok) {
-/* 	 fprintf(stderr, "not ok %s\n", _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK)); */
+/* 	 fprintf(stderr, "not ok %s\n", _mesa_enum_to_string(prim & PRIM_MODE_MASK)); */
 	 return GL_FALSE;
       }
    }
diff --git a/src/mesa/tnl_dd/t_dd_unfilled.h b/src/mesa/tnl_dd/t_dd_unfilled.h
index 82190c08916..ee15e773c88 100644
--- a/src/mesa/tnl_dd/t_dd_unfilled.h
+++ b/src/mesa/tnl_dd/t_dd_unfilled.h
@@ -60,7 +60,7 @@ static void TAG(unfilled_tri)( struct gl_context *ctx,
    }
 
 /*     fprintf(stderr, "%s %s %d %d %d\n", __func__, */
-/*  	   _mesa_lookup_enum_by_nr( mode ), */
+/*  	   _mesa_enum_to_string( mode ), */
 /*  	   ef[e0], ef[e1], ef[e2]); */
 
    if (mode == GL_POINT) {
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 72b8206ec23..29889382fb1 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -786,7 +786,7 @@ vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, count);
+                  _mesa_enum_to_string(mode), start, count);
 
    if (!_mesa_validate_DrawArrays(ctx, mode, count))
       return;
@@ -813,7 +813,7 @@ vbo_exec_DrawArraysInstanced(GLenum mode, GLint start, GLsizei count,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, count, numInstances);
+                  _mesa_enum_to_string(mode), start, count, numInstances);
 
    if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count, numInstances))
       return;
@@ -839,7 +839,7 @@ vbo_exec_DrawArraysInstancedBaseInstance(GLenum mode, GLint first, GLsizei count
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysInstancedBaseInstance(%s, %d, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), first, count,
+                  _mesa_enum_to_string(mode), first, count,
                   numInstances, baseInstance);
 
    if (!_mesa_validate_DrawArraysInstanced(ctx, mode, first, count,
@@ -1021,8 +1021,8 @@ vbo_exec_DrawRangeElementsBaseVertex(GLenum mode,
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx,
                 "glDrawRangeElementsBaseVertex(%s, %u, %u, %d, %s, %p, %d)\n",
-                _mesa_lookup_enum_by_nr(mode), start, end, count,
-                _mesa_lookup_enum_by_nr(type), indices, basevertex);
+                _mesa_enum_to_string(mode), start, end, count,
+                _mesa_enum_to_string(type), indices, basevertex);
 
    if (!_mesa_validate_DrawRangeElements(ctx, mode, start, end, count,
                                          type, indices))
@@ -1099,8 +1099,8 @@ vbo_exec_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
       GET_CURRENT_CONTEXT(ctx);
       _mesa_debug(ctx,
                   "glDrawRangeElements(%s, %u, %u, %d, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, end, count,
-                  _mesa_lookup_enum_by_nr(type), indices);
+                  _mesa_enum_to_string(mode), start, end, count,
+                  _mesa_enum_to_string(type), indices);
    }
 
    vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type,
@@ -1119,8 +1119,8 @@ vbo_exec_DrawElements(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices);
 
    if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
       return;
@@ -1141,8 +1141,8 @@ vbo_exec_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices, basevertex);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices, basevertex);
 
    if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
       return;
@@ -1163,8 +1163,8 @@ vbo_exec_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices, numInstances);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices, numInstances);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
                                              numInstances))
@@ -1187,8 +1187,8 @@ vbo_exec_DrawElementsInstancedBaseVertex(GLenum mode, GLsizei count, GLenum type
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseVertex(%s, %d, %s, %p, %d; %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, basevertex);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1212,8 +1212,8 @@ vbo_exec_DrawElementsInstancedBaseInstance(GLenum mode, GLsizei count, GLenum ty
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseInstance(%s, %d, %s, %p, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, baseInstance);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1238,8 +1238,8 @@ vbo_exec_DrawElementsInstancedBaseVertexBaseInstance(GLenum mode, GLsizei count,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseVertexBaseInstance(%s, %d, %s, %p, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, basevertex, baseInstance);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1488,7 +1488,7 @@ vbo_exec_DrawTransformFeedback(GLenum mode, GLuint name)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedback(%s, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), name);
+                  _mesa_enum_to_string(mode), name);
 
    vbo_draw_transform_feedback(ctx, mode, obj, 0, 1);
 }
@@ -1502,7 +1502,7 @@ vbo_exec_DrawTransformFeedbackStream(GLenum mode, GLuint name, GLuint stream)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackStream(%s, %u, %u)\n",
-                  _mesa_lookup_enum_by_nr(mode), name, stream);
+                  _mesa_enum_to_string(mode), name, stream);
 
    vbo_draw_transform_feedback(ctx, mode, obj, stream, 1);
 }
@@ -1517,7 +1517,7 @@ vbo_exec_DrawTransformFeedbackInstanced(GLenum mode, GLuint name,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackInstanced(%s, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), name);
+                  _mesa_enum_to_string(mode), name);
 
    vbo_draw_transform_feedback(ctx, mode, obj, 0, primcount);
 }
@@ -1533,7 +1533,7 @@ vbo_exec_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name,
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackStreamInstanced"
                   "(%s, %u, %u, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode), name, stream, primcount);
+                  _mesa_enum_to_string(mode), name, stream, primcount);
 
    vbo_draw_transform_feedback(ctx, mode, obj, stream, primcount);
 }
@@ -1709,7 +1709,7 @@ vbo_exec_DrawArraysIndirect(GLenum mode, const GLvoid *indirect)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysIndirect(%s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), indirect);
+                  _mesa_enum_to_string(mode), indirect);
 
    if (!_mesa_validate_DrawArraysIndirect(ctx, mode, indirect))
       return;
@@ -1725,8 +1725,8 @@ vbo_exec_DrawElementsIndirect(GLenum mode, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsIndirect(%s, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode),
-                  _mesa_lookup_enum_by_nr(type), indirect);
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect);
 
    if (!_mesa_validate_DrawElementsIndirect(ctx, mode, type, indirect))
       return;
@@ -1743,7 +1743,7 @@ vbo_exec_MultiDrawArraysIndirect(GLenum mode,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glMultiDrawArraysIndirect(%s, %p, %i, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode), indirect, primcount, stride);
+                  _mesa_enum_to_string(mode), indirect, primcount, stride);
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)
@@ -1768,8 +1768,8 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glMultiDrawElementsIndirect(%s, %s, %p, %i, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode),
-                  _mesa_lookup_enum_by_nr(type), indirect, primcount, stride);
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect, primcount, stride);
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)

From b298311d517017834841e53b7e641738e6067cdc Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 20 Jul 2015 12:58:12 +0200
Subject: [PATCH 0396/1208] i965/vec4: Fix liveness analysis with
 BRW_OPCODE_SEL

We only consider a vgrf defined by a given block if the block writes to it
unconditionally. So far we have been checking this by testing that the
instruction is not predicated, however, in the case of BRW_OPCODE_SEL,
the predication is used to select the value to write, not to decide if
the write is actually done. The consequence of this was increased life
spans for affected vgrfs, which could lead to additional register pressure.

Since NIR generates selects for conditional writes this was causing massive
register pressure in a handful of piglit and dEQP tests that had a large
number of select operations with the NIR-vec4 backend.

Fixes the following piglit tests with the NIR-vec4 backend:
spec/glsl-1.50/execution/variable-indexing/vs-output-array-vec4-index-wr-before-gs
spec/glsl-1.50/execution/variable-indexing/gs-input-array-vec4-index-rd
spec/glsl-1.50/execution/variable-indexing/vs-output-array-vec2-index-wr-before-gs
spec/glsl-1.50/execution/variable-indexing/vs-output-array-vec3-index-wr-before-gs
spec/glsl-1.50/execution/variable-indexing/vs-output-array-float-index-wr-before-gs

Fixes 80 dEQP tests with the NIR-vec4 backend in the following category:
dEQP-GLES3.functional.ubo.*

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index 29b4a53418a..cc688ef8083 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -96,7 +96,8 @@ vec4_live_variables::setup_def_use()
 	  * are the things that screen off preceding definitions of a
 	  * variable, and thus qualify for being in def[].
 	  */
-	 if (inst->dst.file == GRF && !inst->predicate) {
+	 if (inst->dst.file == GRF &&
+	     (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) {
             for (unsigned i = 0; i < inst->regs_written; i++) {
                for (int c = 0; c < 4; c++) {
                   if (inst->dst.writemask & (1 << c)) {

From 5f8d9ae5a54961deb02eb52e924a84b99b60f035 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 14:50:24 +0300
Subject: [PATCH 0397/1208] i965/fs: Fix stride for immediate registers.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the width field was removed from fs_reg the BROADCAST handling
code in opt_algebraic() started to miss a number of trivial
optimization cases resulting in the ugly indirect-addressing sequence
to be emitted unnecessarily for some variable-indexed texturing and
UBO loads regardless of one of the sources of BROADCAST being
immediate.  Apparently the reason was that we were setting the stride
field to one for immediates even though they are typically uniform.
Width used to be set to one too which is why this optimization used to
work previously until the "reg.width == 1" check was removed.

The stride field of vector immediates is intentionally left equal to
one, because they are strictly speaking not uniform.  The assertion in
fs_generator makes sure that immediates have the expected stride as
consistency check.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp           | 3 +++
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 51ef32c1b6a..f7fdb174efb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -362,6 +362,7 @@ fs_reg::fs_reg(float f)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.f = f;
 }
 
@@ -371,6 +372,7 @@ fs_reg::fs_reg(int32_t i)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.d = i;
 }
 
@@ -380,6 +382,7 @@ fs_reg::fs_reg(uint32_t u)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.ud = u;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index bae72167972..7f5ac6b9665 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
       break;
    case IMM:
+      assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V ||
+                              reg->type == BRW_REGISTER_TYPE_UV ||
+                              reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0));
+
       switch (reg->type) {
       case BRW_REGISTER_TYPE_F:
 	 brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);

From 9383664a9cbc5bc4858fc50d7fa565f43028d779 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:29:39 +0300
Subject: [PATCH 0398/1208] i965/fs: Fix stride field for uniforms.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes essentially the same problem as for immediates.  Registers
of the UNIFORM file are typically accessed according to the formula:

 read_uniform(r, channel_index, array_index) =
    read_element(r, channel_index * 0 + array_index * 1)

Which matches the general direct addressing formula for stride=0:

 read_direct(r, channel_index, array_index) =
    read_element(r, channel_index * stride +
                    array_index * max{1, stride * width})

In either case if reladdr is present the access will be according to
the composition of two register regions, the first one determining the
per-channel array_index used for the second, like:

 read_indirect(r, channel_index, array_index) =
    read_direct(r, channel_index,
                read(r.reladdr, channel_index, array_index))

where:
 read(r, channel_index, array_index) = if r.reladdr == NULL
    then read_direct(r, channel_index, array_index)
    else read_indirect(r, channel_index, array_index)

In conclusion we can handle uniforms consistently with the other
register files if we set stride to zero.  After lowering to a GRF
using VARYING_PULL_CONSTANT_LOAD in demote_pull_constant_loads() the
stride of the source is set to one again because the result of
VARYING_PULL_CONSTANT_LOAD is generally non-uniform.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f7fdb174efb..9af15377361 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -823,6 +823,7 @@ fs_reg::fs_reg(enum register_file file, int reg)
    this->file = file;
    this->reg = reg;
    this->type = BRW_REGISTER_TYPE_F;
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /** Fixed HW reg constructor. */
@@ -832,6 +833,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
    this->file = file;
    this->reg = reg;
    this->type = type;
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
@@ -1272,6 +1274,7 @@ fs_visitor::assign_curb_setup()
 						  constant_nr / 8,
 						  constant_nr % 8);
 
+            assert(inst->src[i].stride == 0);
 	    inst->src[i].file = HW_REG;
 	    inst->src[i].fixed_hw_reg = byte_offset(
                retype(brw_reg, inst->src[i].type),
@@ -1822,6 +1825,8 @@ fs_visitor::demote_pull_constants()
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
+         assert(inst->src[i].stride == 0);
+
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
@@ -1829,6 +1834,7 @@ fs_visitor::demote_pull_constants()
                                        *inst->src[i].reladdr,
                                        pull_index);
             inst->src[i].reladdr = NULL;
+            inst->src[i].stride = 1;
          } else {
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
             ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,

From fadf34773527779eef4622b2586d87ec00476c0f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:52:28 +0300
Subject: [PATCH 0399/1208] i965: Fix stride field for the result of
 emit_uniformize().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is essentially the same problem fixed in an earlier patch for
immediates.  Setting the stride to zero will be particularly useful
for my future SIMD lowering pass, because we will be able to just
check whether the stride of a source register is zero and skip
emitting the copies required to unzip it in that case.

Instead of setting stride to zero in every caller of emit_uniformize()
I've changed the function to return the result as its return value
(previously it was being written into a caller-provided destination
register), because this way we can enforce that the result is used with
the correct regioning from the function itself.

The changes to the prototype of its VEC4 counterpart are mainly for
the sake of symmetry, VEC4 registers don't have stride.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h     | 16 +++++++++-------
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp       |  4 ++--
 src/mesa/drivers/dri/i965/brw_vec4.h           |  7 +++++--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 18 ++++++++++--------
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 34646d7fd74..c4ee9d484b7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -350,17 +350,19 @@ namespace brw {
       }
 
       /**
-       * Copy any live channel from \p src to the first channel of \p dst.
+       * Copy any live channel from \p src to the first channel of the result.
        */
-      void
-      emit_uniformize(const dst_reg &dst, const src_reg &src) const
+      src_reg
+      emit_uniformize(const src_reg &src) const
       {
          const fs_builder ubld = exec_all();
-         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+         const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
+         const dst_reg dst = component(vgrf(src.type), 0);
 
-         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
-         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
-                   src, component(chan_index, 0));
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
+
+         return src_reg(dst);
       }
 
       /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 12cd4530c37..6c979334ac7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1378,7 +1378,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[0]),
                  fs_reg(stage_prog_data->binding_table.ubo_start));
-         bld.emit_uniformize(surf_index, surf_index);
+         surf_index = bld.emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -1673,7 +1673,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
          bld.ADD(sampler_reg, src, fs_reg(sampler));
-         bld.emit_uniformize(sampler_reg, sampler_reg);
+         sampler_reg = bld.emit_uniformize(sampler_reg);
          break;
       }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 36436511df0..7bf027a0306 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -293,8 +293,11 @@ public:
    void emit_lrp(const dst_reg &dst,
                  const src_reg &x, const src_reg &y, const src_reg &a);
 
-   /** Copy any live channel from \p src to the first channel of \p dst. */
-   void emit_uniformize(const dst_reg &dst, const src_reg &src);
+   /**
+    * Copy any live channel from \p src to the first channel of the
+    * result.
+    */
+   src_reg emit_uniformize(const src_reg &src);
 
    void emit_block_move(dst_reg *dst, src_reg *src,
                         const struct glsl_type *type, brw_predicate predicate);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index f351bf4075b..a6eee47fcb0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1374,15 +1374,19 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
       emit(pull);
 }
 
-void
-vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
+src_reg
+vec4_visitor::emit_uniformize(const src_reg &src)
 {
    const src_reg chan_index(this, glsl_type::uint_type);
+   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
+                              src.type);
 
    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
       ->force_writemask_all = true;
    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
       ->force_writemask_all = true;
+
+   return src_reg(dst);
 }
 
 void
@@ -1826,7 +1830,7 @@ vec4_visitor::visit(ir_expression *ir)
          surf_index = src_reg(this, glsl_type::uint_type);
          emit(ADD(dst_reg(surf_index), op[0],
                   src_reg(prog_data->base.binding_table.ubo_start)));
-         emit_uniformize(dst_reg(surf_index), surf_index);
+         surf_index = emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -2522,11 +2526,9 @@ vec4_visitor::visit(ir_texture *ir)
 
       /* Emit code to evaluate the actual indexing expression */
       nonconst_sampler_index->accept(this);
-      dst_reg temp(this, glsl_type::uint_type);
-      emit(ADD(temp, this->result, src_reg(sampler)));
-      emit_uniformize(temp, src_reg(temp));
-
-      sampler_reg = src_reg(temp);
+      src_reg temp(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
+      sampler_reg = emit_uniformize(temp);
    } else {
       /* Single sampler, or constant array index; the indexing expression
        * is just an immediate.

From 545dec5b3efeab7691ab3eb1436747048f241cf9 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 17 Jul 2015 14:14:29 -0700
Subject: [PATCH 0400/1208] Revert "i965/gen9: Plugin the code for selecting
 YF/YS tiling on skl+"

Commit c9dbdc0 introduced some dead code which is supposed to be used
once we have Yf/Ys tiling working and performing better. Ken reported
the issue that static analysis tool now shows warnings due to the dead
code. To fix these warnings, this patch reverts the changes made in
commit c9dbdc0.

It'll be better to add the Yf/Ys tiling selection code later, when we
are ready to use it.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 113 ++++-----------------
 1 file changed, 19 insertions(+), 94 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index a12b4af579e..85c0864d2c8 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -807,107 +807,32 @@ intel_miptree_set_alignment(struct brw_context *brw,
    }
 }
 
-static void
-intel_miptree_release_levels(struct intel_mipmap_tree *mt)
-{
-   unsigned int level = 0;
-
-   for (level = mt->first_level; level <= mt->last_level; level++) {
-      free(mt->level[level].slice);
-      mt->level[level].slice = NULL;
-   }
-}
-
-static bool
-intel_miptree_can_use_tr_mode(const struct intel_mipmap_tree *mt)
-{
-   if (mt->tiling == I915_TILING_Y ||
-       mt->tiling == (I915_TILING_Y | I915_TILING_X) ||
-       mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE) {
-      /* FIXME: Don't allow YS tiling at the moment. Using 64KB tiling
-       * for small textures might result in to memory wastage. Revisit
-       * this condition when we have more information about the specific
-       * cases where using YS over YF will be useful.
-       */
-      if (mt->tr_mode != INTEL_MIPTREE_TRMODE_YS)
-         return true;
-   }
-   return false;
-}
-
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
                    uint32_t layout_flags)
 {
-   const unsigned bpp = mt->cpp * 8;
-   /* Enable YF/YS tiling only for color surfaces because depth and
-    * stencil surfaces are not supported in blitter using fast copy
-    * blit and meta PBO upload, download paths. No other paths
-    * currently support Yf/Ys tiled surfaces.
-    * FINISHME:  Remove this restriction once we have a tiled_memcpy()
-    * path to do depth/stencil data upload/download to Yf/Ys tiled
-    * surfaces.
-    */
-   const bool is_tr_mode_yf_ys_allowed =
-      brw->gen >= 9 &&
-      !(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
-      !mt->compressed &&
-      _mesa_is_format_color_format(mt->format) &&
-      (layout_flags & MIPTREE_LAYOUT_ALLOC_YTILED) &&
-      (bpp && is_power_of_two(bpp)) &&
-      /* FIXME: To avoid piglit regressions keep the Yf/Ys tiling
-       * disabled at the moment.
-       */
-      false;
+   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
 
-   /* Lower index (Yf) is the higher priority mode */
-   const uint32_t tr_mode[3] = {INTEL_MIPTREE_TRMODE_YF,
-                                INTEL_MIPTREE_TRMODE_YS,
-                                INTEL_MIPTREE_TRMODE_NONE};
-   int i = is_tr_mode_yf_ys_allowed ? 0 : ARRAY_SIZE(tr_mode) - 1;
+   intel_miptree_set_alignment(brw, mt, layout_flags);
+   intel_miptree_set_total_width_height(brw, mt);
 
-   while (i < ARRAY_SIZE(tr_mode)) {
-      if (brw->gen < 9)
-         assert(tr_mode[i] == INTEL_MIPTREE_TRMODE_NONE);
-      else
-         assert(tr_mode[i] == INTEL_MIPTREE_TRMODE_YF ||
-                tr_mode[i] == INTEL_MIPTREE_TRMODE_YS ||
-                tr_mode[i] == INTEL_MIPTREE_TRMODE_NONE);
-
-      mt->tr_mode = tr_mode[i];
-      intel_miptree_set_alignment(brw, mt, layout_flags);
-      intel_miptree_set_total_width_height(brw, mt);
-
-      if (!mt->total_width || !mt->total_height) {
-         intel_miptree_release(&mt);
-         break;
-      }
-
-      /* On Gen9+ the alignment values are expressed in multiples of the
-       * block size.
-       */
-      if (brw->gen >= 9) {
-         unsigned int i, j;
-         _mesa_get_format_block_size(mt->format, &i, &j);
-         mt->align_w /= i;
-         mt->align_h /= j;
-      }
-
-      /* If there is already a BO, we cannot effect tiling modes */
-      if (layout_flags & MIPTREE_LAYOUT_FOR_BO)
-         break;
-
-      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
-      if (is_tr_mode_yf_ys_allowed) {
-         if (intel_miptree_can_use_tr_mode(mt))
-            break;
-         /* Failed to use selected tr_mode. Free up the memory allocated
-          * for miptree levels in intel_miptree_total_width_height().
-          */
-         intel_miptree_release_levels(mt);
-      }
-      i++;
+   if (!mt->total_width || !mt->total_height) {
+      intel_miptree_release(&mt);
+      return;
    }
+
+   /* On Gen9+ the alignment values are expressed in multiples of the block
+    * size
+    */
+   if (brw->gen >= 9) {
+      unsigned int i, j;
+      _mesa_get_format_block_size(mt->format, &i, &j);
+      mt->align_w /= i;
+      mt->align_h /= j;
+   }
+
+   if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
+      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
 }
 

From 5b4a7ec8f1d2eee12895541bb5c7d15382370884 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Tue, 21 Jul 2015 12:08:20 -0400
Subject: [PATCH 0401/1208] r600/sb: Fix an &/&& mistake

gcc says:

    sb/sb_sched.cpp: In member function 'bool r600_sb::alu_group_tracker::try_reserve(r600_sb::alu_node*)':
    sb/sb_sched.cpp:492:7: warning: suggest parentheses around operand of '!' or change '&' to '&&' or '!' to '~' [-Wparentheses]
      if (!trans & fbs)

It happens to be harmless; if fbs is ever non-zero, it will be VEC_210,
which is 5, so (!trans & 5) == 1 and the branch works as expected.  But
logical AND is clearly what was meant.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/gallium/drivers/r600/sb/sb_sched.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index 2e38a62c05a..62680788c5e 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -489,7 +489,7 @@ bool alu_group_tracker::try_reserve(alu_node* n) {
 
 	n->bc.bank_swizzle = 0;
 
-	if (!trans & fbs)
+	if (!trans && fbs)
 		n->bc.bank_swizzle = VEC_210;
 
 	if (gpr.try_reserve(n)) {

From 912921059d137085faef676504bea265328bdde4 Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Mon, 20 Jul 2015 21:52:40 -0700
Subject: [PATCH 0402/1208] st/mesa: Silence GCC unused-variable warning.

Silence a release build warning.

st_glsl_to_tgsi.cpp: In function 'pipe_error st_translate_program(gl_context*, uint, ureg_program*, glsl_to_tgsi_visitor*, const gl_program*, GLuint, const GLuint*, const GLuint*, const ubyte*, const ubyte*, const GLuint*, const GLuint*, GLuint, const GLuint*, const GLuint*, const ubyte*, const ubyte*, boolean, boolean)':
st_glsl_to_tgsi.cpp:5461:36: warning: unused variable 'pscreen' [-Wunused-variable]
                struct pipe_screen *pscreen = st->pipe->screen;
                                    ^

Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 25e30c7deb2..c9d40c578ef 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5461,6 +5461,7 @@ st_translate_program(
                struct pipe_screen *pscreen = st->pipe->screen;
                assert(procType == TGSI_PROCESSOR_VERTEX);
                assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
+               (void) pscreen;
                if (!ctx->Const.NativeIntegers) {
                   struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
                   ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);

From a62ccdec622ea43a7cdbf572a32dfae19ba9c904 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 21 Jul 2015 21:35:43 +0200
Subject: [PATCH 0403/1208] nv50: force cache flush when binding a new ubo

This fixes the following piglit test:
  ext_transform_feedback-immediate-reuse-uniform-buffer

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_shader_state.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 1ec5642ccfc..9369093614d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -99,6 +99,8 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
 
                BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+
+               nv50->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (i << 8) | p | 0);

From 958b5c31116f46a81249d11033164354ec158556 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 21 Jul 2015 21:58:08 +0200
Subject: [PATCH 0404/1208] nvc0: force cache flush when binding a new ubo

This fixes the following piglit test:
  ext_transform_feedback-immediate-reuse-uniform-buffer

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index b07558ac62c..242831461de 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -455,6 +455,8 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
                PUSH_DATA (push, (i << 4) | 1);
 
                BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD);
+
+               nvc0->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
                BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
                PUSH_DATA (push, (i << 4) | 0);

From a2a1a5805fd617e7f3cc8be44dd79b50da07ebb9 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 20 Jul 2015 19:58:43 -0400
Subject: [PATCH 0405/1208] gallium: replace INLINE with inline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generated by running:
git grep -l INLINE src/gallium/ | xargs sed -i 's/\bINLINE\b/inline/g'
git grep -l INLINE src/mesa/state_tracker/ | xargs sed -i 's/\bINLINE\b/inline/g'
git checkout src/gallium/state_trackers/clover/Doxyfile

and manual edits to
src/gallium/include/pipe/p_compiler.h
src/gallium/README.portability

to remove mentions of the inline define.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/README.portability                |   2 -
 src/gallium/auxiliary/cso_cache/cso_cache.c   |   8 +-
 src/gallium/auxiliary/cso_cache/cso_context.c |   8 +-
 src/gallium/auxiliary/draw/draw_gs.c          |   4 +-
 src/gallium/auxiliary/draw/draw_llvm.c        |   2 +-
 src/gallium/auxiliary/draw/draw_llvm.h        |  10 +-
 src/gallium/auxiliary/draw/draw_pipe.h        |   2 +-
 src/gallium/auxiliary/draw/draw_pipe_aaline.c |   2 +-
 .../auxiliary/draw/draw_pipe_aapoint.c        |   2 +-
 src/gallium/auxiliary/draw/draw_pipe_clip.c   |  10 +-
 src/gallium/auxiliary/draw/draw_pipe_cull.c   |   4 +-
 .../auxiliary/draw/draw_pipe_flatshade.c      |   6 +-
 src/gallium/auxiliary/draw/draw_pipe_offset.c |   2 +-
 .../auxiliary/draw/draw_pipe_pstipple.c       |   2 +-
 .../auxiliary/draw/draw_pipe_stipple.c        |   4 +-
 .../auxiliary/draw/draw_pipe_twoside.c        |   4 +-
 .../auxiliary/draw/draw_pipe_unfilled.c       |   2 +-
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c   |   8 +-
 .../auxiliary/draw/draw_pipe_wide_line.c      |   2 +-
 .../auxiliary/draw/draw_pipe_wide_point.c     |   2 +-
 src/gallium/auxiliary/draw/draw_private.h     |   4 +-
 .../draw/draw_pt_fetch_shade_pipeline.c       |   2 +-
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c  |   2 +-
 src/gallium/auxiliary/draw/draw_pt_post_vs.c  |   4 +-
 src/gallium/auxiliary/draw/draw_pt_so_emit.c  |   2 +-
 src/gallium/auxiliary/draw/draw_pt_vsplit.c   |  12 +-
 .../auxiliary/draw/draw_pt_vsplit_tmp.h       |   2 +-
 src/gallium/auxiliary/draw/draw_vertex.h      |  12 +-
 src/gallium/auxiliary/draw/draw_vs.h          |   4 +-
 src/gallium/auxiliary/gallivm/lp_bld_arit.c   |  14 +--
 src/gallium/auxiliary/gallivm/lp_bld_const.h  |   6 +-
 src/gallium/auxiliary/gallivm/lp_bld_debug.h  |   2 +-
 .../auxiliary/gallivm/lp_bld_format_aos.c     |   4 +-
 .../auxiliary/gallivm/lp_bld_format_yuv.c     |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_limits.h |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |   8 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h   |   4 +-
 .../auxiliary/gallivm/lp_bld_tgsi_info.c      |   2 +-
 .../auxiliary/gallivm/lp_bld_tgsi_soa.c       |   8 +-
 src/gallium/auxiliary/gallivm/lp_bld_type.h   |  30 ++---
 src/gallium/auxiliary/os/os_memory_aligned.h  |   4 +-
 src/gallium/auxiliary/os/os_memory_stdc.h     |   2 +-
 src/gallium/auxiliary/os/os_mman.h            |   4 +-
 src/gallium/auxiliary/os/os_thread.h          |  34 ++---
 src/gallium/auxiliary/os/os_time.h            |   4 +-
 src/gallium/auxiliary/pipebuffer/pb_buffer.h  |  18 +--
 .../auxiliary/pipebuffer/pb_buffer_fenced.c   |  14 +--
 .../auxiliary/pipebuffer/pb_buffer_malloc.c   |   2 +-
 .../auxiliary/pipebuffer/pb_bufmgr_alt.c      |   2 +-
 .../auxiliary/pipebuffer/pb_bufmgr_cache.c    |   8 +-
 .../auxiliary/pipebuffer/pb_bufmgr_debug.c    |   8 +-
 .../auxiliary/pipebuffer/pb_bufmgr_mm.c       |   4 +-
 .../auxiliary/pipebuffer/pb_bufmgr_ondemand.c |   4 +-
 .../auxiliary/pipebuffer/pb_bufmgr_pool.c     |   4 +-
 .../auxiliary/pipebuffer/pb_bufmgr_slab.c     |   6 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c    |   4 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h    |   4 +-
 .../target-helpers/inline_debug_helper.h      |   2 +-
 .../target-helpers/inline_sw_helper.h         |   8 +-
 .../target-helpers/inline_wrapper_sw_helper.h |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.c        |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.h        |   6 +-
 src/gallium/auxiliary/tgsi/tgsi_info.c        |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_parse.c       |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_parse.h       |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_sanity.c      |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_strings.c     |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_transform.h   |  24 ++--
 src/gallium/auxiliary/tgsi/tgsi_ureg.c        |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_ureg.h        | 116 +++++++++---------
 src/gallium/auxiliary/translate/translate.h   |   6 +-
 .../auxiliary/translate/translate_cache.c     |   6 +-
 src/gallium/auxiliary/util/u_bitmask.c        |   6 +-
 src/gallium/auxiliary/util/u_blend.h          |   2 +-
 src/gallium/auxiliary/util/u_blit.c           |   4 +-
 src/gallium/auxiliary/util/u_blitter.c        |   6 +-
 src/gallium/auxiliary/util/u_blitter.h        |  40 +++---
 src/gallium/auxiliary/util/u_box.h            |  22 ++--
 src/gallium/auxiliary/util/u_cache.c          |   2 +-
 src/gallium/auxiliary/util/u_clear.h          |   2 +-
 src/gallium/auxiliary/util/u_cpu_detect.c     |   8 +-
 src/gallium/auxiliary/util/u_debug.h          |   6 +-
 src/gallium/auxiliary/util/u_debug_memory.c   |   6 +-
 src/gallium/auxiliary/util/u_debug_refcnt.h   |   4 +-
 src/gallium/auxiliary/util/u_debug_symbol.c   |   4 +-
 src/gallium/auxiliary/util/u_dirty_surfaces.h |  16 +--
 src/gallium/auxiliary/util/u_draw.h           |  12 +-
 src/gallium/auxiliary/util/u_dual_blend.h     |   4 +-
 src/gallium/auxiliary/util/u_dump_state.c     |   2 +-
 src/gallium/auxiliary/util/u_dynarray.h       |  10 +-
 src/gallium/auxiliary/util/u_fifo.h           |   8 +-
 src/gallium/auxiliary/util/u_format.h         |  64 +++++-----
 src/gallium/auxiliary/util/u_format_pack.py   |   6 +-
 .../auxiliary/util/u_format_r11g11b10f.h      |  12 +-
 src/gallium/auxiliary/util/u_format_rgb9e5.h  |   8 +-
 src/gallium/auxiliary/util/u_format_s3tc.c    |   8 +-
 src/gallium/auxiliary/util/u_format_yuv.h     |   8 +-
 src/gallium/auxiliary/util/u_format_zs.c      |  20 +--
 src/gallium/auxiliary/util/u_half.h           |   4 +-
 src/gallium/auxiliary/util/u_handle_table.c   |   4 +-
 src/gallium/auxiliary/util/u_hash_table.c     |   6 +-
 src/gallium/auxiliary/util/u_inlines.h        |  66 +++++-----
 src/gallium/auxiliary/util/u_keymap.c         |   6 +-
 src/gallium/auxiliary/util/u_linear.h         |   2 +-
 src/gallium/auxiliary/util/u_math.h           |  90 +++++++-------
 src/gallium/auxiliary/util/u_memory.h         |   2 +-
 src/gallium/auxiliary/util/u_mm.c             |   2 +-
 src/gallium/auxiliary/util/u_pack_color.h     |  26 ++--
 src/gallium/auxiliary/util/u_pointer.h        |  16 +--
 src/gallium/auxiliary/util/u_prim.h           |  20 +--
 src/gallium/auxiliary/util/u_range.h          |  10 +-
 src/gallium/auxiliary/util/u_rect.h           |  10 +-
 src/gallium/auxiliary/util/u_resource.h       |   2 +-
 src/gallium/auxiliary/util/u_ringbuffer.c     |   4 +-
 src/gallium/auxiliary/util/u_split_prim.h     |   4 +-
 src/gallium/auxiliary/util/u_sse.h            |  12 +-
 src/gallium/auxiliary/util/u_string.h         |  22 ++--
 src/gallium/auxiliary/util/u_surfaces.h       |   6 +-
 src/gallium/auxiliary/util/u_tile.h           |   2 +-
 src/gallium/auxiliary/util/u_time.h           |  14 +--
 src/gallium/auxiliary/util/u_transfer.c       |   2 +-
 src/gallium/auxiliary/util/u_video.h          |  10 +-
 src/gallium/auxiliary/vl/vl_compositor.c      |  10 +-
 .../auxiliary/vl/vl_mpeg12_bitstream.c        |  22 ++--
 src/gallium/auxiliary/vl/vl_mpeg12_decoder.c  |   6 +-
 src/gallium/auxiliary/vl/vl_rbsp.h            |  12 +-
 src/gallium/auxiliary/vl/vl_vlc.h             |  30 ++---
 .../drivers/freedreno/a2xx/fd2_blend.h        |   2 +-
 .../drivers/freedreno/a2xx/fd2_context.h      |   2 +-
 .../drivers/freedreno/a2xx/fd2_rasterizer.h   |   2 +-
 .../drivers/freedreno/a2xx/fd2_texture.h      |   4 +-
 src/gallium/drivers/freedreno/a2xx/fd2_zsa.h  |   2 +-
 .../drivers/freedreno/a3xx/fd3_blend.h        |   2 +-
 .../drivers/freedreno/a3xx/fd3_context.h      |   2 +-
 .../drivers/freedreno/a3xx/fd3_format.h       |   2 +-
 .../drivers/freedreno/a3xx/fd3_rasterizer.h   |   2 +-
 .../drivers/freedreno/a3xx/fd3_texture.h      |   4 +-
 src/gallium/drivers/freedreno/a3xx/fd3_zsa.h  |   2 +-
 .../drivers/freedreno/a4xx/fd4_blend.h        |   2 +-
 .../drivers/freedreno/a4xx/fd4_context.h      |   2 +-
 .../drivers/freedreno/a4xx/fd4_rasterizer.h   |   2 +-
 .../drivers/freedreno/a4xx/fd4_texture.h      |   4 +-
 src/gallium/drivers/freedreno/a4xx/fd4_zsa.h  |   2 +-
 .../drivers/freedreno/freedreno_context.h     |   8 +-
 .../drivers/freedreno/freedreno_resource.h    |   8 +-
 .../drivers/freedreno/freedreno_screen.h      |   2 +-
 .../drivers/freedreno/freedreno_surface.h     |   2 +-
 src/gallium/drivers/i915/i915_batchbuffer.h   |  16 +--
 src/gallium/drivers/i915/i915_context.h       |   4 +-
 src/gallium/drivers/i915/i915_debug.h         |   6 +-
 src/gallium/drivers/i915/i915_fpc.h           |   2 +-
 src/gallium/drivers/i915/i915_fpc_translate.c |   2 +-
 src/gallium/drivers/i915/i915_prim_emit.c     |   6 +-
 src/gallium/drivers/i915/i915_prim_vbuf.c     |   2 +-
 src/gallium/drivers/i915/i915_resource.h      |   4 +-
 .../drivers/i915/i915_resource_texture.c      |   8 +-
 src/gallium/drivers/i915/i915_screen.h        |   2 +-
 src/gallium/drivers/i915/i915_state_dynamic.c |   4 +-
 .../drivers/i915/i915_state_immediate.c       |   2 +-
 src/gallium/drivers/i915/i915_state_inlines.h |  14 +--
 src/gallium/drivers/llvmpipe/lp_bld_blend.c   |   2 +-
 src/gallium/drivers/llvmpipe/lp_context.h     |   2 +-
 src/gallium/drivers/llvmpipe/lp_debug.h       |   2 +-
 src/gallium/drivers/llvmpipe/lp_fence.h       |   4 +-
 src/gallium/drivers/llvmpipe/lp_rast.h        |  18 +--
 src/gallium/drivers/llvmpipe/lp_rast_priv.h   |   6 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri.c    |  10 +-
 src/gallium/drivers/llvmpipe/lp_scene.h       |  16 +--
 src/gallium/drivers/llvmpipe/lp_screen.h      |   2 +-
 src/gallium/drivers/llvmpipe/lp_setup.h       |   2 +-
 src/gallium/drivers/llvmpipe/lp_setup_line.c  |   6 +-
 src/gallium/drivers/llvmpipe/lp_setup_point.c |   2 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c   |  12 +-
 src/gallium/drivers/llvmpipe/lp_setup_vbuf.c  |   2 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c    |  14 +--
 src/gallium/drivers/llvmpipe/lp_test.h        |   2 +-
 src/gallium/drivers/llvmpipe/lp_texture.h     |  14 +--
 src/gallium/drivers/nouveau/nouveau_buffer.c  |  30 ++---
 src/gallium/drivers/nouveau/nouveau_buffer.h  |   6 +-
 src/gallium/drivers/nouveau/nouveau_context.h |   8 +-
 src/gallium/drivers/nouveau/nouveau_fence.h   |   4 +-
 src/gallium/drivers/nouveau/nouveau_gldefs.h  |  14 +--
 src/gallium/drivers/nouveau/nouveau_mm.c      |   8 +-
 src/gallium/drivers/nouveau/nouveau_screen.h  |   2 +-
 .../drivers/nouveau/nouveau_statebuf.h        |   2 +-
 src/gallium/drivers/nouveau/nouveau_video.c   |  10 +-
 src/gallium/drivers/nouveau/nouveau_video.h   |  12 +-
 .../drivers/nouveau/nouveau_vp3_video.h       |  12 +-
 src/gallium/drivers/nouveau/nouveau_winsys.h  |  14 +--
 src/gallium/drivers/nouveau/nv30/nv30_clear.c |   4 +-
 .../drivers/nouveau/nv30/nv30_context.h       |   4 +-
 src/gallium/drivers/nouveau/nv30/nv30_draw.c  |   2 +-
 .../drivers/nouveau/nv30/nv30_format.h        |   8 +-
 .../drivers/nouveau/nv30/nv30_miptree.c       |   6 +-
 src/gallium/drivers/nouveau/nv30/nv30_push.c  |   6 +-
 src/gallium/drivers/nouveau/nv30/nv30_query.c |   2 +-
 .../drivers/nouveau/nv30/nv30_resource.h      |   4 +-
 .../drivers/nouveau/nv30/nv30_screen.h        |   2 +-
 .../drivers/nouveau/nv30/nv30_texture.c       |   8 +-
 .../drivers/nouveau/nv30/nv30_transfer.c      |  12 +-
 src/gallium/drivers/nouveau/nv30/nv30_vbo.c   |   4 +-
 .../drivers/nouveau/nv30/nv30_winsys.h        |  22 ++--
 .../drivers/nouveau/nv30/nvfx_fragprog.c      |  10 +-
 .../drivers/nouveau/nv30/nvfx_shader.h        |  12 +-
 .../drivers/nouveau/nv30/nvfx_vertprog.c      |   2 +-
 src/gallium/drivers/nouveau/nv50/nv50_blit.h  |  22 ++--
 .../drivers/nouveau/nv50/nv50_context.h       |   4 +-
 .../drivers/nouveau/nv50/nv50_miptree.c       |   4 +-
 .../drivers/nouveau/nv50/nv50_program.c       |   2 +-
 src/gallium/drivers/nouveau/nv50/nv50_push.c  |   8 +-
 src/gallium/drivers/nouveau/nv50/nv50_query.c |   4 +-
 .../drivers/nouveau/nv50/nv50_resource.h      |   6 +-
 .../drivers/nouveau/nv50/nv50_screen.h        |  14 +--
 .../drivers/nouveau/nv50/nv50_shader_state.c  |   2 +-
 src/gallium/drivers/nouveau/nv50/nv50_state.c |  10 +-
 .../nouveau/nv50/nv50_state_validate.c        |   4 +-
 .../drivers/nouveau/nv50/nv50_stateobj.h      |   2 +-
 .../drivers/nouveau/nv50/nv50_stateobj_tex.h  |   4 +-
 .../drivers/nouveau/nv50/nv50_surface.c       |   2 +-
 src/gallium/drivers/nouveau/nv50/nv50_tex.c   |   2 +-
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |   6 +-
 .../drivers/nouveau/nv50/nv50_winsys.h        |  22 ++--
 src/gallium/drivers/nouveau/nv50/nv84_video.h |   4 +-
 .../drivers/nouveau/nv50/nv84_video_vp.c      |   2 +-
 .../drivers/nouveau/nvc0/nvc0_context.h       |   4 +-
 .../drivers/nouveau/nvc0/nvc0_miptree.c       |   4 +-
 .../drivers/nouveau/nvc0/nvc0_program.c       |   2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_query.c |   8 +-
 .../drivers/nouveau/nvc0/nvc0_screen.h        |  14 +--
 .../drivers/nouveau/nvc0/nvc0_shader_state.c  |   4 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c |  10 +-
 .../nouveau/nvc0/nvc0_state_validate.c        |   6 +-
 .../drivers/nouveau/nvc0/nvc0_stateobj.h      |   2 +-
 .../drivers/nouveau/nvc0/nvc0_surface.c       |   2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_tex.c   |   6 +-
 .../drivers/nouveau/nvc0/nvc0_transfer.c      |   4 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   |   8 +-
 .../drivers/nouveau/nvc0/nvc0_vbo_translate.c |  34 ++---
 .../drivers/nouveau/nvc0/nvc0_winsys.h        |  26 ++--
 .../drivers/nouveau/nvc0/nve4_compute.c       |   4 +-
 .../drivers/nouveau/nvc0/nve4_compute.h       |   6 +-
 src/gallium/drivers/r300/r300_context.h       |  18 +--
 src/gallium/drivers/r300/r300_fs.h            |   4 +-
 src/gallium/drivers/r300/r300_render.c        |   2 +-
 src/gallium/drivers/r300/r300_screen.c        |   2 +-
 src/gallium/drivers/r300/r300_screen.h        |   8 +-
 src/gallium/drivers/r300/r300_screen_buffer.h |   2 +-
 .../drivers/r300/r300_shader_semantics.h      |   2 +-
 src/gallium/drivers/r300/r300_state_inlines.h |  28 ++---
 src/gallium/drivers/r300/r300_transfer.c      |   2 +-
 src/gallium/drivers/r600/evergreen_state.c    |   2 +-
 src/gallium/drivers/r600/r600_formats.h       |   4 +-
 src/gallium/drivers/r600/r600_pipe.h          |  50 ++++----
 src/gallium/drivers/r600/r600_state.c         |   2 +-
 src/gallium/drivers/r600/r600_state_common.c  |   2 +-
 src/gallium/drivers/radeon/r600_cs.h          |  20 +--
 src/gallium/drivers/radeon/r600_pipe_common.h |   4 +-
 src/gallium/drivers/radeon/radeon_winsys.h    |   4 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |   4 +-
 .../drivers/radeonsi/si_state_shaders.c       |   2 +-
 src/gallium/drivers/rbug/rbug_context.h       |   2 +-
 src/gallium/drivers/rbug/rbug_objects.h       |  20 +--
 src/gallium/drivers/rbug/rbug_screen.h        |   2 +-
 src/gallium/drivers/softpipe/sp_context.h     |   2 +-
 src/gallium/drivers/softpipe/sp_fs_exec.c     |   2 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c   |   2 +-
 src/gallium/drivers/softpipe/sp_quad_blend.c  |   2 +-
 src/gallium/drivers/softpipe/sp_quad_fs.c     |   4 +-
 src/gallium/drivers/softpipe/sp_screen.h      |   2 +-
 src/gallium/drivers/softpipe/sp_setup.c       |  10 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c  |  56 ++++-----
 .../drivers/softpipe/sp_tex_tile_cache.c      |   2 +-
 .../drivers/softpipe/sp_tex_tile_cache.h      |   4 +-
 src/gallium/drivers/softpipe/sp_texture.h     |   6 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c  |   6 +-
 src/gallium/drivers/softpipe/sp_tile_cache.h  |   4 +-
 .../drivers/svga/include/svga3d_shaderdefs.h  |   2 +-
 .../drivers/svga/include/svga_overlay.h       |   2 +-
 src/gallium/drivers/svga/svga_cmd.c           |   2 +-
 src/gallium/drivers/svga/svga_context.h       |   6 +-
 src/gallium/drivers/svga/svga_debug.h         |   2 +-
 src/gallium/drivers/svga/svga_draw_private.h  |   2 +-
 src/gallium/drivers/svga/svga_pipe_blend.c    |   4 +-
 .../drivers/svga/svga_pipe_depthstencil.c     |   4 +-
 src/gallium/drivers/svga/svga_pipe_query.c    |   2 +-
 src/gallium/drivers/svga/svga_pipe_sampler.c  |   6 +-
 .../drivers/svga/svga_resource_buffer.c       |   2 +-
 .../drivers/svga/svga_resource_buffer.h       |  12 +-
 .../drivers/svga/svga_resource_texture.h      |  16 +--
 src/gallium/drivers/svga/svga_sampler_view.h  |   2 +-
 src/gallium/drivers/svga/svga_screen.h        |   2 +-
 src/gallium/drivers/svga/svga_screen_cache.c  |   2 +-
 src/gallium/drivers/svga/svga_shader.h        |   2 +-
 src/gallium/drivers/svga/svga_state_fs.c      |   2 +-
 src/gallium/drivers/svga/svga_state_rss.c     |   2 +-
 src/gallium/drivers/svga/svga_state_tss.c     |   2 +-
 src/gallium/drivers/svga/svga_state_vs.c      |   2 +-
 src/gallium/drivers/svga/svga_surface.h       |   4 +-
 src/gallium/drivers/svga/svga_swtnl_private.h |   2 +-
 src/gallium/drivers/svga/svga_tgsi.c          |   2 +-
 src/gallium/drivers/svga/svga_tgsi.h          |   6 +-
 src/gallium/drivers/svga/svga_tgsi_emit.h     |  20 +--
 .../drivers/svga/svgadump/svga_shader.h       |   6 +-
 src/gallium/drivers/trace/tr_context.c        | 100 +++++++--------
 src/gallium/drivers/trace/tr_context.h        |   2 +-
 src/gallium/drivers/trace/tr_dump.c           |  24 ++--
 src/gallium/drivers/trace/tr_dump_defines.h   |   4 +-
 src/gallium/drivers/trace/tr_texture.h        |   8 +-
 src/gallium/drivers/vc4/vc4_resource.h        |   6 +-
 src/gallium/include/pipe/p_compiler.h         |   7 +-
 src/gallium/include/state_tracker/st_api.h    |   2 +-
 src/gallium/state_trackers/dri/dri2_buffer.h  |   2 +-
 src/gallium/state_trackers/dri/dri_context.h  |   2 +-
 src/gallium/state_trackers/dri/dri_drawable.h |   2 +-
 src/gallium/state_trackers/dri/dri_screen.h   |   6 +-
 src/gallium/state_trackers/dri/drisw.c        |  14 +--
 src/gallium/state_trackers/glx/xlib/xm_api.h  |   4 +-
 src/gallium/state_trackers/glx/xlib/xm_st.c   |   2 +-
 src/gallium/state_trackers/hgl/hgl.c          |   4 +-
 src/gallium/state_trackers/nine/adapter9.c    |   4 +-
 src/gallium/state_trackers/nine/adapter9.h    |   2 +-
 .../nine/authenticatedchannel9.h              |   2 +-
 .../state_trackers/nine/basetexture9.h        |   8 +-
 .../state_trackers/nine/cryptosession9.h      |   2 +-
 .../state_trackers/nine/cubetexture9.h        |   2 +-
 src/gallium/state_trackers/nine/device9.c     |   6 +-
 src/gallium/state_trackers/nine/device9.h     |   2 +-
 src/gallium/state_trackers/nine/device9ex.h   |   2 +-
 .../state_trackers/nine/device9video.h        |   2 +-
 .../state_trackers/nine/indexbuffer9.h        |   2 +-
 src/gallium/state_trackers/nine/iunknown.h    |  12 +-
 src/gallium/state_trackers/nine/nine_dump.h   |  10 +-
 src/gallium/state_trackers/nine/nine_ff.c     |  14 +--
 .../state_trackers/nine/nine_helpers.c        |   4 +-
 .../state_trackers/nine/nine_helpers.h        |   6 +-
 src/gallium/state_trackers/nine/nine_pipe.c   |   2 +-
 src/gallium/state_trackers/nine/nine_pipe.h   |  64 +++++-----
 src/gallium/state_trackers/nine/nine_shader.c |  56 ++++-----
 src/gallium/state_trackers/nine/nine_shader.h |  12 +-
 src/gallium/state_trackers/nine/nine_state.c  |  18 +--
 .../nine/nineexoverlayextension.h             |   2 +-
 .../state_trackers/nine/pixelshader9.h        |   2 +-
 src/gallium/state_trackers/nine/query9.c      |   2 +-
 src/gallium/state_trackers/nine/query9.h      |   2 +-
 src/gallium/state_trackers/nine/resource9.h   |   2 +-
 src/gallium/state_trackers/nine/stateblock9.h |   2 +-
 src/gallium/state_trackers/nine/surface9.c    |   4 +-
 src/gallium/state_trackers/nine/surface9.h    |  12 +-
 src/gallium/state_trackers/nine/swapchain9.c  |   2 +-
 src/gallium/state_trackers/nine/swapchain9.h  |   2 +-
 .../state_trackers/nine/swapchain9ex.h        |   2 +-
 src/gallium/state_trackers/nine/texture9.h    |   2 +-
 .../state_trackers/nine/vertexbuffer9.h       |   2 +-
 .../state_trackers/nine/vertexdeclaration9.c  |   8 +-
 .../state_trackers/nine/vertexdeclaration9.h  |   2 +-
 .../state_trackers/nine/vertexshader9.h       |   2 +-
 src/gallium/state_trackers/nine/volume9.c     |   8 +-
 src/gallium/state_trackers/nine/volume9.h     |   6 +-
 .../state_trackers/nine/volumetexture9.h      |   2 +-
 src/gallium/state_trackers/osmesa/osmesa.c    |   4 +-
 src/gallium/state_trackers/wgl/stw_device.h   |   2 +-
 .../state_trackers/wgl/stw_framebuffer.c      |   4 +-
 src/gallium/state_trackers/wgl/stw_st.c       |   2 +-
 src/gallium/state_trackers/wgl/stw_tls.c      |   4 +-
 src/gallium/state_trackers/xa/xa_composite.c  |   4 +-
 src/gallium/state_trackers/xa/xa_priv.h       |  10 +-
 src/gallium/state_trackers/xa/xa_renderer.c   |  14 +--
 src/gallium/state_trackers/xa/xa_tgsi.c       |   6 +-
 .../state_trackers/xvmc/xvmc_private.h        |   2 +-
 src/gallium/targets/d3dadapter9/drm.c         |   8 +-
 src/gallium/tests/graw/graw_util.h            |  14 +--
 .../winsys/i915/drm/i915_drm_batchbuffer.c    |   2 +-
 src/gallium/winsys/i915/drm/i915_drm_winsys.h |   6 +-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c |   4 +-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h |   2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |   2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   8 +-
 .../winsys/radeon/drm/radeon_drm_winsys.h     |   2 +-
 .../winsys/svga/drm/pb_buffer_simple_fenced.c |  14 +--
 src/gallium/winsys/svga/drm/vmw_buffer.c      |   4 +-
 src/gallium/winsys/svga/drm/vmw_buffer.h      |   4 +-
 src/gallium/winsys/svga/drm/vmw_context.c     |   4 +-
 src/gallium/winsys/svga/drm/vmw_fence.c       |   6 +-
 src/gallium/winsys/svga/drm/vmw_screen.h      |   2 +-
 src/gallium/winsys/svga/drm/vmw_screen_dri.c  |   2 +-
 .../winsys/svga/drm/vmw_screen_ioctl.c        |   2 +-
 src/gallium/winsys/svga/drm/vmw_shader.h      |   4 +-
 src/gallium/winsys/svga/drm/vmw_surface.h     |   4 +-
 src/gallium/winsys/sw/dri/dri_sw_winsys.c     |   4 +-
 src/gallium/winsys/sw/gdi/gdi_sw_winsys.c     |   2 +-
 src/gallium/winsys/sw/hgl/hgl_sw_winsys.c     |   2 +-
 .../winsys/sw/kms-dri/kms_dri_sw_winsys.c     |   4 +-
 .../winsys/sw/wrapper/wrapper_sw_winsys.c     |   4 +-
 src/gallium/winsys/sw/xlib/xlib_sw_winsys.c   |   2 +-
 src/mesa/state_tracker/st_cb_perfmon.h        |   2 +-
 395 files changed, 1515 insertions(+), 1522 deletions(-)

diff --git a/src/gallium/README.portability b/src/gallium/README.portability
index adecf4bb798..cf6cc36afbb 100644
--- a/src/gallium/README.portability
+++ b/src/gallium/README.portability
@@ -13,8 +13,6 @@ headers in general, should strictly follow these guidelines to ensure
 
 * Include the p_compiler.h.
 
-* Don't use the 'inline' keyword, use the INLINE macro in p_compiler.h instead.
-
 * Cast explicitly when converting to integer types of smaller sizes.
 
 * Cast explicitly when converting between float, double and integral types.
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index dd56e4a154e..d36f1fbd717 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -80,7 +80,7 @@ unsigned cso_construct_key(void *item, int item_size)
    return hash_key((item), item_size);
 }
 
-static INLINE struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type)
+static inline struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type)
 {
    struct cso_hash *hash;
    hash = sc->hashes[type];
@@ -127,7 +127,7 @@ static void delete_velements(void *state, void *data)
    FREE(state);
 }
 
-static INLINE void delete_cso(void *state, enum cso_cache_type type)
+static inline void delete_cso(void *state, enum cso_cache_type type)
 {
    switch (type) {
    case CSO_BLEND:
@@ -152,7 +152,7 @@ static INLINE void delete_cso(void *state, enum cso_cache_type type)
 }
 
 
-static INLINE void sanitize_hash(struct cso_cache *sc,
+static inline void sanitize_hash(struct cso_cache *sc,
                                  struct cso_hash *hash,
                                  enum cso_cache_type type,
                                  int max_size)
@@ -162,7 +162,7 @@ static INLINE void sanitize_hash(struct cso_cache *sc,
 }
 
 
-static INLINE void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
+static inline void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
                                int max_size, void *user_data)
 {
    /* if we're approach the maximum size, remove fourth of the entries
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 744b00cbd92..5aef2d5c487 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -192,7 +192,7 @@ static boolean delete_vertex_elements(struct cso_context *ctx,
 }
 
 
-static INLINE boolean delete_cso(struct cso_context *ctx,
+static inline boolean delete_cso(struct cso_context *ctx,
                                  void *state, enum cso_cache_type type)
 {
    switch (type) {
@@ -213,7 +213,7 @@ static INLINE boolean delete_cso(struct cso_context *ctx,
    return FALSE;
 }
 
-static INLINE void
+static inline void
 sanitize_hash(struct cso_hash *hash, enum cso_cache_type type,
               int max_size, void *user_data)
 {
@@ -921,14 +921,14 @@ void cso_restore_tesseval_shader(struct cso_context *ctx)
 
 /* clip state */
 
-static INLINE void
+static inline void
 clip_state_cpy(struct pipe_clip_state *dst,
                const struct pipe_clip_state *src)
 {
    memcpy(dst->ucp, src->ucp, sizeof(dst->ucp));
 }
 
-static INLINE int
+static inline int
 clip_state_cmp(const struct pipe_clip_state *a,
                const struct pipe_clip_state *b)
 {
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index a1564f93292..c827a68ea0a 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -45,7 +45,7 @@
 /* fixme: move it from here */
 #define MAX_PRIMITIVES 64
 
-static INLINE int
+static inline int
 draw_gs_get_input_index(int semantic, int index,
                         const struct tgsi_shader_info *input_info)
 {
@@ -66,7 +66,7 @@ draw_gs_get_input_index(int semantic, int index,
  * the number of elements in the SOA vector. This ensures that the
  * throughput is optimized for the given vector instruction set.
  */
-static INLINE boolean
+static inline boolean
 draw_gs_should_flush(struct draw_geometry_shader *shader)
 {
    return (shader->fetched_prim_count == shader->vector_length);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 90a31bc6ac0..b1e1bcbee04 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -72,7 +72,7 @@ struct draw_gs_llvm_iface {
    LLVMValueRef input;
 };
 
-static INLINE const struct draw_gs_llvm_iface *
+static inline const struct draw_gs_llvm_iface *
 draw_gs_llvm_iface(const struct lp_build_tgsi_gs_iface *iface)
 {
    return (const struct draw_gs_llvm_iface *)iface;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d48ed721593..d153c166ead 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -350,7 +350,7 @@ struct draw_gs_llvm_variant_key
     PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct draw_sampler_static_state))
 
 
-static INLINE size_t
+static inline size_t
 draw_llvm_variant_key_size(unsigned nr_vertex_elements,
                            unsigned nr_samplers)
 {
@@ -360,7 +360,7 @@ draw_llvm_variant_key_size(unsigned nr_vertex_elements,
 }
 
 
-static INLINE size_t
+static inline size_t
 draw_gs_llvm_variant_key_size(unsigned nr_samplers)
 {
    return (sizeof(struct draw_gs_llvm_variant_key) +
@@ -368,7 +368,7 @@ draw_gs_llvm_variant_key_size(unsigned nr_samplers)
 }
 
 
-static INLINE struct draw_sampler_static_state *
+static inline struct draw_sampler_static_state *
 draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key)
 {
    return (struct draw_sampler_static_state *)
@@ -476,13 +476,13 @@ struct draw_llvm {
 };
 
 
-static INLINE struct llvm_vertex_shader *
+static inline struct llvm_vertex_shader *
 llvm_vertex_shader(struct draw_vertex_shader *vs)
 {
    return (struct llvm_vertex_shader *)vs;
 }
 
-static INLINE struct llvm_geometry_shader *
+static inline struct llvm_geometry_shader *
 llvm_geometry_shader(struct draw_geometry_shader *gs)
 {
    return (struct llvm_geometry_shader *)gs;
diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h
index 35273330d13..e69dcbded0e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.h
+++ b/src/gallium/auxiliary/draw/draw_pipe.h
@@ -115,7 +115,7 @@ void draw_unfilled_prepare_outputs(struct draw_context *context,
  * \param idx  index into stage's tmp[] array to put the copy (dest)
  * \return  pointer to the copied vertex
  */
-static INLINE struct vertex_header *
+static inline struct vertex_header *
 dup_vert( struct draw_stage *stage,
 	  const struct vertex_header *vert,
 	  unsigned idx )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 936046ea5f5..85d24b7a6a1 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -511,7 +511,7 @@ bind_aaline_fragment_shader(struct aaline_stage *aaline)
 
 
 
-static INLINE struct aaline_stage *
+static inline struct aaline_stage *
 aaline_stage( struct draw_stage *stage )
 {
    return (struct aaline_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index 7feb49ae934..3918923296d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -427,7 +427,7 @@ bind_aapoint_fragment_shader(struct aapoint_stage *aapoint)
 
 
 
-static INLINE struct aapoint_stage *
+static inline struct aapoint_stage *
 aapoint_stage( struct draw_stage *stage )
 {
    return (struct aapoint_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index e1e7dcc6f63..c22758bc702 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -70,12 +70,12 @@ struct clip_stage {
 
 
 /** Cast wrapper */
-static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
+static inline struct clip_stage *clip_stage( struct draw_stage *stage )
 {
    return (struct clip_stage *)stage;
 }
 
-static INLINE unsigned
+static inline unsigned
 draw_viewport_index(struct draw_context *draw,
                     const struct vertex_header *leading_vertex)
 {
@@ -210,7 +210,7 @@ static void interp( const struct clip_stage *clip,
  * true, otherwise returns false.
  * Triangle is considered null/empty if it's area is qual to zero.
  */
-static INLINE boolean
+static inline boolean
 is_tri_null(struct draw_context *draw, const struct prim_header *header)
 {
    const unsigned pos_attr = draw_current_shader_position_output(draw);
@@ -322,7 +322,7 @@ static void emit_poly( struct draw_stage *stage,
 }
 
 
-static INLINE float
+static inline float
 dot4(const float *a, const float *b)
 {
    return (a[0] * b[0] +
@@ -336,7 +336,7 @@ dot4(const float *a, const float *b)
  * it first checks if the shader provided a clip distance, otherwise
  * it works out the value using the clipvertex
  */
-static INLINE float getclipdist(const struct clip_stage *clipper,
+static inline float getclipdist(const struct clip_stage *clipper,
                                 struct vertex_header *vert,
                                 int plane_idx)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index fa344089a8a..fc8293bd128 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -46,12 +46,12 @@ struct cull_stage {
 };
 
 
-static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
+static inline struct cull_stage *cull_stage( struct draw_stage *stage )
 {
    return (struct cull_stage *)stage;
 }
 
-static INLINE boolean
+static inline boolean
 cull_distance_is_out(float dist)
 {
    return (dist < 0.0f) || util_is_inf_or_nan(dist);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 59e33b472f4..0ea740861d6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -47,7 +47,7 @@ struct flat_stage
 };
 
 
-static INLINE struct flat_stage *
+static inline struct flat_stage *
 flat_stage(struct draw_stage *stage)
 {
    return (struct flat_stage *) stage;
@@ -55,7 +55,7 @@ flat_stage(struct draw_stage *stage)
 
 
 /** Copy all the constant attributes from 'src' vertex to 'dst' vertex */
-static INLINE void copy_flats( struct draw_stage *stage,
+static inline void copy_flats( struct draw_stage *stage,
                                struct vertex_header *dst,
                                const struct vertex_header *src )
 {
@@ -70,7 +70,7 @@ static INLINE void copy_flats( struct draw_stage *stage,
 
 
 /** Copy all the color attributes from src vertex to dst0 & dst1 vertices */
-static INLINE void copy_flats2( struct draw_stage *stage,
+static inline void copy_flats2( struct draw_stage *stage,
                                 struct vertex_header *dst0,
                                 struct vertex_header *dst1,
                                 const struct vertex_header *src )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index b25dd21fd4d..5e0d8ce793d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -49,7 +49,7 @@ struct offset_stage {
 
 
 
-static INLINE struct offset_stage *offset_stage( struct draw_stage *stage )
+static inline struct offset_stage *offset_stage( struct draw_stage *stage )
 {
    return (struct offset_stage *) stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index 445f195e59c..186b4cb4935 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -462,7 +462,7 @@ bind_pstip_fragment_shader(struct pstip_stage *pstip)
 }
 
 
-static INLINE struct pstip_stage *
+static inline struct pstip_stage *
 pstip_stage( struct draw_stage *stage )
 {
    return (struct pstip_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 476c011b9a0..381aa41530b 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -53,7 +53,7 @@ struct stipple_stage {
 };
 
 
-static INLINE struct stipple_stage *
+static inline struct stipple_stage *
 stipple_stage(struct draw_stage *stage)
 {
    return (struct stipple_stage *) stage;
@@ -108,7 +108,7 @@ emit_segment(struct draw_stage *stage, struct prim_header *header,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 stipple_test(int counter, ushort pattern, int factor)
 {
    int b = (counter / factor) & 0xf;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 8148f6b4569..7f958d9b985 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -43,7 +43,7 @@ struct twoside_stage {
 };
 
 
-static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
+static inline struct twoside_stage *twoside_stage( struct draw_stage *stage )
 {
    return (struct twoside_stage *)stage;
 }
@@ -51,7 +51,7 @@ static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
 /**
  * Copy back color(s) to front color(s).
  */
-static INLINE struct vertex_header *
+static inline struct vertex_header *
 copy_bfc( struct twoside_stage *twoside, 
           const struct vertex_header *v,
           unsigned idx )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 51fbdb97ae8..8e6435cdbb4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -53,7 +53,7 @@ struct unfilled_stage {
 };
 
 
-static INLINE struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
+static inline struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
 {
    return (struct unfilled_stage *)stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index e0e32dd9bbe..5cc866d7eee 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -85,7 +85,7 @@ struct vbuf_stage {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct vbuf_stage *
+static inline struct vbuf_stage *
 vbuf_stage( struct draw_stage *stage )
 {
    assert(stage);
@@ -97,7 +97,7 @@ static void vbuf_flush_vertices( struct vbuf_stage *vbuf );
 static void vbuf_alloc_vertices( struct vbuf_stage *vbuf );
 
 
-static INLINE boolean 
+static inline boolean 
 overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
 {
    unsigned long used = (unsigned long) ((char *)ptr - (char *)map);
@@ -105,7 +105,7 @@ overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
 }
 
 
-static INLINE void 
+static inline void 
 check_space( struct vbuf_stage *vbuf, unsigned nr )
 {
    if (vbuf->nr_vertices + nr > vbuf->max_vertices ||
@@ -126,7 +126,7 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.  We only use the vertex->data[] fields.
  */
-static INLINE ushort 
+static inline ushort 
 emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index 6c57d5c1e3e..38ac11a9adf 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -45,7 +45,7 @@ struct wideline_stage {
 
 
 
-static INLINE struct wideline_stage *wideline_stage( struct draw_stage *stage )
+static inline struct wideline_stage *wideline_stage( struct draw_stage *stage )
 {
    return (struct wideline_stage *)stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 05beba8cd97..348b0e93bbc 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -83,7 +83,7 @@ struct widepoint_stage {
 
 
 
-static INLINE struct widepoint_stage *
+static inline struct widepoint_stage *
 widepoint_stage( struct draw_stage *stage )
 {
    return (struct widepoint_stage *)stage;
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 7b893cb2692..0ad94bb031f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -494,7 +494,7 @@ void draw_update_viewport_flags(struct draw_context *draw);
  * Return index of the given viewport clamping it
  * to be between 0 <= and < PIPE_MAX_VIEWPORTS
  */
-static INLINE unsigned
+static inline unsigned
 draw_clamp_viewport_idx(int idx)
 {
    return ((PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0);
@@ -505,7 +505,7 @@ draw_clamp_viewport_idx(int idx)
  * overflows then it returns the value from
  * the overflow_value variable.
  */
-static INLINE unsigned
+static inline unsigned
 draw_overflow_uadd(unsigned a, unsigned b,
                    unsigned overflow_value)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 5af845ff938..ffec863ae6f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -54,7 +54,7 @@ struct fetch_pipeline_middle_end {
 
 
 /** cast wrapper */
-static INLINE struct fetch_pipeline_middle_end *
+static inline struct fetch_pipeline_middle_end *
 fetch_pipeline_middle_end(struct draw_pt_middle_end *middle)
 {
    return (struct fetch_pipeline_middle_end *) middle;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index d17d6959b44..e42c4af0e70 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -60,7 +60,7 @@ struct llvm_middle_end {
 
 
 /** cast wrapper */
-static INLINE struct llvm_middle_end *
+static inline struct llvm_middle_end *
 llvm_middle_end(struct draw_pt_middle_end *middle)
 {
    return (struct llvm_middle_end *) middle;
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 71a7d3918e9..f0d5e0f5656 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -53,7 +53,7 @@ struct pt_post_vs {
                    const struct draw_prim_info *prim_info );
 };
 
-static INLINE void
+static inline void
 initialize_vertex_header(struct vertex_header *header)
 {
    header->clipmask = 0;
@@ -62,7 +62,7 @@ initialize_vertex_header(struct vertex_header *header)
    header->vertex_id = UNDEFINED_VERTEX_ID;
 }
 
-static INLINE float
+static inline float
 dot4(const float *a, const float *b)
 {
    return (a[0]*b[0] +
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index 91e67c0840d..20de26fd08a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -65,7 +65,7 @@ draw_so_info(const struct draw_context *draw)
    return state;
 }
 
-static INLINE boolean
+static inline boolean
 draw_has_so(const struct draw_context *draw)
 {
    const struct pipe_stream_output_info *state = draw_so_info(draw);
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index 8098adea61f..8d448f92a26 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -84,7 +84,7 @@ vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
 /**
  * Add a fetch element and add it to the draw elements.
  */
-static INLINE void
+static inline void
 vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned ofbias)
 {
    unsigned hash;
@@ -111,7 +111,7 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned ofbias
  * The value is checked for overflows (both integer overflows
  * and the elements array overflow).
  */
-static INLINE unsigned
+static inline unsigned
 vsplit_get_base_idx(struct vsplit_frontend *vsplit,
                     unsigned start, unsigned fetch, unsigned *ofbit)
 {
@@ -137,7 +137,7 @@ vsplit_get_base_idx(struct vsplit_frontend *vsplit,
  * index, plus the element bias, clamped to maximum elememt
  * index if that addition overflows.
  */
-static INLINE unsigned
+static inline unsigned
 vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
                     int idx, int bias, unsigned *ofbias)
 {
@@ -170,7 +170,7 @@ vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
    elt_idx = vsplit_get_base_idx(vsplit, start, fetch, &ofbit);          \
    elt_idx = vsplit_get_bias_idx(vsplit, ofbit ? 0 : DRAW_GET_IDX(elts, elt_idx), elt_bias, &ofbias)
 
-static INLINE void
+static inline void
 vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
                        unsigned start, unsigned fetch, int elt_bias)
 {
@@ -179,7 +179,7 @@ vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
    vsplit_add_cache(vsplit, elt_idx, ofbias);
 }
 
-static INLINE void
+static inline void
 vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts,
                        unsigned start, unsigned fetch, int elt_bias)
 {
@@ -193,7 +193,7 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts,
  * Add a fetch element and add it to the draw elements.  The fetch element is
  * in full range (uint).
  */
-static INLINE void
+static inline void
 vsplit_add_cache_uint(struct vsplit_frontend *vsplit, const uint *elts,
                       unsigned start, unsigned fetch, int elt_bias)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
index 0f7a3cdc012..0afabb01398 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -129,7 +129,7 @@ CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
  * When spoken is TRUE, ispoken replaces istart;  When close is TRUE, iclose is
  * appended.
  */
-static INLINE void
+static inline void
 CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
                                         unsigned flags,
                                         unsigned istart, unsigned icount,
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index b4178d6a6c5..ee11d2f9276 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -91,13 +91,13 @@ struct vertex_info
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
 
-static INLINE size_t
+static inline size_t
 draw_vinfo_size( const struct vertex_info *a )
 {
    return offsetof(const struct vertex_info, attrib[a->num_attribs]);
 }
 
-static INLINE int
+static inline int
 draw_vinfo_compare( const struct vertex_info *a,
                     const struct vertex_info *b )
 {
@@ -105,7 +105,7 @@ draw_vinfo_compare( const struct vertex_info *a,
    return memcmp( a, b, sizea );
 }
 
-static INLINE void
+static inline void
 draw_vinfo_copy( struct vertex_info *dst,
                  const struct vertex_info *src )
 {
@@ -121,7 +121,7 @@ draw_vinfo_copy( struct vertex_info *dst,
  *                   corresponds to this attribute.
  * \return slot in which the attribute was added
  */
-static INLINE uint
+static inline uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
                       enum attrib_emit emit, 
                       enum interp_mode interp, /* only used by softpipe??? */
@@ -150,7 +150,7 @@ void draw_dump_emitted_vertex(const struct vertex_info *vinfo,
                               const uint8_t *data);
 
 
-static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit)
+static inline enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit)
 {
    switch (emit) {
    case EMIT_OMIT:
@@ -174,7 +174,7 @@ static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit
    }
 }
 
-static INLINE unsigned draw_translate_vinfo_size(enum attrib_emit emit)
+static inline unsigned draw_translate_vinfo_size(enum attrib_emit emit)
 {
    switch (emit) {
    case EMIT_OMIT:
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 1d54e7ef298..24b29e70dd9 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -191,12 +191,12 @@ draw_vs_create_variant_generic( struct draw_vertex_shader *vs,
 
 
 
-static INLINE int draw_vs_variant_keysize( const struct draw_vs_variant_key *key )
+static inline int draw_vs_variant_keysize( const struct draw_vs_variant_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_variant_element);
 }
 
-static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key *a,
+static inline int draw_vs_variant_key_compare( const struct draw_vs_variant_key *a,
                                          const struct draw_vs_variant_key *b )
 {
    int keysize = draw_vs_variant_keysize(a);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 8fba43f85d7..1d327ce48a9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1135,7 +1135,7 @@ lp_build_div(struct lp_build_context *bld,
  *
  * @sa http://www.stereopsis.com/doubleblend.html
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_lerp_simple(struct lp_build_context *bld,
                      LLVMValueRef x,
                      LLVMValueRef v0,
@@ -1674,7 +1674,7 @@ enum lp_build_round_mode
  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_sse41(struct lp_build_context *bld,
                      LLVMValueRef a,
                      enum lp_build_round_mode mode)
@@ -1761,7 +1761,7 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                              LLVMValueRef a)
 {
@@ -1817,7 +1817,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
 
 /*
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_altivec(struct lp_build_context *bld,
                        LLVMValueRef a,
                        enum lp_build_round_mode mode)
@@ -1851,7 +1851,7 @@ lp_build_round_altivec(struct lp_build_context *bld,
    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
 }
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_arch(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
@@ -2439,7 +2439,7 @@ lp_build_sqrt(struct lp_build_context *bld,
  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_rcp_refine(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef rcp_a)
@@ -2524,7 +2524,7 @@ lp_build_rcp(struct lp_build_context *bld,
  *
  * See also Intel 64 and IA-32 Architectures Optimization Manual.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_rsqrt_refine(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef rsqrt_a)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index b17c41931f4..a4c3bf0977a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -120,14 +120,14 @@ lp_build_const_mask_aos_swizzled(struct gallivm_state *gallivm,
                                  const unsigned char *swizzle);
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_int32(struct gallivm_state *gallivm, int i)
 {
    return LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), i, 0);
 }
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_float(struct gallivm_state *gallivm, float x)
 {
    return LLVMConstReal(LLVMFloatTypeInContext(gallivm->context), x);
@@ -135,7 +135,7 @@ lp_build_const_float(struct gallivm_state *gallivm, float x)
 
 
 /** Return constant-valued pointer to int */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_int_pointer(struct gallivm_state *gallivm, const void *ptr)
 {
    LLVMTypeRef int_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 321e09d56b9..375ba6cb5ff 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -59,7 +59,7 @@ extern unsigned gallivm_debug;
 #endif
 
 
-static INLINE void
+static inline void
 lp_build_name(LLVMValueRef val, const char *format, ...)
 {
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index efe71704c3a..ddf3ad1dfc6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -95,7 +95,7 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc,
 /**
  * Whether the format matches the vector type, apart of swizzles.
  */
-static INLINE boolean
+static inline boolean
 format_matches_type(const struct util_format_description *desc,
                     struct lp_type type)
 {
@@ -146,7 +146,7 @@ format_matches_type(const struct util_format_description *desc,
  *
  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
                                const struct util_format_description *desc,
                                LLVMValueRef packed)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index 4f5a45c6a3d..fa0e8b656bb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -212,7 +212,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
 }
 
 
-static INLINE void
+static inline void
 yuv_to_rgb_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 3db7261e255..571c615f9f8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -88,7 +88,7 @@
  * actually try to allocate the maximum and run out of memory and crash.  So
  * stick with something reasonable here.
  */
-static INLINE int
+static inline int
 gallivm_get_shader_param(enum pipe_shader_cap param)
 {
    switch(param) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 640b7e0d7e0..eba758da6ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -371,7 +371,7 @@ struct lp_build_sample_context
  * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at
  * this time.  Return whether the given mode is supported by that function.
  */
-static INLINE boolean
+static inline boolean
 lp_is_simple_wrap_mode(unsigned mode)
 {
    switch (mode) {
@@ -384,7 +384,7 @@ lp_is_simple_wrap_mode(unsigned mode)
 }
 
 
-static INLINE void
+static inline void
 apply_sampler_swizzle(struct lp_build_sample_context *bld,
                       LLVMValueRef *texel)
 {
@@ -402,7 +402,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld,
  * not really dimension as such, this indicates the amount of
  * "normal" texture coords subject to minification, wrapping etc.
  */
-static INLINE unsigned
+static inline unsigned
 texture_dims(enum pipe_texture_target tex)
 {
    switch (tex) {
@@ -424,7 +424,7 @@ texture_dims(enum pipe_texture_target tex)
    }
 }
 
-static INLINE boolean
+static inline boolean
 has_layer_coord(enum pipe_texture_target tex)
 {
    switch (tex) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 5809c5a2535..2ca9c6194b3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -562,13 +562,13 @@ struct lp_build_tgsi_aos_context
 
 };
 
-static INLINE struct lp_build_tgsi_soa_context *
+static inline struct lp_build_tgsi_soa_context *
 lp_soa_context(struct lp_build_tgsi_context *bld_base)
 {
    return (struct lp_build_tgsi_soa_context *)bld_base;
 }
 
-static INLINE struct lp_build_tgsi_aos_context *
+static inline struct lp_build_tgsi_aos_context *
 lp_aos_context(struct lp_build_tgsi_context *bld_base)
 {
    return (struct lp_build_tgsi_aos_context *)bld_base;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index 55acea83799..906a1745551 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -462,7 +462,7 @@ analyse_instruction(struct analysis_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 dump_info(const struct tgsi_token *tokens,
           struct lp_tgsi_info *info)
 {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index faa546df11b..55f606f9c59 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -106,7 +106,7 @@ emit_dump_reg(struct gallivm_state *gallivm,
  * Return the context for the current function.
  * (always 'main', if shader doesn't do any function calls)
  */
-static INLINE struct function_ctx *
+static inline struct function_ctx *
 func_ctx(struct lp_exec_mask *mask)
 {
    assert(mask->function_stack_size > 0);
@@ -120,7 +120,7 @@ func_ctx(struct lp_exec_mask *mask)
  * no loop inside the current function, but we were inside
  * a loop inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_loop(struct lp_exec_mask *mask)
 {
    int i;
@@ -138,7 +138,7 @@ mask_has_loop(struct lp_exec_mask *mask)
  * no switch in the current function, but we were inside
  * a switch inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_switch(struct lp_exec_mask *mask)
 {
    int i;
@@ -156,7 +156,7 @@ mask_has_switch(struct lp_exec_mask *mask)
  * no conditional in the current function, but we were inside
  * a conditional inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_cond(struct lp_exec_mask *mask)
 {
    int i;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 191cf92d2d1..7fb449fd03f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -173,7 +173,7 @@ struct lp_build_context
  *
  * e.g. With PIPE_FORMAT_R32G32B32A32_FLOAT returns an lp_type with float[4]
  */
-static INLINE void
+static inline void
 lp_type_from_format_desc(struct lp_type* type, const struct util_format_description *format_desc)
 {
    assert(format_desc->is_array);
@@ -189,14 +189,14 @@ lp_type_from_format_desc(struct lp_type* type, const struct util_format_descript
 }
 
 
-static INLINE void
+static inline void
 lp_type_from_format(struct lp_type* type, enum pipe_format format)
 {
    lp_type_from_format_desc(type, util_format_description(format));
 }
 
 
-static INLINE unsigned
+static inline unsigned
 lp_type_width(struct lp_type type)
 {
    return type.width * type.length;
@@ -204,7 +204,7 @@ lp_type_width(struct lp_type type)
 
 
 /** Create scalar float type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_float(unsigned width)
 {
    struct lp_type res_type;
@@ -220,7 +220,7 @@ lp_type_float(unsigned width)
 
 
 /** Create vector of float type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_float_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -236,7 +236,7 @@ lp_type_float_vec(unsigned width, unsigned total_width)
 
 
 /** Create scalar int type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_int(unsigned width)
 {
    struct lp_type res_type;
@@ -251,7 +251,7 @@ lp_type_int(unsigned width)
 
 
 /** Create vector int type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_int_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -266,7 +266,7 @@ lp_type_int_vec(unsigned width, unsigned total_width)
 
 
 /** Create scalar uint type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_uint(unsigned width)
 {
    struct lp_type res_type;
@@ -280,7 +280,7 @@ lp_type_uint(unsigned width)
 
 
 /** Create vector uint type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_uint_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -293,7 +293,7 @@ lp_type_uint_vec(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_unorm(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -307,7 +307,7 @@ lp_type_unorm(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_fixed(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -322,7 +322,7 @@ lp_type_fixed(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_ufixed(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -364,7 +364,7 @@ LLVMTypeRef
 lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_float32_vec4_type(void)
 {
    struct lp_type type;
@@ -380,7 +380,7 @@ lp_float32_vec4_type(void)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_int32_vec4_type(void)
 {
    struct lp_type type;
@@ -396,7 +396,7 @@ lp_int32_vec4_type(void)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_unorm8_vec4_type(void)
 {
    struct lp_type type;
diff --git a/src/gallium/auxiliary/os/os_memory_aligned.h b/src/gallium/auxiliary/os/os_memory_aligned.h
index bb15f24ade3..f7d0e3652ed 100644
--- a/src/gallium/auxiliary/os/os_memory_aligned.h
+++ b/src/gallium/auxiliary/os/os_memory_aligned.h
@@ -55,7 +55,7 @@ add_overflow_size_t(size_t a, size_t b, size_t *res)
 /**
  * Return memory on given byte alignment
  */
-static INLINE void *
+static inline void *
 os_malloc_aligned(size_t size, size_t alignment)
 {
    char *ptr, *buf;
@@ -87,7 +87,7 @@ os_malloc_aligned(size_t size, size_t alignment)
 /**
  * Free memory returned by align_malloc().
  */
-static INLINE void
+static inline void
 os_free_aligned(void *ptr)
 {
    if (ptr) {
diff --git a/src/gallium/auxiliary/os/os_memory_stdc.h b/src/gallium/auxiliary/os/os_memory_stdc.h
index 806e5363568..c9fde06d8ac 100644
--- a/src/gallium/auxiliary/os/os_memory_stdc.h
+++ b/src/gallium/auxiliary/os/os_memory_stdc.h
@@ -50,7 +50,7 @@
 
 #if defined(HAVE_POSIX_MEMALIGN)
 
-static INLINE void *
+static inline void *
 os_malloc_aligned(size_t size, size_t alignment)
 {
    void *ptr;
diff --git a/src/gallium/auxiliary/os/os_mman.h b/src/gallium/auxiliary/os/os_mman.h
index e892610bdbd..2ae0027c1c2 100644
--- a/src/gallium/auxiliary/os/os_mman.h
+++ b/src/gallium/auxiliary/os/os_mman.h
@@ -58,7 +58,7 @@ extern "C" {
 
 extern void *__mmap2(void *, size_t, int, int, int, size_t);
 
-static INLINE void *os_mmap(void *addr, size_t length, int prot, int flags,
+static inline void *os_mmap(void *addr, size_t length, int prot, int flags,
                             int fd, loff_t offset)
 {
    /* offset must be aligned to 4096 (not necessarily the page size) */
@@ -78,7 +78,7 @@ static INLINE void *os_mmap(void *addr, size_t length, int prot, int flags,
 #  define os_mmap(addr, length, prot, flags, fd, offset) \
              mmap(addr, length, prot, flags, fd, offset)
 
-static INLINE int os_munmap(void *addr, size_t length)
+static inline int os_munmap(void *addr, size_t length)
 {
    /* Copied from configure code generated by AC_SYS_LARGEFILE */
 #define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + \
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index e9da8954885..be8adcc6cf2 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -54,7 +54,7 @@ typedef thrd_t pipe_thread;
 #define PIPE_THREAD_ROUTINE( name, param ) \
    int name( void *param )
 
-static INLINE pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ), void *param )
+static inline pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ), void *param )
 {
    pipe_thread thread;
 #ifdef HAVE_PTHREAD
@@ -75,17 +75,17 @@ static INLINE pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ),
    return thread;
 }
 
-static INLINE int pipe_thread_wait( pipe_thread thread )
+static inline int pipe_thread_wait( pipe_thread thread )
 {
    return thrd_join( thread, NULL );
 }
 
-static INLINE int pipe_thread_destroy( pipe_thread thread )
+static inline int pipe_thread_destroy( pipe_thread thread )
 {
    return thrd_detach( thread );
 }
 
-static INLINE void pipe_thread_setname( const char *name )
+static inline void pipe_thread_setname( const char *name )
 {
 #if defined(HAVE_PTHREAD)
 #  if defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
@@ -145,17 +145,17 @@ typedef cnd_t pipe_condvar;
 
 typedef pthread_barrier_t pipe_barrier;
 
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+static inline void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
 {
    pthread_barrier_init(barrier, NULL, count);
 }
 
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+static inline void pipe_barrier_destroy(pipe_barrier *barrier)
 {
    pthread_barrier_destroy(barrier);
 }
 
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+static inline void pipe_barrier_wait(pipe_barrier *barrier)
 {
    pthread_barrier_wait(barrier);
 }
@@ -171,7 +171,7 @@ typedef struct {
    pipe_condvar condvar;
 } pipe_barrier;
 
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+static inline void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
 {
    barrier->count = count;
    barrier->waiters = 0;
@@ -180,14 +180,14 @@ static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
    pipe_condvar_init(barrier->condvar);
 }
 
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+static inline void pipe_barrier_destroy(pipe_barrier *barrier)
 {
    assert(barrier->waiters == 0);
    pipe_mutex_destroy(barrier->mutex);
    pipe_condvar_destroy(barrier->condvar);
 }
 
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+static inline void pipe_barrier_wait(pipe_barrier *barrier)
 {
    pipe_mutex_lock(barrier->mutex);
 
@@ -225,7 +225,7 @@ typedef struct
 } pipe_semaphore;
 
 
-static INLINE void
+static inline void
 pipe_semaphore_init(pipe_semaphore *sema, int init_val)
 {
    pipe_mutex_init(sema->mutex);
@@ -233,7 +233,7 @@ pipe_semaphore_init(pipe_semaphore *sema, int init_val)
    sema->counter = init_val;
 }
 
-static INLINE void
+static inline void
 pipe_semaphore_destroy(pipe_semaphore *sema)
 {
    pipe_mutex_destroy(sema->mutex);
@@ -241,7 +241,7 @@ pipe_semaphore_destroy(pipe_semaphore *sema)
 }
 
 /** Signal/increment semaphore counter */
-static INLINE void
+static inline void
 pipe_semaphore_signal(pipe_semaphore *sema)
 {
    pipe_mutex_lock(sema->mutex);
@@ -251,7 +251,7 @@ pipe_semaphore_signal(pipe_semaphore *sema)
 }
 
 /** Wait for semaphore counter to be greater than zero */
-static INLINE void
+static inline void
 pipe_semaphore_wait(pipe_semaphore *sema)
 {
    pipe_mutex_lock(sema->mutex);
@@ -277,7 +277,7 @@ typedef struct {
 #define PIPE_TSD_INIT_MAGIC 0xff8adc98
 
 
-static INLINE void
+static inline void
 pipe_tsd_init(pipe_tsd *tsd)
 {
    if (tss_create(&tsd->key, NULL/*free*/) != 0) {
@@ -286,7 +286,7 @@ pipe_tsd_init(pipe_tsd *tsd)
    tsd->initMagic = PIPE_TSD_INIT_MAGIC;
 }
 
-static INLINE void *
+static inline void *
 pipe_tsd_get(pipe_tsd *tsd)
 {
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
@@ -295,7 +295,7 @@ pipe_tsd_get(pipe_tsd *tsd)
    return tss_get(tsd->key);
 }
 
-static INLINE void
+static inline void
 pipe_tsd_set(pipe_tsd *tsd, void *value)
 {
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
index e786ee1885b..9312e028809 100644
--- a/src/gallium/auxiliary/os/os_time.h
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -60,7 +60,7 @@ os_time_get_nano(void);
 /*
  * Get the current time in microseconds from an unknown base.
  */
-static INLINE int64_t
+static inline int64_t
 os_time_get(void)
 {
    return os_time_get_nano() / 1000;
@@ -83,7 +83,7 @@ os_time_sleep(int64_t usecs);
  *
  * Returns true if the current time has elapsed beyond the specified interval.
  */
-static INLINE boolean
+static inline boolean
 os_time_timeout(int64_t start,
                 int64_t end,
                 int64_t curr)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 03bdce31513..ba48d461d5c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -158,7 +158,7 @@ struct pb_vtbl
 
 /* Accessor functions for pb->vtbl:
  */
-static INLINE void *
+static inline void *
 pb_map(struct pb_buffer *buf, 
        unsigned flags, void *flush_ctx)
 {
@@ -170,7 +170,7 @@ pb_map(struct pb_buffer *buf,
 }
 
 
-static INLINE void 
+static inline void 
 pb_unmap(struct pb_buffer *buf)
 {
    assert(buf);
@@ -181,7 +181,7 @@ pb_unmap(struct pb_buffer *buf)
 }
 
 
-static INLINE void
+static inline void
 pb_get_base_buffer( struct pb_buffer *buf,
 		    struct pb_buffer **base_buf,
 		    pb_size *offset )
@@ -200,7 +200,7 @@ pb_get_base_buffer( struct pb_buffer *buf,
 }
 
 
-static INLINE enum pipe_error 
+static inline enum pipe_error 
 pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags)
 {
    assert(buf);
@@ -211,7 +211,7 @@ pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags)
 }
 
 
-static INLINE void 
+static inline void 
 pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence)
 {
    assert(buf);
@@ -222,7 +222,7 @@ pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence)
 }
 
 
-static INLINE void 
+static inline void 
 pb_destroy(struct pb_buffer *buf)
 {
    assert(buf);
@@ -232,7 +232,7 @@ pb_destroy(struct pb_buffer *buf)
    buf->vtbl->destroy(buf);
 }
 
-static INLINE void
+static inline void
 pb_reference(struct pb_buffer **dst,
              struct pb_buffer *src)
 {
@@ -248,7 +248,7 @@ pb_reference(struct pb_buffer **dst,
  * Utility function to check whether the provided alignment is consistent with
  * the requested or not.
  */
-static INLINE boolean
+static inline boolean
 pb_check_alignment(pb_size requested, pb_size provided)
 {
    if(!requested)
@@ -265,7 +265,7 @@ pb_check_alignment(pb_size requested, pb_size provided)
  * Utility function to check whether the provided alignment is consistent with
  * the requested or not.
  */
-static INLINE boolean
+static inline boolean
 pb_check_usage(unsigned requested, unsigned provided)
 {
    return (requested & provided) == requested ? TRUE : FALSE;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index fc81e11b972..08935b4dec7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -149,7 +149,7 @@ struct fenced_buffer
 };
 
 
-static INLINE struct fenced_manager *
+static inline struct fenced_manager *
 fenced_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -157,7 +157,7 @@ fenced_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct fenced_buffer *
+static inline struct fenced_buffer *
 fenced_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -240,7 +240,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 }
 
 
-static INLINE void
+static inline void
 fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
                              struct fenced_buffer *fenced_buf)
 {
@@ -265,7 +265,7 @@ fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
  *
  * Reference count should be incremented before calling this function.
  */
-static INLINE void
+static inline void
 fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
@@ -289,7 +289,7 @@ fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
  *
  * Returns TRUE if the buffer was detroyed.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -326,7 +326,7 @@ fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
  * This function will release and re-acquire the mutex, so any copy of mutable
  * state must be discarded after calling it.
  */
-static INLINE enum pipe_error
+static inline enum pipe_error
 fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -550,7 +550,7 @@ fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
  * This function is a shorthand around pb_manager::create_buffer for
  * fenced_buffer_create_gpu_storage_locked()'s benefit.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
                                             struct fenced_buffer *fenced_buf)
 {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index bf1a538bf79..b97771457d6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -49,7 +49,7 @@ struct malloc_buffer
 
 extern const struct pb_vtbl malloc_buffer_vtbl;
 
-static INLINE struct malloc_buffer *
+static inline struct malloc_buffer *
 malloc_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 62df2a6b9de..47cbaeb20ac 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -50,7 +50,7 @@ struct pb_alt_manager
 };
 
 
-static INLINE struct pb_alt_manager *
+static inline struct pb_alt_manager *
 pb_alt_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 5023687ec04..3b35049f679 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -88,7 +88,7 @@ struct pb_cache_manager
 };
 
 
-static INLINE struct pb_cache_buffer *
+static inline struct pb_cache_buffer *
 pb_cache_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -96,7 +96,7 @@ pb_cache_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_cache_manager *
+static inline struct pb_cache_manager *
 pb_cache_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -107,7 +107,7 @@ pb_cache_manager(struct pb_manager *mgr)
 /**
  * Actually destroy the buffer.
  */
-static INLINE void
+static inline void
 _pb_cache_buffer_destroy(struct pb_cache_buffer *buf)
 {
    struct pb_cache_manager *mgr = buf->mgr;
@@ -235,7 +235,7 @@ pb_cache_buffer_vtbl = {
 };
 
 
-static INLINE int
+static inline int
 pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,  
                           pb_size size,
                           const struct pb_desc *desc)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 6236afb70d1..7ad70f293a6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -99,7 +99,7 @@ struct pb_debug_manager
 };
 
 
-static INLINE struct pb_debug_buffer *
+static inline struct pb_debug_buffer *
 pb_debug_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -107,7 +107,7 @@ pb_debug_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_debug_manager *
+static inline struct pb_debug_manager *
 pb_debug_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -123,7 +123,7 @@ static const uint8_t random_pattern[32] = {
 };
 
 
-static INLINE void 
+static inline void 
 fill_random_pattern(uint8_t *dst, pb_size size)
 {
    pb_size i = 0;
@@ -134,7 +134,7 @@ fill_random_pattern(uint8_t *dst, pb_size size)
 }
 
 
-static INLINE boolean 
+static inline boolean 
 check_random_pattern(const uint8_t *dst, pb_size size, 
                      pb_size *min_ofs, pb_size *max_ofs) 
 {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 84eb6edda34..72099ba5850 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -65,7 +65,7 @@ struct mm_pb_manager
 };
 
 
-static INLINE struct mm_pb_manager *
+static inline struct mm_pb_manager *
 mm_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -83,7 +83,7 @@ struct mm_buffer
 };
 
 
-static INLINE struct mm_buffer *
+static inline struct mm_buffer *
 mm_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
index 77e642ada08..c20e2dca02d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
@@ -70,7 +70,7 @@ struct pb_ondemand_manager
 
 extern const struct pb_vtbl pb_ondemand_buffer_vtbl;
 
-static INLINE struct pb_ondemand_buffer *
+static inline struct pb_ondemand_buffer *
 pb_ondemand_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -80,7 +80,7 @@ pb_ondemand_buffer(struct pb_buffer *buf)
    return (struct pb_ondemand_buffer *)buf;
 }
 
-static INLINE struct pb_ondemand_manager *
+static inline struct pb_ondemand_manager *
 pb_ondemand_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 51525b0f97c..56a5e82ece0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -73,7 +73,7 @@ struct pool_pb_manager
 };
 
 
-static INLINE struct pool_pb_manager *
+static inline struct pool_pb_manager *
 pool_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -93,7 +93,7 @@ struct pool_buffer
 };
 
 
-static INLINE struct pool_buffer *
+static inline struct pool_buffer *
 pool_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 6a62b4f5fdb..aadeaa087f4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -163,7 +163,7 @@ struct pb_slab_range_manager
 };
 
 
-static INLINE struct pb_slab_buffer *
+static inline struct pb_slab_buffer *
 pb_slab_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -171,7 +171,7 @@ pb_slab_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_slab_manager *
+static inline struct pb_slab_manager *
 pb_slab_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -179,7 +179,7 @@ pb_slab_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct pb_slab_range_manager *
+static inline struct pb_slab_range_manager *
 pb_slab_range_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f9637889187..27ee8f1242a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -510,7 +510,7 @@ void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm )
 /**
  * Immediate group 1 instructions.
  */
-static INLINE void 
+static inline void 
 x86_group1_imm( struct x86_function *p, 
                 unsigned op, struct x86_reg dst, int imm )
 {
@@ -2196,7 +2196,7 @@ void x86_release_func( struct x86_function *p )
 }
 
 
-static INLINE x86_func
+static inline x86_func
 voidptr_to_x86_func(void *v)
 {
    union {
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 498ca824cd1..b44d917cd43 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -136,7 +136,7 @@ enum x86_target
 };
 
 /* make this read a member of x86_function if target != host is desired */
-static INLINE enum x86_target x86_target( struct x86_function* p )
+static inline enum x86_target x86_target( struct x86_function* p )
 {
 #ifdef PIPE_ARCH_X86
    return X86_32;
@@ -147,7 +147,7 @@ static INLINE enum x86_target x86_target( struct x86_function* p )
 #endif
 }
 
-static INLINE unsigned x86_target_caps( struct x86_function* p )
+static inline unsigned x86_target_caps( struct x86_function* p )
 {
    return p->caps;
 }
diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
index 0648e596549..d353ab81e34 100644
--- a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -27,7 +27,7 @@
  * TODO: Audit the following *screen_create() - all of
  * them should return the original screen on failuire.
  */
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 debug_screen_wrap(struct pipe_screen *screen)
 {
 #if defined(GALLIUM_RBUG)
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index d8cee2b2917..5f46552f6c3 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -20,7 +20,7 @@
 #endif
 
 
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 {
    struct pipe_screen *screen = NULL;
@@ -39,7 +39,7 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 }
 
 
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_create(struct sw_winsys *winsys)
 {
    const char *default_driver;
@@ -71,7 +71,7 @@ PUBLIC const __DRIextension **__driDriverGetExtensions_swrast(void)
    return galliumsw_driver_extensions;
 }
 
-INLINE struct pipe_screen *
+inline struct pipe_screen *
 drisw_create_screen(struct drisw_loader_funcs *lf)
 {
    struct sw_winsys *winsys = NULL;
@@ -98,7 +98,7 @@ drisw_create_screen(struct drisw_loader_funcs *lf)
 
 extern struct pipe_screen *ninesw_create_screen(struct pipe_screen *screen);
 
-INLINE struct pipe_screen *
+inline struct pipe_screen *
 ninesw_create_screen(struct pipe_screen *pscreen)
 {
    struct sw_winsys *winsys = NULL;
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
index 0a2e215352b..4f38ba9f919 100644
--- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -9,7 +9,7 @@
  * Try to wrap a hw screen with a software screen.
  * On failure will return given screen.
  */
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_wrap(struct pipe_screen *screen)
 {
 #if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 44000ffdb6c..75cd0d53c5a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -735,7 +735,7 @@ static const union tgsi_exec_channel M128Vec = {
  * not lead to crashes, etc.  But when debugging, it's helpful to catch
  * them.
  */
-static INLINE void
+static inline void
 check_inf_or_nan(const union tgsi_exec_channel *chan)
 {
    assert(!util_is_inf_or_nan((chan)->f[0]));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index e8ee2565831..5d56aab2216 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -386,7 +386,7 @@ boolean
 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);
 
 
-static INLINE void
+static inline void
 tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
 {
    mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
@@ -395,7 +395,7 @@ tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
 
 
 /** Set execution mask values prior to executing the shader */
-static INLINE void
+static inline void
 tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
                    boolean ch0, boolean ch1, boolean ch2, boolean ch3)
 {
@@ -414,7 +414,7 @@ tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
                                const unsigned *buf_sizes);
 
 
-static INLINE int
+static inline int
 tgsi_exec_get_shader_param(enum pipe_shader_cap param)
 {
    switch(param) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 8da42b98468..fb29ea0d53d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -316,7 +316,7 @@ tgsi_get_processor_name( uint processor )
  *
  * MOV and UCMP is special so return VOID
  */
-static INLINE enum tgsi_opcode_type
+static inline enum tgsi_opcode_type
 tgsi_opcode_infer_type( uint opcode )
 {
    switch (opcode) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index b8873b24241..0729b5d2426 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -69,7 +69,7 @@ tgsi_parse_end_of_tokens(
  * warnings.  The warnings seem harmless on x86 but on PPC they cause
  * real failures.
  */
-static INLINE void
+static inline void
 copy_token(void *dst, const void *src)
 {
    memcpy(dst, src, 4);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index cd4b2afdb8b..35e1c7cfd62 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -133,7 +133,7 @@ void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx );
 
-static INLINE unsigned
+static inline unsigned
 tgsi_num_tokens(const struct tgsi_token *tokens)
 {
    struct tgsi_header header;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index be4851f5dcb..d14372feb30 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -63,7 +63,7 @@ struct sanity_check_ctx
    boolean print;
 };
 
-static INLINE unsigned
+static inline unsigned
 scan_register_key(const scan_register *reg)
 {
    unsigned key = reg->file;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 6b6a14f55f5..8271ea08177 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -203,7 +203,7 @@ const char *tgsi_immediate_type_names[4] =
 };
 
 
-static INLINE void
+static inline void
 tgsi_strings_check(void)
 {
    STATIC_ASSERT(Elements(tgsi_semantic_names) == TGSI_SEMANTIC_COUNT);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 39d7688ab3b..ceb7c2e0f46 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -94,7 +94,7 @@ struct tgsi_transform_context
 /**
  * Helper for emitting temporary register declarations.
  */
-static INLINE void
+static inline void
 tgsi_transform_temp_decl(struct tgsi_transform_context *ctx,
                          unsigned index)
 {
@@ -108,7 +108,7 @@ tgsi_transform_temp_decl(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_input_decl(struct tgsi_transform_context *ctx,
                           unsigned index,
                           unsigned sem_name, unsigned sem_index,
@@ -130,7 +130,7 @@ tgsi_transform_input_decl(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_sampler_decl(struct tgsi_transform_context *ctx,
                             unsigned index)
 {
@@ -143,7 +143,7 @@ tgsi_transform_sampler_decl(struct tgsi_transform_context *ctx,
    ctx->emit_declaration(ctx, &decl);
 }
 
-static INLINE void
+static inline void
 tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
                                  unsigned index,
                                  unsigned target,
@@ -165,7 +165,7 @@ tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
    ctx->emit_declaration(ctx, &decl);
 }
 
-static INLINE void
+static inline void
 tgsi_transform_immediate_decl(struct tgsi_transform_context *ctx,
                               float x, float y, float z, float w)
 {
@@ -186,7 +186,7 @@ tgsi_transform_immediate_decl(struct tgsi_transform_context *ctx,
 /**
  * Helper for emitting 1-operand instructions.
  */
-static INLINE void
+static inline void
 tgsi_transform_op1_inst(struct tgsi_transform_context *ctx,
                         unsigned opcode,
                         unsigned dst_file,
@@ -211,7 +211,7 @@ tgsi_transform_op1_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
                         unsigned opcode,
                         unsigned dst_file,
@@ -240,7 +240,7 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -282,7 +282,7 @@ tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op2_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -333,7 +333,7 @@ tgsi_transform_op2_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op3_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -395,7 +395,7 @@ tgsi_transform_op3_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
                          unsigned src_file,
                          unsigned src_index,
@@ -419,7 +419,7 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
                            unsigned dst_file,
                            unsigned dst_index,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 201a849ef95..3d213195090 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1830,7 +1830,7 @@ void ureg_free_tokens( const struct tgsi_token *tokens )
 }
 
 
-static INLINE unsigned
+static inline unsigned
 pipe_shader_from_tgsi_processor(unsigned processor)
 {
    switch (processor) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 1891b068774..0aae550d60a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -140,7 +140,7 @@ ureg_destroy( struct ureg_program * );
 /***********************************************************************
  * Convenience routine:
  */
-static INLINE void *
+static inline void *
 ureg_create_shader_with_so_and_destroy( struct ureg_program *p,
 			struct pipe_context *pipe,
 			const struct pipe_stream_output_info *so )
@@ -150,7 +150,7 @@ ureg_create_shader_with_so_and_destroy( struct ureg_program *p,
    return result;
 }
 
-static INLINE void *
+static inline void *
 ureg_create_shader_and_destroy( struct ureg_program *p,
                                 struct pipe_context *pipe )
 {
@@ -180,7 +180,7 @@ ureg_DECL_fs_input_cyl_centroid(struct ureg_program *,
                        unsigned array_id,
                        unsigned array_size);
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
                        unsigned semantic_name,
                        unsigned semantic_index,
@@ -195,7 +195,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
                                  0, 0, 1);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_DECL_fs_input(struct ureg_program *ureg,
                    unsigned semantic_name,
                    unsigned semantic_index,
@@ -328,7 +328,7 @@ ureg_DECL_sampler_view(struct ureg_program *,
                        unsigned return_type_w );
 
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
                        float a, float b,
                        float c, float d)
@@ -341,7 +341,7 @@ ureg_imm4f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3f( struct ureg_program *ureg,
                        float a, float b,
                        float c)
@@ -353,7 +353,7 @@ ureg_imm3f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2f( struct ureg_program *ureg,
                        float a, float b)
 {
@@ -363,7 +363,7 @@ ureg_imm2f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1f( struct ureg_program *ureg,
                        float a)
 {
@@ -372,7 +372,7 @@ ureg_imm1f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 1 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4u( struct ureg_program *ureg,
             unsigned a, unsigned b,
             unsigned c, unsigned d)
@@ -385,7 +385,7 @@ ureg_imm4u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3u( struct ureg_program *ureg,
             unsigned a, unsigned b,
             unsigned c)
@@ -397,7 +397,7 @@ ureg_imm3u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2u( struct ureg_program *ureg,
             unsigned a, unsigned b)
 {
@@ -407,14 +407,14 @@ ureg_imm2u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1u( struct ureg_program *ureg,
             unsigned a)
 {
    return ureg_DECL_immediate_uint( ureg, &a, 1 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4i( struct ureg_program *ureg,
             int a, int b,
             int c, int d)
@@ -427,7 +427,7 @@ ureg_imm4i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3i( struct ureg_program *ureg,
             int a, int b,
             int c)
@@ -439,7 +439,7 @@ ureg_imm3i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2i( struct ureg_program *ureg,
             int a, int b)
 {
@@ -449,7 +449,7 @@ ureg_imm2i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1i( struct ureg_program *ureg,
             int a)
 {
@@ -459,7 +459,7 @@ ureg_imm1i( struct ureg_program *ureg,
 /* Where the destination register has a valid file, but an empty
  * writemask.
  */
-static INLINE boolean
+static inline boolean
 ureg_dst_is_empty( struct ureg_dst dst )
 {
    return dst.File != TGSI_FILE_NULL &&
@@ -573,7 +573,7 @@ ureg_fixup_insn_size(struct ureg_program *ureg,
 
 
 #define OP00( op )                                              \
-static INLINE void ureg_##op( struct ureg_program *ureg )       \
+static inline void ureg_##op( struct ureg_program *ureg )       \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
    struct ureg_emit_insn_result insn;                           \
@@ -592,7 +592,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg )       \
 }
 
 #define OP01( op )                                              \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src )             \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
@@ -613,7 +613,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP00_LBL( op )                                          \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )           \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
@@ -634,7 +634,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP01_LBL( op )                                          \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src,              \
                               unsigned *label_token )          \
 {                                                               \
@@ -657,7 +657,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP10( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
@@ -681,7 +681,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP11( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src )                     \
 {                                                                       \
@@ -706,7 +706,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1 )                    \
@@ -733,7 +733,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12_TEX( op )                                                  \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               unsigned target,                          \
                               struct ureg_src src0,                     \
@@ -762,7 +762,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1 )                    \
@@ -791,7 +791,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP13( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -820,7 +820,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP13_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -851,7 +851,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP14_TEX( op )                                                  \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               unsigned target,                          \
                               struct ureg_src src0,                     \
@@ -884,7 +884,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP14_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -918,7 +918,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP14( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -950,7 +950,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP15( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -983,7 +983,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP15_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -1026,7 +1026,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 /***********************************************************************
  * Inline helpers for manipulating register structs:
  */
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_negate( struct ureg_src reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1034,7 +1034,7 @@ ureg_negate( struct ureg_src reg )
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_abs( struct ureg_src reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1043,7 +1043,7 @@ ureg_abs( struct ureg_src reg )
    return reg;
 }
 
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_swizzle( struct ureg_src reg, 
               int x, int y, int z, int w )
 {
@@ -1065,13 +1065,13 @@ ureg_swizzle( struct ureg_src reg,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_scalar( struct ureg_src reg, int x )
 {
    return ureg_swizzle(reg, x, x, x, x);
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_writemask( struct ureg_dst reg,
                 unsigned writemask )
 {
@@ -1080,7 +1080,7 @@ ureg_writemask( struct ureg_dst reg,
    return reg;
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_saturate( struct ureg_dst reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1088,7 +1088,7 @@ ureg_saturate( struct ureg_dst reg )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_predicate(struct ureg_dst reg,
                boolean negate,
                unsigned swizzle_x,
@@ -1106,7 +1106,7 @@ ureg_predicate(struct ureg_dst reg,
    return reg;
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1118,7 +1118,7 @@ ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
    return reg;
 }
 
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1130,7 +1130,7 @@ ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_dimension( struct ureg_dst reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1140,7 +1140,7 @@ ureg_dst_dimension( struct ureg_dst reg, int index )
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_dimension( struct ureg_src reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1150,7 +1150,7 @@ ureg_src_dimension( struct ureg_src reg, int index )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_dimension_indirect( struct ureg_dst reg, struct ureg_src addr,
                              int index )
 {
@@ -1164,7 +1164,7 @@ ureg_dst_dimension_indirect( struct ureg_dst reg, struct ureg_src addr,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
                              int index )
 {
@@ -1178,21 +1178,21 @@ ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_array_offset(struct ureg_src reg, int offset)
 {
    reg.Index += offset;
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_array_offset( struct ureg_dst reg, int offset )
 {
    reg.Index += offset;
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_array_register(unsigned file,
                         unsigned index,
                         unsigned array_id)
@@ -1224,14 +1224,14 @@ ureg_dst_array_register(unsigned file,
    return dst;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_register(unsigned file,
                   unsigned index)
 {
    return ureg_dst_array_register(file, index, 0);
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
    struct ureg_dst dst;
@@ -1265,7 +1265,7 @@ ureg_dst( struct ureg_src src )
    return dst;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_array_register(unsigned file,
                         unsigned index,
                         unsigned array_id)
@@ -1295,14 +1295,14 @@ ureg_src_array_register(unsigned file,
    return src;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_register(unsigned file,
                   unsigned index)
 {
    return ureg_src_array_register(file, index, 0);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src( struct ureg_dst dst )
 {
    struct ureg_src src;
@@ -1332,7 +1332,7 @@ ureg_src( struct ureg_dst dst )
 
 
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_undef( void )
 {
    struct ureg_dst dst;
@@ -1362,7 +1362,7 @@ ureg_dst_undef( void )
    return dst;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_undef( void )
 {
    struct ureg_src src;
@@ -1390,13 +1390,13 @@ ureg_src_undef( void )
    return src;
 }
 
-static INLINE boolean
+static inline boolean
 ureg_src_is_undef( struct ureg_src src )
 {
    return src.File == TGSI_FILE_NULL;
 }
 
-static INLINE boolean
+static inline boolean
 ureg_dst_is_undef( struct ureg_dst dst )
 {
    return dst.File == TGSI_FILE_NULL;
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 7fe8ff8145f..d77561aa7ce 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -130,12 +130,12 @@ struct translate *translate_create( const struct translate_key *key );
 
 boolean translate_is_output_format_supported(enum pipe_format format);
 
-static INLINE int translate_keysize( const struct translate_key *key )
+static inline int translate_keysize( const struct translate_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element);
 }
 
-static INLINE int translate_key_compare( const struct translate_key *a,
+static inline int translate_key_compare( const struct translate_key *a,
                                          const struct translate_key *b )
 {
    int keysize_a = translate_keysize(a);
@@ -148,7 +148,7 @@ static INLINE int translate_key_compare( const struct translate_key *a,
 }
 
 
-static INLINE void translate_key_sanitize( struct translate_key *a )
+static inline void translate_key_sanitize( struct translate_key *a )
 {
    int keysize = translate_keysize(a);
    char *ptr = (char *)a;
diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c
index bb8bdcb58c4..2bed02a454b 100644
--- a/src/gallium/auxiliary/translate/translate_cache.c
+++ b/src/gallium/auxiliary/translate/translate_cache.c
@@ -49,7 +49,7 @@ struct translate_cache * translate_cache_create( void )
 }
 
 
-static INLINE void delete_translates(struct translate_cache *cache)
+static inline void delete_translates(struct translate_cache *cache)
 {
    struct cso_hash *hash = cache->hash;
    struct cso_hash_iter iter = cso_hash_first_node(hash);
@@ -70,14 +70,14 @@ void translate_cache_destroy(struct translate_cache *cache)
 }
 
 
-static INLINE unsigned translate_hash_key_size(struct translate_key *key)
+static inline unsigned translate_hash_key_size(struct translate_key *key)
 {
    unsigned size = sizeof(struct translate_key) -
                    sizeof(struct translate_element) * (TRANSLATE_MAX_ATTRIBS - key->nr_elements);
    return size;
 }
 
-static INLINE unsigned create_key(struct translate_key *key)
+static inline unsigned create_key(struct translate_key *key)
 {
    unsigned hash_key;
    unsigned size = translate_hash_key_size(key);
diff --git a/src/gallium/auxiliary/util/u_bitmask.c b/src/gallium/auxiliary/util/u_bitmask.c
index 23c93a3ebcb..b19be29a5a4 100644
--- a/src/gallium/auxiliary/util/u_bitmask.c
+++ b/src/gallium/auxiliary/util/u_bitmask.c
@@ -85,7 +85,7 @@ util_bitmask_create(void)
 /**
  * Resize the bitmask if necessary 
  */
-static INLINE boolean
+static inline boolean
 util_bitmask_resize(struct util_bitmask *bm,
                     unsigned minimum_index)
 {
@@ -131,7 +131,7 @@ util_bitmask_resize(struct util_bitmask *bm,
 /**
  * Lazily update the filled.
  */
-static INLINE void
+static inline void
 util_bitmask_filled_set(struct util_bitmask *bm,
                         unsigned index)
 {
@@ -144,7 +144,7 @@ util_bitmask_filled_set(struct util_bitmask *bm,
    }
 }
 
-static INLINE void
+static inline void
 util_bitmask_filled_unset(struct util_bitmask *bm,
                           unsigned index)
 {
diff --git a/src/gallium/auxiliary/util/u_blend.h b/src/gallium/auxiliary/util/u_blend.h
index 2485c34d418..4f969778972 100644
--- a/src/gallium/auxiliary/util/u_blend.h
+++ b/src/gallium/auxiliary/util/u_blend.h
@@ -9,7 +9,7 @@
  * garbage that's there. Return a blend factor that will take that into
  * account.
  */
-static INLINE int
+static inline int
 util_blend_dst_alpha_to_one(int factor)
 {
    switch (factor) {
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index e3f30557a03..0ba00a3997a 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -158,7 +158,7 @@ util_destroy_blit(struct blit_state *ctx)
 /**
  * Helper function to set the fragment shaders.
  */
-static INLINE void
+static inline void
 set_fragment_shader(struct blit_state *ctx, uint writemask,
                     enum pipe_format format,
                     enum pipe_texture_target pipe_tex)
@@ -194,7 +194,7 @@ set_fragment_shader(struct blit_state *ctx, uint writemask,
 /**
  * Helper function to set the vertex shader.
  */
-static INLINE void
+static inline void
 set_vertex_shader(struct blit_state *ctx)
 {
    /* vertex shader - still required to provide the linkage between
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index b5ef9a23966..85206eab1a7 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -938,7 +938,7 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
                                     enum pipe_texture_target target,
                                     unsigned nr_samples)
@@ -976,7 +976,7 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
                                            enum pipe_texture_target target,
                                            unsigned nr_samples)
@@ -1014,7 +1014,7 @@ void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_stencil(struct blitter_context_priv *ctx,
                                       enum pipe_texture_target target,
                                       unsigned nr_samples)
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 93b0e513bd0..0cd173d6284 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -143,7 +143,7 @@ void util_blitter_cache_all_shaders(struct blitter_context *blitter);
 /**
  * Return the pipe context associated with a blitter context.
  */
-static INLINE
+static inline
 struct pipe_context *util_blitter_get_pipe(struct blitter_context *blitter)
 {
    return blitter->pipe;
@@ -371,77 +371,77 @@ void util_blitter_custom_resolve_color(struct blitter_context *blitter,
  *
  * States not listed here are not affected by util_blitter. */
 
-static INLINE
+static inline
 void util_blitter_save_blend(struct blitter_context *blitter,
                              void *state)
 {
    blitter->saved_blend_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
                                            void *state)
 {
    blitter->saved_dsa_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_vertex_elements(struct blitter_context *blitter,
                                        void *state)
 {
    blitter->saved_velem_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_stencil_ref(struct blitter_context *blitter,
                                    const struct pipe_stencil_ref *state)
 {
    blitter->saved_stencil_ref = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_rasterizer(struct blitter_context *blitter,
                                   void *state)
 {
    blitter->saved_rs_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_fragment_shader(struct blitter_context *blitter,
                                        void *fs)
 {
    blitter->saved_fs = fs;
 }
 
-static INLINE
+static inline
 void util_blitter_save_vertex_shader(struct blitter_context *blitter,
                                      void *vs)
 {
    blitter->saved_vs = vs;
 }
 
-static INLINE
+static inline
 void util_blitter_save_geometry_shader(struct blitter_context *blitter,
                                        void *gs)
 {
    blitter->saved_gs = gs;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_tessctrl_shader(struct blitter_context *blitter,
                                   void *sh)
 {
    blitter->saved_tcs = sh;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_tesseval_shader(struct blitter_context *blitter,
                                   void *sh)
 {
    blitter->saved_tes = sh;
 }
 
-static INLINE
+static inline
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
                                    const struct pipe_framebuffer_state *state)
 {
@@ -449,21 +449,21 @@ void util_blitter_save_framebuffer(struct blitter_context *blitter,
    util_copy_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
-static INLINE
+static inline
 void util_blitter_save_viewport(struct blitter_context *blitter,
                                 struct pipe_viewport_state *state)
 {
    blitter->saved_viewport = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_scissor(struct blitter_context *blitter,
                                struct pipe_scissor_state *state)
 {
    blitter->saved_scissor = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_fragment_sampler_states(
                   struct blitter_context *blitter,
                   unsigned num_sampler_states,
@@ -476,7 +476,7 @@ void util_blitter_save_fragment_sampler_states(
           num_sampler_states * sizeof(void *));
 }
 
-static INLINE void
+static inline void
 util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                          unsigned num_views,
                                          struct pipe_sampler_view **views)
@@ -490,7 +490,7 @@ util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                   views[i]);
 }
 
-static INLINE void
+static inline void
 util_blitter_save_vertex_buffer_slot(struct blitter_context *blitter,
                                      struct pipe_vertex_buffer *vertex_buffers)
 {
@@ -500,7 +500,7 @@ util_blitter_save_vertex_buffer_slot(struct blitter_context *blitter,
           sizeof(struct pipe_vertex_buffer));
 }
 
-static INLINE void
+static inline void
 util_blitter_save_so_targets(struct blitter_context *blitter,
                              unsigned num_targets,
                              struct pipe_stream_output_target **targets)
@@ -514,7 +514,7 @@ util_blitter_save_so_targets(struct blitter_context *blitter,
                                targets[i]);
 }
 
-static INLINE void
+static inline void
 util_blitter_save_sample_mask(struct blitter_context *blitter,
                               unsigned sample_mask)
 {
@@ -522,7 +522,7 @@ util_blitter_save_sample_mask(struct blitter_context *blitter,
    blitter->saved_sample_mask = sample_mask;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_render_condition(struct blitter_context *blitter,
                                    struct pipe_query *query,
                                    boolean condition,
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
index 520a3d596cb..66cf989a830 100644
--- a/src/gallium/auxiliary/util/u_box.h
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -4,7 +4,7 @@
 #include "pipe/p_state.h"
 #include "util/u_math.h"
 
-static INLINE
+static inline
 void u_box_1d( unsigned x,
 	       unsigned w,
 	       struct pipe_box *box )
@@ -17,7 +17,7 @@ void u_box_1d( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_2d( unsigned x,
 	       unsigned y,
 	       unsigned w,
@@ -32,7 +32,7 @@ void u_box_2d( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_origin_2d( unsigned w,
 		      unsigned h,
 		      struct pipe_box *box )
@@ -45,7 +45,7 @@ void u_box_origin_2d( unsigned w,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_2d_zslice( unsigned x,
 		      unsigned y,
 		      unsigned z,
@@ -61,7 +61,7 @@ void u_box_2d_zslice( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_3d( unsigned x,
 	       unsigned y,
 	       unsigned z,
@@ -86,7 +86,7 @@ void u_box_3d( unsigned x,
  *          3 if both width and height have been reduced.
  * Aliasing permitted.
  */
-static INLINE int
+static inline int
 u_box_clip_2d(struct pipe_box *dst,
               const struct pipe_box *box, int w, int h)
 {
@@ -129,14 +129,14 @@ u_box_clip_2d(struct pipe_box *dst,
    return res;
 }
 
-static INLINE int64_t
+static inline int64_t
 u_box_volume_3d(const struct pipe_box *box)
 {
    return (int64_t)box->width * box->height * box->depth;
 }
 
 /* Aliasing of @dst permitted. */
-static INLINE void
+static inline void
 u_box_union_2d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
@@ -148,7 +148,7 @@ u_box_union_2d(struct pipe_box *dst,
 }
 
 /* Aliasing of @dst permitted. */
-static INLINE void
+static inline void
 u_box_union_3d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
@@ -161,7 +161,7 @@ u_box_union_3d(struct pipe_box *dst,
    dst->depth = MAX2(a->z + a->depth, b->z + b->depth) - dst->z;
 }
 
-static INLINE boolean
+static inline boolean
 u_box_test_intersection_2d(const struct pipe_box *a,
                            const struct pipe_box *b)
 {
@@ -185,7 +185,7 @@ u_box_test_intersection_2d(const struct pipe_box *a,
    return TRUE;
 }
 
-static INLINE void
+static inline void
 u_box_minify_2d(struct pipe_box *dst,
                 const struct pipe_box *src, unsigned l)
 {
diff --git a/src/gallium/auxiliary/util/u_cache.c b/src/gallium/auxiliary/util/u_cache.c
index 9395c66f2f8..da0856981eb 100644
--- a/src/gallium/auxiliary/util/u_cache.c
+++ b/src/gallium/auxiliary/util/u_cache.c
@@ -155,7 +155,7 @@ util_cache_entry_get(struct util_cache *cache,
    return NULL;
 }
 
-static INLINE void
+static inline void
 util_cache_entry_destroy(struct util_cache *cache,
                          struct util_cache_entry *entry)
 {
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index af557be00bd..864d1302b4f 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -37,7 +37,7 @@
  * Clear the given buffers to the specified values.
  * No masking, no scissor (clear entire buffer).
  */
-static INLINE void
+static inline void
 util_clear(struct pipe_context *pipe,
            struct pipe_framebuffer_state *framebuffer, unsigned buffers,
            const union pipe_color_union *color, double depth, unsigned stencil)
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 23ab46c54bc..d1f9e978682 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -179,7 +179,7 @@ static int has_cpuid(void)
  * @sa cpuid.h included in gcc-4.3 onwards.
  * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
  */
-static INLINE void
+static inline void
 cpuid(uint32_t ax, uint32_t *p)
 {
 #if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
@@ -216,7 +216,7 @@ cpuid(uint32_t ax, uint32_t *p)
  * @sa cpuid.h included in gcc-4.4 onwards.
  * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
  */
-static INLINE void
+static inline void
 cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
 {
 #if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
@@ -250,7 +250,7 @@ cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
 }
 
 
-static INLINE uint64_t xgetbv(void)
+static inline uint64_t xgetbv(void)
 {
 #if defined(PIPE_CC_GCC)
    uint32_t eax, edx;
@@ -272,7 +272,7 @@ static INLINE uint64_t xgetbv(void)
 
 
 #if defined(PIPE_ARCH_X86)
-PIPE_ALIGN_STACK static INLINE boolean sse2_has_daz(void)
+PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
 {
    struct {
       uint32_t pad1[7];
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 3b2255244a7..b4286d32b32 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -58,7 +58,7 @@ extern "C" {
 void _debug_vprintf(const char *format, va_list ap);
    
 
-static INLINE void
+static inline void
 _debug_printf(const char *format, ...)
 {
    va_list ap;
@@ -78,10 +78,10 @@ _debug_printf(const char *format, ...)
  * that is guaranteed to be printed in all platforms)
  */
 #if !defined(PIPE_OS_HAIKU)
-static INLINE void
+static inline void
 debug_printf(const char *format, ...) _util_printf_format(1,2);
 
-static INLINE void
+static inline void
 debug_printf(const char *format, ...)
 {
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 747837cd148..3e7ecfa79f3 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -92,7 +92,7 @@ pipe_static_mutex(list_mutex);
 static unsigned long last_no = 0;
 
 
-static INLINE struct debug_memory_header *
+static inline struct debug_memory_header *
 header_from_data(void *data)
 {
    if(data)
@@ -101,7 +101,7 @@ header_from_data(void *data)
       return NULL;
 }
 
-static INLINE void *
+static inline void *
 data_from_header(struct debug_memory_header *hdr)
 {
    if(hdr)
@@ -110,7 +110,7 @@ data_from_header(struct debug_memory_header *hdr)
       return NULL;
 }
 
-static INLINE struct debug_memory_footer *
+static inline struct debug_memory_footer *
 footer_from_header(struct debug_memory_header *hdr)
 {
    if(hdr)
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h
index c02fba27ddf..1f9218fec9a 100644
--- a/src/gallium/auxiliary/util/u_debug_refcnt.h
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.h
@@ -42,7 +42,7 @@ extern int debug_refcnt_state;
 
 void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change);
 
-static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
 {
    if (debug_refcnt_state >= 0)
       debug_reference_slowpath(p, get_desc, change);
@@ -50,7 +50,7 @@ static INLINE void debug_reference(const struct pipe_reference* p, debug_referen
 
 #else
 
-static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
 {
 }
 
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c
index 542493252ce..10efdd593e5 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.c
+++ b/src/gallium/auxiliary/util/u_debug_symbol.c
@@ -146,7 +146,7 @@ DBGHELP_DISPATCH(SymGetLineFromAddr64,
 #undef DBGHELP_DISPATCH
 
 
-static INLINE boolean
+static inline boolean
 debug_symbol_name_dbghelp(const void *addr, char* buf, unsigned size)
 {
    DWORD64 dwAddr = (DWORD64)(uintptr_t)addr;
@@ -227,7 +227,7 @@ debug_symbol_name_dbghelp(const void *addr, char* buf, unsigned size)
  *
  * To fix this, post-process the output with tools/addr2line.sh
  */
-static INLINE boolean
+static inline boolean
 debug_symbol_name_glibc(const void *addr, char* buf, unsigned size)
 {
    char** syms = backtrace_symbols((void**)&addr, 1);
diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h
index d31f8b9170a..ccde8a8c115 100644
--- a/src/gallium/auxiliary/util/u_dirty_surfaces.h
+++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h
@@ -47,13 +47,13 @@ struct util_dirty_surface
    struct list_head dirty_list;
 };
 
-static INLINE void
+static inline void
 util_dirty_surfaces_init(struct util_dirty_surfaces *ds)
 {
    LIST_INITHEAD(&ds->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, util_dirty_surface_flush_t flush)
 {
    struct list_head *p, *next;
@@ -66,7 +66,7 @@ util_dirty_surfaces_use_for_sampling(struct pipe_context *pipe, struct util_dirt
    }
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_levels_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, unsigned first, unsigned last, util_dirty_surface_flush_t flush)
 {
    struct list_head *p, *next;
@@ -82,7 +82,7 @@ util_dirty_surfaces_use_levels_for_sampling(struct pipe_context *pipe, struct ut
    }
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util_dirty_surfaces *dss, struct pipe_sampler_view *psv, struct pipe_sampler_state *pss, util_dirty_surface_flush_t flush)
 {
    if(!LIST_IS_EMPTY(&dss->dirty_list))
@@ -90,26 +90,26 @@ util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util
 						  MIN2((unsigned)ceilf(pss->max_lod) + psv->u.tex.first_level, psv->u.tex.last_level), flush);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_init(struct util_dirty_surface *ds)
 {
    LIST_INITHEAD(&ds->dirty_list);
 }
 
-static INLINE boolean
+static inline boolean
 util_dirty_surface_is_dirty(struct util_dirty_surface *ds)
 {
    return !LIST_IS_EMPTY(&ds->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_set_dirty(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
 {
    if(LIST_IS_EMPTY(&ds->dirty_list))
       LIST_ADDTAIL(&ds->dirty_list, &dss->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_set_clean(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
 {
    if(!LIST_IS_EMPTY(&ds->dirty_list))
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
index 9fc3e9924e1..5c0880f6ce4 100644
--- a/src/gallium/auxiliary/util/u_draw.h
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -39,7 +39,7 @@ extern "C" {
 #endif
 
 
-static INLINE void
+static inline void
 util_draw_init_info(struct pipe_draw_info *info)
 {
    memset(info, 0, sizeof(*info));
@@ -48,7 +48,7 @@ util_draw_init_info(struct pipe_draw_info *info)
 }
 
 
-static INLINE void
+static inline void
 util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count)
 {
    struct pipe_draw_info info;
@@ -63,7 +63,7 @@ util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count)
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_elements(struct pipe_context *pipe, int index_bias,
                    uint mode, uint start, uint count)
 {
@@ -79,7 +79,7 @@ util_draw_elements(struct pipe_context *pipe, int index_bias,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_arrays_instanced(struct pipe_context *pipe,
                            uint mode, uint start, uint count,
                            uint start_instance,
@@ -99,7 +99,7 @@ util_draw_arrays_instanced(struct pipe_context *pipe,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_elements_instanced(struct pipe_context *pipe,
                              int index_bias,
                              uint mode, uint start, uint count,
@@ -120,7 +120,7 @@ util_draw_elements_instanced(struct pipe_context *pipe,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_range_elements(struct pipe_context *pipe,
                          int index_bias,
                          uint min_index,
diff --git a/src/gallium/auxiliary/util/u_dual_blend.h b/src/gallium/auxiliary/util/u_dual_blend.h
index e31d43c18bd..9450800f715 100644
--- a/src/gallium/auxiliary/util/u_dual_blend.h
+++ b/src/gallium/auxiliary/util/u_dual_blend.h
@@ -3,7 +3,7 @@
 
 #include "pipe/p_state.h"
 
-static INLINE boolean util_blend_factor_is_dual_src(int factor)
+static inline boolean util_blend_factor_is_dual_src(int factor)
 {
    return (factor == PIPE_BLENDFACTOR_SRC1_COLOR) ||
           (factor == PIPE_BLENDFACTOR_SRC1_ALPHA) ||
@@ -11,7 +11,7 @@ static INLINE boolean util_blend_factor_is_dual_src(int factor)
           (factor == PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
 }
 
-static INLINE boolean util_blend_state_is_dual(const struct pipe_blend_state *blend, 
+static inline boolean util_blend_state_is_dual(const struct pipe_blend_state *blend, 
 				  int index)
 {
    if (util_blend_factor_is_dual_src(blend->rt[index].rgb_src_factor) ||
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 88027cbbc79..58ccfba2dc4 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -39,7 +39,7 @@
  * Dump primitives
  */
 
-static INLINE void
+static inline void
 util_stream_writef(FILE *stream, const char *format, ...)
 {
    static char buf[1024];
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
index 980cadf22d1..7b7a093d824 100644
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -43,13 +43,13 @@ struct util_dynarray
    unsigned capacity;
 };
 
-static INLINE void
+static inline void
 util_dynarray_init(struct util_dynarray *buf)
 {
    memset(buf, 0, sizeof(*buf));
 }
 
-static INLINE void
+static inline void
 util_dynarray_fini(struct util_dynarray *buf)
 {
    if(buf->data)
@@ -60,7 +60,7 @@ util_dynarray_fini(struct util_dynarray *buf)
 }
 
 /* use util_dynarray_trim to reduce the allocated storage */
-static INLINE void *
+static inline void *
 util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
 {
    char *p;
@@ -78,13 +78,13 @@ util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
    return p;
 }
 
-static INLINE void *
+static inline void *
 util_dynarray_grow(struct util_dynarray *buf, int diff)
 {
    return util_dynarray_resize(buf, buf->size + diff);
 }
 
-static INLINE void
+static inline void
 util_dynarray_trim(struct util_dynarray *buf)
 {
    if (buf->size != buf->capacity) {
diff --git a/src/gallium/auxiliary/util/u_fifo.h b/src/gallium/auxiliary/util/u_fifo.h
index 9e007de1ada..a7aad6179d9 100644
--- a/src/gallium/auxiliary/util/u_fifo.h
+++ b/src/gallium/auxiliary/util/u_fifo.h
@@ -36,7 +36,7 @@ struct util_fifo
    size_t size;
 };
 
-static INLINE struct util_fifo *
+static inline struct util_fifo *
 u_fifo_create(size_t size)
 {
    struct util_fifo *fifo;
@@ -50,7 +50,7 @@ u_fifo_create(size_t size)
    return fifo;
 }
 
-static INLINE boolean
+static inline boolean
 u_fifo_add(struct util_fifo *fifo, void *ptr)
 {
    void **array = (void**)&fifo[1];
@@ -67,7 +67,7 @@ u_fifo_add(struct util_fifo *fifo, void *ptr)
    return TRUE;
 }
 
-static INLINE boolean
+static inline boolean
 u_fifo_pop(struct util_fifo *fifo, void **ptr)
 {
    void **array = (void**)&fifo[1];
@@ -85,7 +85,7 @@ u_fifo_pop(struct util_fifo *fifo, void **ptr)
    return TRUE;
 }
 
-static INLINE void
+static inline void
 u_fifo_destroy(struct util_fifo *fifo)
 {
    FREE(fifo);
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 621574c9673..42b39ff04fd 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -425,7 +425,7 @@ util_format_description(enum pipe_format format);
  * Format query functions.
  */
 
-static INLINE const char *
+static inline const char *
 util_format_name(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -438,7 +438,7 @@ util_format_name(enum pipe_format format)
    return desc->name;
 }
 
-static INLINE const char *
+static inline const char *
 util_format_short_name(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -454,7 +454,7 @@ util_format_short_name(enum pipe_format format)
 /**
  * Whether this format is plain, see UTIL_FORMAT_LAYOUT_PLAIN for more info.
  */
-static INLINE boolean
+static inline boolean
 util_format_is_plain(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -466,7 +466,7 @@ util_format_is_plain(enum pipe_format format)
    return desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ? TRUE : FALSE;
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_compressed(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -488,7 +488,7 @@ util_format_is_compressed(enum pipe_format format)
    }
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_s3tc(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -501,28 +501,28 @@ util_format_is_s3tc(enum pipe_format format)
    return desc->layout == UTIL_FORMAT_LAYOUT_S3TC ? TRUE : FALSE;
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_srgb(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_has_depth(const struct util_format_description *desc)
 {
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
           desc->swizzle[0] != UTIL_FORMAT_SWIZZLE_NONE;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_has_stencil(const struct util_format_description *desc)
 {
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
           desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_is_depth_or_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -536,7 +536,7 @@ util_format_is_depth_or_stencil(enum pipe_format format)
           util_format_has_stencil(desc);
 }
 
-static INLINE boolean
+static inline boolean
 util_format_is_depth_and_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -554,7 +554,7 @@ util_format_is_depth_and_stencil(enum pipe_format format)
 /**
  * Calculates the depth format type based upon the incoming format description.
  */
-static INLINE unsigned
+static inline unsigned
 util_get_depth_format_type(const struct util_format_description *desc)
 {
    unsigned depth_channel = desc->swizzle[0];
@@ -581,7 +581,7 @@ util_get_depth_format_mrd(const struct util_format_description *desc);
  * Return whether this is an RGBA, Z, S, or combined ZS format.
  * Useful for initializing pipe_blit_info::mask.
  */
-static INLINE unsigned
+static inline unsigned
 util_format_get_mask(enum pipe_format format)
 {
    const struct util_format_description *desc =
@@ -611,7 +611,7 @@ util_format_get_mask(enum pipe_format format)
  *
  * That is, the channels whose values are preserved.
  */
-static INLINE unsigned
+static inline unsigned
 util_format_colormask(const struct util_format_description *desc)
 {
    unsigned colormask;
@@ -643,7 +643,7 @@ util_format_colormask(const struct util_format_description *desc)
  * @param desc       a format description to check colormask with
  * @param colormask  a bit mask for channels, matches format of PIPE_MASK_RGBA
  */
-static INLINE boolean
+static inline boolean
 util_format_colormask_full(const struct util_format_description *desc, unsigned colormask)
 {
    return (~colormask & util_format_colormask(desc)) == 0;
@@ -709,7 +709,7 @@ util_format_is_supported(enum pipe_format format, unsigned bind);
  *
  *   PIPE_FORMAT_?8?8?8?8_UNORM
  */
-static INLINE boolean
+static inline boolean
 util_format_is_rgba8_variant(const struct util_format_description *desc)
 {
    unsigned chan;
@@ -737,7 +737,7 @@ util_format_is_rgba8_variant(const struct util_format_description *desc)
 /**
  * Return total bits needed for the pixel format per block.
  */
-static INLINE uint
+static inline uint
 util_format_get_blocksizebits(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -753,7 +753,7 @@ util_format_get_blocksizebits(enum pipe_format format)
 /**
  * Return bytes per block (not pixel) for the given format.
  */
-static INLINE uint
+static inline uint
 util_format_get_blocksize(enum pipe_format format)
 {
    uint bits = util_format_get_blocksizebits(format);
@@ -768,7 +768,7 @@ util_format_get_blocksize(enum pipe_format format)
    return bytes;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_blockwidth(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -781,7 +781,7 @@ util_format_get_blockwidth(enum pipe_format format)
    return desc->block.width;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_blockheight(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -794,7 +794,7 @@ util_format_get_blockheight(enum pipe_format format)
    return desc->block.height;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocksx(enum pipe_format format,
                          unsigned x)
 {
@@ -802,7 +802,7 @@ util_format_get_nblocksx(enum pipe_format format,
    return (x + blockwidth - 1) / blockwidth;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocksy(enum pipe_format format,
                          unsigned y)
 {
@@ -810,7 +810,7 @@ util_format_get_nblocksy(enum pipe_format format,
    return (y + blockheight - 1) / blockheight;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocks(enum pipe_format format,
                         unsigned width,
                         unsigned height)
@@ -818,14 +818,14 @@ util_format_get_nblocks(enum pipe_format format,
    return util_format_get_nblocksx(format, width) * util_format_get_nblocksy(format, height);
 }
 
-static INLINE size_t
+static inline size_t
 util_format_get_stride(enum pipe_format format,
                        unsigned width)
 {
    return util_format_get_nblocksx(format, width) * util_format_get_blocksize(format);
 }
 
-static INLINE size_t
+static inline size_t
 util_format_get_2d_size(enum pipe_format format,
                         size_t stride,
                         unsigned height)
@@ -833,7 +833,7 @@ util_format_get_2d_size(enum pipe_format format,
    return util_format_get_nblocksy(format, height) * stride;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_component_bits(enum pipe_format format,
                                enum util_format_colorspace colorspace,
                                uint component)
@@ -880,7 +880,7 @@ util_format_get_component_bits(enum pipe_format format,
  * Given a linear RGB colorspace format, return the corresponding SRGB
  * format, or PIPE_FORMAT_NONE if none.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_srgb(enum pipe_format format)
 {
    if (util_format_is_srgb(format))
@@ -930,7 +930,7 @@ util_format_srgb(enum pipe_format format)
  * Given an sRGB format, return the corresponding linear colorspace format.
  * For non sRGB formats, return the format unchanged.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_linear(enum pipe_format format)
 {
    switch (format) {
@@ -977,7 +977,7 @@ util_format_linear(enum pipe_format format)
  * Given a depth-stencil format, return the corresponding stencil-only format.
  * For stencil-only formats, return the format unchanged.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_stencil_only(enum pipe_format format)
 {
    switch (format) {
@@ -1006,7 +1006,7 @@ util_format_stencil_only(enum pipe_format format)
  * Converts PIPE_FORMAT_*I* to PIPE_FORMAT_*R*.
  * This is identity for non-intensity formats.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_intensity_to_red(enum pipe_format format)
 {
    switch (format) {
@@ -1044,7 +1044,7 @@ util_format_intensity_to_red(enum pipe_format format)
  * Converts PIPE_FORMAT_*L* to PIPE_FORMAT_*R*.
  * This is identity for non-luminance formats.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_luminance_to_red(enum pipe_format format)
 {
    switch (format) {
@@ -1122,7 +1122,7 @@ util_format_luminance_to_red(enum pipe_format format)
  * Return the number of components stored.
  * Formats with block size != 1x1 will always have 1 component (the block).
  */
-static INLINE unsigned
+static inline unsigned
 util_format_get_nr_components(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -1133,7 +1133,7 @@ util_format_get_nr_components(enum pipe_format format)
  * Return the index of the first non-void channel
  * -1 if no non-void channels
  */
-static INLINE int
+static inline int
 util_format_get_first_non_void_channel(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index d5138cc0577..fb42de723c4 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -616,7 +616,7 @@ def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type)
     print '{'
 
@@ -645,7 +645,7 @@ def generate_format_pack(format, src_channel, src_native_type, src_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type)
     print '{'
     
@@ -674,7 +674,7 @@ def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_fetch_%s(%s *dst, const uint8_t *src, unsigned i, unsigned j)' % (name, dst_suffix, dst_native_type)
     print '{'
 
diff --git a/src/gallium/auxiliary/util/u_format_r11g11b10f.h b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
index 57516c39c6e..218822b16e6 100644
--- a/src/gallium/auxiliary/util/u_format_r11g11b10f.h
+++ b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
@@ -45,7 +45,7 @@
 
 #define F32_INFINITY         0x7f800000
 
-static INLINE unsigned f32_to_uf11(float val)
+static inline unsigned f32_to_uf11(float val)
 {
    union {
       float f;
@@ -94,7 +94,7 @@ static INLINE unsigned f32_to_uf11(float val)
    return uf11;
 }
 
-static INLINE float uf11_to_f32(uint16_t val)
+static inline float uf11_to_f32(uint16_t val)
 {
    union {
       float f;
@@ -131,7 +131,7 @@ static INLINE float uf11_to_f32(uint16_t val)
    return f32.f;
 }
 
-static INLINE unsigned f32_to_uf10(float val)
+static inline unsigned f32_to_uf10(float val)
 {
    union {
       float f;
@@ -180,7 +180,7 @@ static INLINE unsigned f32_to_uf10(float val)
    return uf10;
 }
 
-static INLINE float uf10_to_f32(uint16_t val)
+static inline float uf10_to_f32(uint16_t val)
 {
    union {
       float f;
@@ -217,14 +217,14 @@ static INLINE float uf10_to_f32(uint16_t val)
    return f32.f;
 }
 
-static INLINE unsigned float3_to_r11g11b10f(const float rgb[3])
+static inline unsigned float3_to_r11g11b10f(const float rgb[3])
 {
    return ( f32_to_uf11(rgb[0]) & 0x7ff) |
           ((f32_to_uf11(rgb[1]) & 0x7ff) << 11) |
           ((f32_to_uf10(rgb[2]) & 0x3ff) << 22);
 }
 
-static INLINE void r11g11b10f_to_float3(unsigned rgb, float retval[3])
+static inline void r11g11b10f_to_float3(unsigned rgb, float retval[3])
 {
    retval[0] = uf11_to_f32( rgb        & 0x7ff);
    retval[1] = uf11_to_f32((rgb >> 11) & 0x7ff);
diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index c2a3f6f3e9d..973e542c536 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -73,7 +73,7 @@ typedef union {
    } field;
 } rgb9e5;
 
-static INLINE float rgb9e5_ClampRange(float x)
+static inline float rgb9e5_ClampRange(float x)
 {
    if (x > 0.0) {
       if (x >= MAX_RGB9E5) {
@@ -90,7 +90,7 @@ static INLINE float rgb9e5_ClampRange(float x)
 /* Ok, FloorLog2 is not correct for the denorm and zero values, but we
    are going to do a max of this value with the minimum rgb9e5 exponent
    that will hide these problem cases. */
-static INLINE int rgb9e5_FloorLog2(float x)
+static inline int rgb9e5_FloorLog2(float x)
 {
    float754 f;
 
@@ -98,7 +98,7 @@ static INLINE int rgb9e5_FloorLog2(float x)
    return (f.field.biasedexponent - 127);
 }
 
-static INLINE unsigned float3_to_rgb9e5(const float rgb[3])
+static inline unsigned float3_to_rgb9e5(const float rgb[3])
 {
    rgb9e5 retval;
    float maxrgb;
@@ -146,7 +146,7 @@ static INLINE unsigned float3_to_rgb9e5(const float rgb[3])
    return retval.raw;
 }
 
-static INLINE void rgb9e5_to_float3(unsigned rgb, float retval[3])
+static inline void rgb9e5_to_float3(unsigned rgb, float retval[3])
 {
    rgb9e5 v;
    int exponent;
diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c
index 7e05989e6a1..cd3e165d3f0 100644
--- a/src/gallium/auxiliary/util/u_format_s3tc.c
+++ b/src/gallium/auxiliary/util/u_format_s3tc.c
@@ -235,7 +235,7 @@ util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned
  * Block decompression.
  */
 
-static INLINE void
+static inline void
 util_format_dxtn_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                         const uint8_t *src_row, unsigned src_stride,
                                         unsigned width, unsigned height,
@@ -312,7 +312,7 @@ util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                            16, FALSE);
 }
 
-static INLINE void
+static inline void
 util_format_dxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
                                        const uint8_t *src_row, unsigned src_stride,
                                        unsigned width, unsigned height,
@@ -400,7 +400,7 @@ util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride,
  * Block compression.
  */
 
-static INLINE void
+static inline void
 util_format_dxtn_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                   const uint8_t *src, unsigned src_stride,
                                   unsigned width, unsigned height,
@@ -478,7 +478,7 @@ util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                      16, FALSE);
 }
 
-static INLINE void
+static inline void
 util_format_dxtn_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
                                  const float *src, unsigned src_stride,
                                  unsigned width, unsigned height,
diff --git a/src/gallium/auxiliary/util/u_format_yuv.h b/src/gallium/auxiliary/util/u_format_yuv.h
index 4ec39812e47..41524d63f3a 100644
--- a/src/gallium/auxiliary/util/u_format_yuv.h
+++ b/src/gallium/auxiliary/util/u_format_yuv.h
@@ -54,7 +54,7 @@
  * precision in the coefficients.
  */
 
-static INLINE void
+static inline void
 util_format_rgb_float_to_yuv(float r, float g, float b,
                              uint8_t *y, uint8_t *u, uint8_t *v)
 {
@@ -74,7 +74,7 @@ util_format_rgb_float_to_yuv(float r, float g, float b,
 }
 
 
-static INLINE void
+static inline void
 util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v,
                              float *r, float *g, float *b)
 {
@@ -92,7 +92,7 @@ util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v,
 }
 
 
-static INLINE void
+static inline void
 util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b,
                 	      uint8_t *y, uint8_t *u, uint8_t *v)
 {
@@ -102,7 +102,7 @@ util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b,
 }
 
 
-static INLINE void
+static inline void
 util_format_yuv_to_rgb_8unorm(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t *r, uint8_t *g, uint8_t *b)
 {
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index f1ed32f1d5c..69f2f2971f7 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -35,28 +35,28 @@
  * z32_unorm conversion functions
  */
 
-static INLINE uint16_t
+static inline uint16_t
 z32_unorm_to_z16_unorm(uint32_t z)
 {
    /* z * 0xffff / 0xffffffff */
    return z >> 16;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z16_unorm_to_z32_unorm(uint16_t z)
 {
    /* z * 0xffffffff / 0xffff */
    return (z << 16) | z;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_unorm_to_z24_unorm(uint32_t z)
 {
    /* z * 0xffffff / 0xffffffff */
    return z >> 8;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z24_unorm_to_z32_unorm(uint32_t z)
 {
    /* z * 0xffffffff / 0xffffff */
@@ -68,42 +68,42 @@ z24_unorm_to_z32_unorm(uint32_t z)
  * z32_float conversion functions
  */
 
-static INLINE uint16_t
+static inline uint16_t
 z32_float_to_z16_unorm(float z)
 {
    const float scale = 0xffff;
    return (uint16_t)(z * scale + 0.5f);
 }
 
-static INLINE float
+static inline float
 z16_unorm_to_z32_float(uint16_t z)
 {
    const float scale = 1.0 / 0xffff;
    return (float)(z * scale);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_float_to_z24_unorm(float z)
 {
    const double scale = 0xffffff;
    return (uint32_t)(z * scale) & 0xffffff;
 }
 
-static INLINE float
+static inline float
 z24_unorm_to_z32_float(uint32_t z)
 {
    const double scale = 1.0 / 0xffffff;
    return (float)(z * scale);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_float_to_z32_unorm(float z)
 {
    const double scale = 0xffffffff;
    return (uint32_t)(z * scale);
 }
 
-static INLINE float
+static inline float
 z32_unorm_to_z32_float(uint32_t z)
 {
    const double scale = 1.0 / 0xffffffff;
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
index d340b9a7aef..d28fae3c77d 100644
--- a/src/gallium/auxiliary/util/u_half.h
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -43,7 +43,7 @@ extern "C" {
  *  https://gist.github.com/2144712
  */
 
-static INLINE uint16_t
+static inline uint16_t
 util_float_to_half(float f)
 {
    uint32_t sign_mask  = 0x80000000;
@@ -96,7 +96,7 @@ util_float_to_half(float f)
    return f16;
 }
 
-static INLINE float
+static inline float
 util_half_to_float(uint16_t f16)
 {
    union fi infnan;
diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c
index 85302f1e194..42c4e44b644 100644
--- a/src/gallium/auxiliary/util/u_handle_table.c
+++ b/src/gallium/auxiliary/util/u_handle_table.c
@@ -96,7 +96,7 @@ handle_table_set_destroy(struct handle_table *ht,
 /**
  * Resize the table if necessary 
  */
-static INLINE int
+static inline int
 handle_table_resize(struct handle_table *ht,
                     unsigned minimum_size)
 {
@@ -126,7 +126,7 @@ handle_table_resize(struct handle_table *ht,
 }
 
 
-static INLINE void
+static inline void
 handle_table_clear(struct handle_table *ht, 
                    unsigned index)
 {
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index 06c8b5c91a5..a505fbc4d83 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -68,7 +68,7 @@ struct util_hash_table_item
 };
 
 
-static INLINE struct util_hash_table_item *
+static inline struct util_hash_table_item *
 util_hash_table_item(struct cso_hash_iter iter)
 {
    return (struct util_hash_table_item *)cso_hash_iter_data(iter);
@@ -98,7 +98,7 @@ util_hash_table_create(unsigned (*hash)(void *key),
 }
 
 
-static INLINE struct cso_hash_iter
+static inline struct cso_hash_iter
 util_hash_table_find_iter(struct util_hash_table *ht,
                           void *key,
                           unsigned key_hash)
@@ -118,7 +118,7 @@ util_hash_table_find_iter(struct util_hash_table *ht,
 }
 
 
-static INLINE struct util_hash_table_item *
+static inline struct util_hash_table_item *
 util_hash_table_find_item(struct util_hash_table *ht,
                           void *key,
                           unsigned key_hash)
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 661a949a4b1..bb99a02ce49 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -51,13 +51,13 @@ extern "C" {
  */
 
 
-static INLINE void
+static inline void
 pipe_reference_init(struct pipe_reference *reference, unsigned count)
 {
    p_atomic_set(&reference->count, count);
 }
 
-static INLINE boolean
+static inline boolean
 pipe_is_referenced(struct pipe_reference *reference)
 {
    return p_atomic_read(&reference->count) != 0;
@@ -69,7 +69,7 @@ pipe_is_referenced(struct pipe_reference *reference)
  * Both 'ptr' and 'reference' may be NULL.
  * \return TRUE if the object's refcount hits zero and should be destroyed.
  */
-static INLINE boolean
+static inline boolean
 pipe_reference_described(struct pipe_reference *ptr, 
                          struct pipe_reference *reference, 
                          debug_reference_descriptor get_desc)
@@ -96,14 +96,14 @@ pipe_reference_described(struct pipe_reference *ptr,
    return destroy;
 }
 
-static INLINE boolean
+static inline boolean
 pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
 {
    return pipe_reference_described(ptr, reference, 
                                    (debug_reference_descriptor)debug_describe_reference);
 }
 
-static INLINE void
+static inline void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    struct pipe_surface *old_surf = *ptr;
@@ -120,7 +120,7 @@ pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
  * of using a deleted context's surface_destroy() method when freeing a surface
  * that's shared by multiple contexts.
  */
-static INLINE void
+static inline void
 pipe_surface_release(struct pipe_context *pipe, struct pipe_surface **ptr)
 {
    if (pipe_reference_described(&(*ptr)->reference, NULL,
@@ -130,7 +130,7 @@ pipe_surface_release(struct pipe_context *pipe, struct pipe_surface **ptr)
 }
 
 
-static INLINE void
+static inline void
 pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
 {
    struct pipe_resource *old_tex = *ptr;
@@ -141,7 +141,7 @@ pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
    *ptr = tex;
 }
 
-static INLINE void
+static inline void
 pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
 {
    struct pipe_sampler_view *old_view = *ptr;
@@ -158,7 +158,7 @@ pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_
  * work-around for fixing a dangling context pointer problem when textures
  * are shared by multiple contexts.  XXX fix this someday.
  */
-static INLINE void
+static inline void
 pipe_sampler_view_release(struct pipe_context *ctx,
                           struct pipe_sampler_view **ptr)
 {
@@ -173,7 +173,7 @@ pipe_sampler_view_release(struct pipe_context *ctx,
    *ptr = NULL;
 }
 
-static INLINE void
+static inline void
 pipe_image_view_reference(struct pipe_image_view **ptr, struct pipe_image_view *view)
 {
    struct pipe_image_view *old_view = *ptr;
@@ -184,7 +184,7 @@ pipe_image_view_reference(struct pipe_image_view **ptr, struct pipe_image_view *
    *ptr = view;
 }
 
-static INLINE void
+static inline void
 pipe_so_target_reference(struct pipe_stream_output_target **ptr,
                          struct pipe_stream_output_target *target)
 {
@@ -196,7 +196,7 @@ pipe_so_target_reference(struct pipe_stream_output_target **ptr,
    *ptr = target;
 }
 
-static INLINE void
+static inline void
 pipe_surface_reset(struct pipe_context *ctx, struct pipe_surface* ps,
                    struct pipe_resource *pt, unsigned level, unsigned layer)
 {
@@ -209,7 +209,7 @@ pipe_surface_reset(struct pipe_context *ctx, struct pipe_surface* ps,
    ps->context = ctx;
 }
 
-static INLINE void
+static inline void
 pipe_surface_init(struct pipe_context *ctx, struct pipe_surface* ps,
                   struct pipe_resource *pt, unsigned level, unsigned layer)
 {
@@ -219,7 +219,7 @@ pipe_surface_init(struct pipe_context *ctx, struct pipe_surface* ps,
 }
 
 /* Return true if the surfaces are equal. */
-static INLINE boolean
+static inline boolean
 pipe_surface_equal(struct pipe_surface *s1, struct pipe_surface *s2)
 {
    return s1->texture == s2->texture &&
@@ -243,7 +243,7 @@ pipe_surface_equal(struct pipe_surface *s1, struct pipe_surface *s2)
  * \param bind  bitmask of PIPE_BIND_x flags
  * \param usage  bitmask of PIPE_USAGE_x flags
  */
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 pipe_buffer_create( struct pipe_screen *screen,
 		    unsigned bind,
 		    unsigned usage,
@@ -271,7 +271,7 @@ pipe_buffer_create( struct pipe_screen *screen,
  * \param access  bitmask of PIPE_TRANSFER_x flags
  * \param transfer  returns a transfer object
  */
-static INLINE void *
+static inline void *
 pipe_buffer_map_range(struct pipe_context *pipe,
 		      struct pipe_resource *buffer,
 		      unsigned offset,
@@ -302,7 +302,7 @@ pipe_buffer_map_range(struct pipe_context *pipe,
  * \param access  bitmask of PIPE_TRANSFER_x flags
  * \param transfer  returns a transfer object
  */
-static INLINE void *
+static inline void *
 pipe_buffer_map(struct pipe_context *pipe,
                 struct pipe_resource *buffer,
                 unsigned access,
@@ -312,14 +312,14 @@ pipe_buffer_map(struct pipe_context *pipe,
 }
 
 
-static INLINE void
+static inline void
 pipe_buffer_unmap(struct pipe_context *pipe,
                   struct pipe_transfer *transfer)
 {
    pipe->transfer_unmap(pipe, transfer);
 }
 
-static INLINE void
+static inline void
 pipe_buffer_flush_mapped_range(struct pipe_context *pipe,
                                struct pipe_transfer *transfer,
                                unsigned offset,
@@ -343,7 +343,7 @@ pipe_buffer_flush_mapped_range(struct pipe_context *pipe,
    pipe->transfer_flush_region(pipe, transfer, &box);
 }
 
-static INLINE void
+static inline void
 pipe_buffer_write(struct pipe_context *pipe,
                   struct pipe_resource *buf,
                   unsigned offset,
@@ -377,7 +377,7 @@ pipe_buffer_write(struct pipe_context *pipe,
  * We can avoid GPU/CPU synchronization when writing range that has never
  * been written before.
  */
-static INLINE void
+static inline void
 pipe_buffer_write_nooverlap(struct pipe_context *pipe,
                             struct pipe_resource *buf,
                             unsigned offset, unsigned size,
@@ -403,7 +403,7 @@ pipe_buffer_write_nooverlap(struct pipe_context *pipe,
  * \param bind  bitmask of PIPE_BIND_x flags
  * \param usage  bitmask of PIPE_USAGE_x flags
  */
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 pipe_buffer_create_with_data(struct pipe_context *pipe,
                              unsigned bind,
                              unsigned usage,
@@ -416,7 +416,7 @@ pipe_buffer_create_with_data(struct pipe_context *pipe,
    return res;
 }
 
-static INLINE void
+static inline void
 pipe_buffer_read(struct pipe_context *pipe,
                  struct pipe_resource *buf,
                  unsigned offset,
@@ -443,7 +443,7 @@ pipe_buffer_read(struct pipe_context *pipe,
  * Map a resource for reading/writing.
  * \param access  bitmask of PIPE_TRANSFER_x flags
  */
-static INLINE void *
+static inline void *
 pipe_transfer_map(struct pipe_context *context,
                   struct pipe_resource *resource,
                   unsigned level, unsigned layer,
@@ -466,7 +466,7 @@ pipe_transfer_map(struct pipe_context *context,
  * Map a 3D (texture) resource for reading/writing.
  * \param access  bitmask of PIPE_TRANSFER_x flags
  */
-static INLINE void *
+static inline void *
 pipe_transfer_map_3d(struct pipe_context *context,
                      struct pipe_resource *resource,
                      unsigned level,
@@ -484,14 +484,14 @@ pipe_transfer_map_3d(struct pipe_context *context,
                                 &box, transfer);
 }
 
-static INLINE void
+static inline void
 pipe_transfer_unmap( struct pipe_context *context,
                      struct pipe_transfer *transfer )
 {
    context->transfer_unmap( context, transfer );
 }
 
-static INLINE void
+static inline void
 pipe_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_resource *buf)
 {
@@ -512,7 +512,7 @@ pipe_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
  * Get the polygon offset enable/disable flag for the given polygon fill mode.
  * \param fill_mode  one of PIPE_POLYGON_MODE_POINT/LINE/FILL
  */
-static INLINE boolean
+static inline boolean
 util_get_offset(const struct pipe_rasterizer_state *templ,
                 unsigned fill_mode)
 {
@@ -529,7 +529,7 @@ util_get_offset(const struct pipe_rasterizer_state *templ,
    }
 }
 
-static INLINE float
+static inline float
 util_get_min_point_size(const struct pipe_rasterizer_state *state)
 {
    /* The point size should be clamped to this value at the rasterizer stage.
@@ -539,7 +539,7 @@ util_get_min_point_size(const struct pipe_rasterizer_state *state)
           !state->multisample ? 1.0f : 0.0f;
 }
 
-static INLINE void
+static inline void
 util_query_clear_result(union pipe_query_result *result, unsigned type)
 {
    switch (type) {
@@ -570,7 +570,7 @@ util_query_clear_result(union pipe_query_result *result, unsigned type)
 }
 
 /** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */
-static INLINE unsigned
+static inline unsigned
 util_pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target,
                           unsigned nr_samples)
 {
@@ -615,7 +615,7 @@ util_pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target,
 }
 
 
-static INLINE void
+static inline void
 util_copy_constant_buffer(struct pipe_constant_buffer *dst,
                           const struct pipe_constant_buffer *src)
 {
@@ -633,7 +633,7 @@ util_copy_constant_buffer(struct pipe_constant_buffer *dst,
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 util_max_layer(const struct pipe_resource *r, unsigned level)
 {
    switch (r->target) {
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
index ae14eda3cec..daa2991ced6 100644
--- a/src/gallium/auxiliary/util/u_keymap.c
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -71,7 +71,7 @@ default_delete_func(const struct keymap *map,
 }
 
 
-static INLINE struct keymap_item *
+static inline struct keymap_item *
 hash_table_item(struct cso_hash_iter iter)
 {
    return (struct keymap_item *) cso_hash_iter_data(iter);
@@ -143,7 +143,7 @@ util_delete_keymap(struct keymap *map, void *user)
 }
 
 
-static INLINE struct cso_hash_iter
+static inline struct cso_hash_iter
 hash_table_find_iter(const struct keymap *map, const void *key,
                      unsigned key_hash)
 {
@@ -162,7 +162,7 @@ hash_table_find_iter(const struct keymap *map, const void *key,
 }
 
 
-static INLINE struct keymap_item *
+static inline struct keymap_item *
 hash_table_find_item(const struct keymap *map, const void *key,
                      unsigned key_hash)
 {
diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h
index 81ffc9fb27d..87e52a344d4 100644
--- a/src/gallium/auxiliary/util/u_linear.h
+++ b/src/gallium/auxiliary/util/u_linear.h
@@ -89,7 +89,7 @@ void pipe_linear_fill_info(struct pipe_tile_info *t,
 			   unsigned tile_width, unsigned tile_height,
 			   unsigned tiles_x, unsigned tiles_y);
 
-static INLINE boolean pipe_linear_check_tile(const struct pipe_tile_info *t)
+static inline boolean pipe_linear_check_tile(const struct pipe_tile_info *t)
 {
    if (t->tile.size != t->block.size * t->cols * t->rows)
       return FALSE;
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 3b4040f0ee2..f5c409dbdf2 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -92,7 +92,7 @@ union di {
 /**
  * Extract the IEEE float32 exponent.
  */
-static INLINE signed
+static inline signed
 util_get_float32_exponent(float x)
 {
    union fi f;
@@ -112,7 +112,7 @@ util_get_float32_exponent(float x)
  * Compute exp2(ipart) with i << ipart
  * Compute exp2(fpart) with lookup table.
  */
-static INLINE float
+static inline float
 util_fast_exp2(float x)
 {
    int32_t ipart;
@@ -143,7 +143,7 @@ util_fast_exp2(float x)
 /**
  * Fast approximation to exp(x).
  */
-static INLINE float
+static inline float
 util_fast_exp(float x)
 {
    const float k = 1.44269f; /* = log2(e) */
@@ -160,7 +160,7 @@ extern float log2_table[LOG2_TABLE_SIZE];
 /**
  * Fast approximation to log2(x).
  */
-static INLINE float
+static inline float
 util_fast_log2(float x)
 {
    union fi num;
@@ -176,7 +176,7 @@ util_fast_log2(float x)
 /**
  * Fast approximation to x^y.
  */
-static INLINE float
+static inline float
 util_fast_pow(float x, float y)
 {
    return util_fast_exp2(util_fast_log2(x) * y);
@@ -184,7 +184,7 @@ util_fast_pow(float x, float y)
 
 /* Note that this counts zero as a power of two.
  */
-static INLINE boolean
+static inline boolean
 util_is_power_of_two( unsigned v )
 {
    return (v & (v-1)) == 0;
@@ -194,7 +194,7 @@ util_is_power_of_two( unsigned v )
 /**
  * Floor(x), returned as int.
  */
-static INLINE int
+static inline int
 util_ifloor(float f)
 {
    int ai, bi;
@@ -211,7 +211,7 @@ util_ifloor(float f)
 /**
  * Round float to nearest int.
  */
-static INLINE int
+static inline int
 util_iround(float f)
 {
 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) 
@@ -237,7 +237,7 @@ util_iround(float f)
 /**
  * Approximate floating point comparison
  */
-static INLINE boolean
+static inline boolean
 util_is_approx(float a, float b, float tol)
 {
    return fabs(b - a) <= tol;
@@ -256,7 +256,7 @@ util_is_approx(float a, float b, float tol)
 /**
  * Single-float
  */
-static INLINE boolean
+static inline boolean
 util_is_inf_or_nan(float x)
 {
    union fi tmp;
@@ -265,7 +265,7 @@ util_is_inf_or_nan(float x)
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_nan(float x)
 {
    union fi tmp;
@@ -274,7 +274,7 @@ util_is_nan(float x)
 }
 
 
-static INLINE int
+static inline int
 util_inf_sign(float x)
 {
    union fi tmp;
@@ -290,7 +290,7 @@ util_inf_sign(float x)
 /**
  * Double-float
  */
-static INLINE boolean
+static inline boolean
 util_is_double_inf_or_nan(double x)
 {
    union di tmp;
@@ -299,7 +299,7 @@ util_is_double_inf_or_nan(double x)
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_double_nan(double x)
 {
    union di tmp;
@@ -308,7 +308,7 @@ util_is_double_nan(double x)
 }
 
 
-static INLINE int
+static inline int
 util_double_inf_sign(double x)
 {
    union di tmp;
@@ -324,21 +324,21 @@ util_double_inf_sign(double x)
 /**
  * Half-float
  */
-static INLINE boolean
+static inline boolean
 util_is_half_inf_or_nan(int16_t x)
 {
    return (x & 0x7c00) == 0x7c00;
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_half_nan(int16_t x)
 {
    return (x & 0x7fff) > 0x7c00;
 }
 
 
-static INLINE int
+static inline int
 util_half_inf_sign(int16_t x)
 {
    if ((x & 0x7fff) != 0x7c00) {
@@ -359,7 +359,7 @@ util_half_inf_sign(int16_t x)
 #if defined(_MSC_VER) && (_M_IX86 || _M_AMD64 || _M_IA64)
 unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
 #pragma intrinsic(_BitScanForward)
-static INLINE
+static inline
 unsigned long ffs( unsigned long u )
 {
    unsigned long i;
@@ -369,7 +369,7 @@ unsigned long ffs( unsigned long u )
       return 0;
 }
 #elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
-static INLINE
+static inline
 unsigned ffs( unsigned u )
 {
    unsigned i;
@@ -409,7 +409,7 @@ unsigned ffs( unsigned u )
  * Find last bit set in a word.  The least significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit(unsigned u)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -428,7 +428,7 @@ util_last_bit(unsigned u)
  * Find last bit set in a word.  The least significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit64(uint64_t u)
 {
 #if defined(HAVE___BUILTIN_CLZLL)
@@ -448,7 +448,7 @@ util_last_bit64(uint64_t u)
  * significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit_signed(int i)
 {
    if (i >= 0)
@@ -465,7 +465,7 @@ util_last_bit_signed(int i)
  * }
  *
  */
-static INLINE int
+static inline int
 u_bit_scan(unsigned *mask)
 {
    int i = ffs(*mask) - 1;
@@ -474,7 +474,7 @@ u_bit_scan(unsigned *mask)
 }
 
 #ifndef _MSC_VER
-static INLINE int
+static inline int
 u_bit_scan64(uint64_t *mask)
 {
    int i = ffsll(*mask) - 1;
@@ -486,7 +486,7 @@ u_bit_scan64(uint64_t *mask)
 /**
  * Return float bits.
  */
-static INLINE unsigned
+static inline unsigned
 fui( float f )
 {
    union fi fi;
@@ -494,7 +494,7 @@ fui( float f )
    return fi.ui;
 }
 
-static INLINE float
+static inline float
 uif(uint32_t ui)
 {
    union fi fi;
@@ -507,7 +507,7 @@ uif(uint32_t ui)
  * Convert ubyte to float in [0, 1].
  * XXX a 256-entry lookup table would be slightly faster.
  */
-static INLINE float
+static inline float
 ubyte_to_float(ubyte ub)
 {
    return (float) ub * (1.0f / 255.0f);
@@ -517,7 +517,7 @@ ubyte_to_float(ubyte ub)
 /**
  * Convert float in [0,1] to ubyte in [0,255] with clamping.
  */
-static INLINE ubyte
+static inline ubyte
 float_to_ubyte(float f)
 {
    union fi tmp;
@@ -535,13 +535,13 @@ float_to_ubyte(float f)
    }
 }
 
-static INLINE float
+static inline float
 byte_to_float_tex(int8_t b)
 {
    return (b == -128) ? -1.0F : b * 1.0F / 127.0F;
 }
 
-static INLINE int8_t
+static inline int8_t
 float_to_byte_tex(float f)
 {
    return (int8_t) (127.0F * f);
@@ -550,7 +550,7 @@ float_to_byte_tex(float f)
 /**
  * Calc log base 2
  */
-static INLINE unsigned
+static inline unsigned
 util_logbase2(unsigned n)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -570,7 +570,7 @@ util_logbase2(unsigned n)
 /**
  * Returns the smallest power of two >= x
  */
-static INLINE unsigned
+static inline unsigned
 util_next_power_of_two(unsigned x)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -602,7 +602,7 @@ util_next_power_of_two(unsigned x)
 /**
  * Return number of bits set in n.
  */
-static INLINE unsigned
+static inline unsigned
 util_bitcount(unsigned n)
 {
 #if defined(HAVE___BUILTIN_POPCOUNT)
@@ -623,7 +623,7 @@ util_bitcount(unsigned n)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 util_bitcount64(uint64_t n)
 {
 #ifdef HAVE___BUILTIN_POPCOUNTLL
@@ -639,7 +639,7 @@ util_bitcount64(uint64_t n)
  * Algorithm taken from:
  * http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
  */
-static INLINE unsigned
+static inline unsigned
 util_bitreverse(unsigned n)
 {
     n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1);
@@ -671,7 +671,7 @@ util_bitreverse(unsigned n)
 /**
  * Reverse byte order of a 32 bit word.
  */
-static INLINE uint32_t
+static inline uint32_t
 util_bswap32(uint32_t n)
 {
 #if defined(HAVE___BUILTIN_BSWAP32)
@@ -687,7 +687,7 @@ util_bswap32(uint32_t n)
 /**
  * Reverse byte order of a 64bit word.
  */
-static INLINE uint64_t
+static inline uint64_t
 util_bswap64(uint64_t n)
 {
 #if defined(HAVE___BUILTIN_BSWAP64)
@@ -702,14 +702,14 @@ util_bswap64(uint64_t n)
 /**
  * Reverse byte order of a 16 bit word.
  */
-static INLINE uint16_t
+static inline uint16_t
 util_bswap16(uint16_t n)
 {
    return (n >> 8) |
           (n << 8);
 }
 
-static INLINE void*
+static inline void*
 util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
 {
 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -746,7 +746,7 @@ util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t
 /**
  * Align a value, only works pot alignemnts.
  */
-static INLINE int
+static inline int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
@@ -755,7 +755,7 @@ align(int value, int alignment)
 /**
  * Works like align but on npot alignments.
  */
-static INLINE size_t
+static inline size_t
 util_align_npot(size_t value, size_t alignment)
 {
    if (value % alignment)
@@ -763,7 +763,7 @@ util_align_npot(size_t value, size_t alignment)
    return value;
 }
 
-static INLINE unsigned
+static inline unsigned
 u_minify(unsigned value, unsigned levels)
 {
     return MAX2(1, value >> levels);
@@ -796,13 +796,13 @@ do {                                     \
 #endif
 
 
-static INLINE uint32_t
+static inline uint32_t
 util_unsigned_fixed(float value, unsigned frac_bits)
 {
    return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
 }
 
-static INLINE int32_t
+static inline int32_t
 util_signed_fixed(float value, unsigned frac_bits)
 {
    return (int32_t)(value * (1<<frac_bits));
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 9ff6c7da919..7fe0fe6f053 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -67,7 +67,7 @@ extern "C" {
 /**
  * Duplicate a block of memory.
  */
-static INLINE void *
+static inline void *
 mem_dup(const void *src, uint size)
 {
    void *dup = MALLOC(size);
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 82f83702d1e..2069b56f464 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -224,7 +224,7 @@ u_mmFindBlock(struct mem_block *heap, int start)
 }
 
 
-static INLINE int
+static inline int
 Join2Blocks(struct mem_block *p)
 {
    /* XXX there should be some assertions here */
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index e0c9018f8ef..b882502b7ba 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -60,7 +60,7 @@ union util_color {
 /**
  * Pack ubyte R,G,B,A into dest pixel.
  */
-static INLINE void
+static inline void
 util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
                    enum pipe_format format, union util_color *uc)
 {
@@ -161,7 +161,7 @@ util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
 /**
  * Unpack RGBA from a packed pixel, returning values as ubytes in [0,255].
  */
-static INLINE void
+static inline void
 util_unpack_color_ub(enum pipe_format format, union util_color *uc,
                      ubyte *r, ubyte *g, ubyte *b, ubyte *a)
 {
@@ -333,7 +333,7 @@ util_unpack_color_ub(enum pipe_format format, union util_color *uc,
  * This will not work (and might not really be useful with float input)
  * for pure integer formats (which lack the pack_rgba_float function).
  */
-static INLINE void
+static inline void
 util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
 {
    ubyte r = 0;
@@ -437,7 +437,7 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
 /* Integer versions of util_pack_z and util_pack_z_stencil - useful for
  * constructing clear masks.
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_mask_z(enum pipe_format format, uint32_t z)
 {
    switch (format) {
@@ -462,7 +462,7 @@ util_pack_mask_z(enum pipe_format format, uint32_t z)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_mask_z(enum pipe_format format, uint32_t z)
 {
    switch (format) {
@@ -474,7 +474,7 @@ util_pack64_mask_z(enum pipe_format format, uint32_t z)
 }
 
 
-static INLINE uint32_t
+static inline uint32_t
 util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 {
    uint32_t packed = util_pack_mask_z(format, z);
@@ -497,7 +497,7 @@ util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 {
    uint64_t packed;
@@ -516,7 +516,7 @@ util_pack64_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 /**
  * Note: it's assumed that z is in [0,1]
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_z(enum pipe_format format, double z)
 {
    union fi fui;
@@ -558,7 +558,7 @@ util_pack_z(enum pipe_format format, double z)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_z(enum pipe_format format, double z)
 {
    union fi fui;
@@ -580,7 +580,7 @@ util_pack64_z(enum pipe_format format, double z)
  * Pack Z and/or stencil values into a 32-bit value described by format.
  * Note: it's assumed that z is in [0,1] and s in [0,255]
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
 {
    uint32_t packed = util_pack_z(format, z);
@@ -603,7 +603,7 @@ util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_z_stencil(enum pipe_format format, double z, uint8_t s)
 {
    uint64_t packed;
@@ -624,7 +624,7 @@ util_pack64_z_stencil(enum pipe_format format, double z, uint8_t s)
 /**
  * Pack 4 ubytes into a 4-byte word
  */
-static INLINE unsigned
+static inline unsigned
 pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
 {
    return ((((unsigned int)b0) << 0) |
@@ -637,7 +637,7 @@ pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
 /**
  * Pack/convert 4 floats into one 4-byte word.
  */
-static INLINE unsigned
+static inline unsigned
 pack_ui32_float4(float a, float b, float c, float d)
 {
    return pack_ub4( float_to_ubyte(a),
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
index 30c23b79831..4f7a27ca61d 100644
--- a/src/gallium/auxiliary/util/u_pointer.h
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -34,7 +34,7 @@
 extern "C" {
 #endif
 
-static INLINE intptr_t
+static inline intptr_t
 pointer_to_intptr( const void *p )
 {
    union {
@@ -45,7 +45,7 @@ pointer_to_intptr( const void *p )
    return pi.i;
 }
 
-static INLINE void *
+static inline void *
 intptr_to_pointer( intptr_t i )
 {
    union {
@@ -56,7 +56,7 @@ intptr_to_pointer( intptr_t i )
    return pi.p;
 }
 
-static INLINE uintptr_t
+static inline uintptr_t
 pointer_to_uintptr( const void *ptr )
 {
    union {
@@ -67,7 +67,7 @@ pointer_to_uintptr( const void *ptr )
    return pu.u;
 }
 
-static INLINE void *
+static inline void *
 uintptr_to_pointer( uintptr_t u )
 {
    union {
@@ -81,7 +81,7 @@ uintptr_to_pointer( uintptr_t u )
 /**
  * Return a pointer aligned to next multiple of N bytes.
  */
-static INLINE void *
+static inline void *
 align_pointer( const void *unaligned, uintptr_t alignment )
 {
    uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1);
@@ -92,7 +92,7 @@ align_pointer( const void *unaligned, uintptr_t alignment )
 /**
  * Return a pointer aligned to next multiple of 16 bytes.
  */
-static INLINE void *
+static inline void *
 align16( void *unaligned )
 {
    return align_pointer( unaligned, 16 );
@@ -100,7 +100,7 @@ align16( void *unaligned )
 
 typedef void (*func_pointer)(void);
 
-static INLINE func_pointer
+static inline func_pointer
 pointer_to_func( void *p )
 {
    union {
@@ -111,7 +111,7 @@ pointer_to_func( void *p )
    return pf.f;
 }
 
-static INLINE void *
+static inline void *
 func_to_pointer( func_pointer f )
 {
    union {
diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h
index b2dd44df230..366801545ed 100644
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -46,7 +46,7 @@ struct u_prim_vertex_count {
  * Decompose a primitive that is a loop, a strip, or a fan.  Return the
  * original primitive if it is already decomposed.
  */
-static INLINE unsigned
+static inline unsigned
 u_decomposed_prim(unsigned prim)
 {
    switch (prim) {
@@ -71,7 +71,7 @@ u_decomposed_prim(unsigned prim)
  * Reduce a primitive to one of PIPE_PRIM_POINTS, PIPE_PRIM_LINES, and
  * PIPE_PRIM_TRIANGLES.
  */
-static INLINE unsigned
+static inline unsigned
 u_reduced_prim(unsigned prim)
 {
    switch (prim) {
@@ -91,7 +91,7 @@ u_reduced_prim(unsigned prim)
 /**
  * Re-assemble a primitive to remove its adjacency.
  */
-static INLINE unsigned
+static inline unsigned
 u_assembled_prim(unsigned prim)
 {
    switch (prim) {
@@ -113,7 +113,7 @@ u_assembled_prim(unsigned prim)
  * source file, it will increase the size of the binary slightly more than
  * expected because of the use of a table.
  */
-static INLINE const struct u_prim_vertex_count *
+static inline const struct u_prim_vertex_count *
 u_prim_vertex_count(unsigned prim)
 {
    static const struct u_prim_vertex_count prim_table[PIPE_PRIM_MAX] = {
@@ -140,7 +140,7 @@ u_prim_vertex_count(unsigned prim)
  * Given a vertex count, return the number of primitives.
  * For polygons, return the number of triangles.
  */
-static INLINE unsigned
+static inline unsigned
 u_prims_for_vertices(unsigned prim, unsigned num)
 {
    const struct u_prim_vertex_count *info = u_prim_vertex_count(prim);
@@ -151,7 +151,7 @@ u_prims_for_vertices(unsigned prim, unsigned num)
    return 1 + ((num - info->min) / info->incr);
 }
 
-static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
+static inline boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
 {
    const struct u_prim_vertex_count *count = u_prim_vertex_count(pipe_prim);
 
@@ -159,7 +159,7 @@ static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
 }
 
 
-static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
+static inline boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
 {
    const struct u_prim_vertex_count *count = u_prim_vertex_count(pipe_prim);
 
@@ -174,7 +174,7 @@ static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 u_vertices_per_prim(int primitive)
 {
    switch(primitive) {
@@ -216,7 +216,7 @@ u_vertices_per_prim(int primitive)
  * statistics depend on knowing the exact number of decomposed
  * primitives for a set of vertices.
  */
-static INLINE unsigned
+static inline unsigned
 u_decomposed_prims_for_vertices(int primitive, int vertices)
 {
    switch (primitive) {
@@ -263,7 +263,7 @@ u_decomposed_prims_for_vertices(int primitive, int vertices)
  * count.  Each quad is treated as two triangles.  Polygons are treated as
  * triangle fans.
  */
-static INLINE unsigned
+static inline unsigned
 u_reduced_prims_for_vertices(int primitive, int vertices)
 {
    switch (primitive) {
diff --git a/src/gallium/auxiliary/util/u_range.h b/src/gallium/auxiliary/util/u_range.h
index efe25ef5e42..a1da5e5a6f0 100644
--- a/src/gallium/auxiliary/util/u_range.h
+++ b/src/gallium/auxiliary/util/u_range.h
@@ -47,7 +47,7 @@ struct util_range {
 };
 
 
-static INLINE void
+static inline void
 util_range_set_empty(struct util_range *range)
 {
    range->start = ~0;
@@ -55,7 +55,7 @@ util_range_set_empty(struct util_range *range)
 }
 
 /* This is like a union of two sets. */
-static INLINE void
+static inline void
 util_range_add(struct util_range *range, unsigned start, unsigned end)
 {
    if (start < range->start || end > range->end) {
@@ -66,7 +66,7 @@ util_range_add(struct util_range *range, unsigned start, unsigned end)
    }
 }
 
-static INLINE boolean
+static inline boolean
 util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
 {
    return MAX2(start, range->start) < MIN2(end, range->end);
@@ -75,14 +75,14 @@ util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
 
 /* Init/deinit */
 
-static INLINE void
+static inline void
 util_range_init(struct util_range *range)
 {
    pipe_mutex_init(range->write_mutex);
    util_range_set_empty(range);
 }
 
-static INLINE void
+static inline void
 util_range_destroy(struct util_range *range)
 {
    pipe_mutex_destroy(range->write_mutex);
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index cf29dff0d02..b26f671f313 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -43,7 +43,7 @@ struct u_rect {
 
 /* Do two rectangles intersect?
  */
-static INLINE boolean
+static inline boolean
 u_rect_test_intersection(const struct u_rect *a,
                          const struct u_rect *b)
 {
@@ -55,7 +55,7 @@ u_rect_test_intersection(const struct u_rect *a,
 
 /* Find the intersection of two rectangles known to intersect.
  */
-static INLINE void
+static inline void
 u_rect_find_intersection(const struct u_rect *a,
                          struct u_rect *b)
 {
@@ -68,13 +68,13 @@ u_rect_find_intersection(const struct u_rect *a,
 }
 
 
-static INLINE int
+static inline int
 u_rect_area(const struct u_rect *r)
 {
    return (r->x1 - r->x0) * (r->y1 - r->y0);
 }
 
-static INLINE void
+static inline void
 u_rect_possible_intersection(const struct u_rect *a,
                              struct u_rect *b)
 {
@@ -88,7 +88,7 @@ u_rect_possible_intersection(const struct u_rect *a,
 
 /* Set @d to a rectangle that covers both @a and @b.
  */
-static INLINE void
+static inline void
 u_rect_union(struct u_rect *d, const struct u_rect *a, const struct u_rect *b)
 {
    d->x0 = MIN2(a->x0, b->x0);
diff --git a/src/gallium/auxiliary/util/u_resource.h b/src/gallium/auxiliary/util/u_resource.h
index a5e091fd66e..6736476f4da 100644
--- a/src/gallium/auxiliary/util/u_resource.h
+++ b/src/gallium/auxiliary/util/u_resource.h
@@ -36,7 +36,7 @@ util_resource_size(const struct pipe_resource *res);
  *
  * Note that this function returns true for single-layered array textures.
  */
-static INLINE boolean
+static inline boolean
 util_resource_is_array_texture(const struct pipe_resource *res)
 {
    switch (res->target) {
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.c b/src/gallium/auxiliary/util/u_ringbuffer.c
index 648b105b137..5816b781660 100644
--- a/src/gallium/auxiliary/util/u_ringbuffer.c
+++ b/src/gallium/auxiliary/util/u_ringbuffer.c
@@ -56,7 +56,7 @@ void util_ringbuffer_destroy( struct util_ringbuffer *ring )
 /**
  * Return number of free entries in the ring
  */
-static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
+static inline unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
 {
    return (ring->tail - (ring->head + 1)) & ring->mask;
 }
@@ -64,7 +64,7 @@ static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring
 /**
  * Is the ring buffer empty?
  */
-static INLINE boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
+static inline boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
 {
    return util_ringbuffer_space(ring) == ring->mask;
 }
diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h
index 7f80fc12700..5afb7d9a920 100644
--- a/src/gallium/auxiliary/util/u_split_prim.h
+++ b/src/gallium/auxiliary/util/u_split_prim.h
@@ -23,7 +23,7 @@ struct util_split_prim {
    uint edgeflag_off:1;
 };
 
-static INLINE void
+static inline void
 util_split_prim_init(struct util_split_prim *s,
                   unsigned mode, unsigned start, unsigned count)
 {
@@ -41,7 +41,7 @@ util_split_prim_init(struct util_split_prim *s,
    s->repeat_first = 0;
 }
 
-static INLINE boolean
+static inline boolean
 util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
 {
    int repeat = 0;
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index d4f51912a2d..7f8e5a1a3cf 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -51,7 +51,7 @@ union m128i {
    uint ui[4];
 };
 
-static INLINE void u_print_epi8(const char *name, __m128i r)
+static inline void u_print_epi8(const char *name, __m128i r)
 {
    union { __m128i m; ubyte ub[16]; } u;
    u.m = r;
@@ -80,7 +80,7 @@ static INLINE void u_print_epi8(const char *name, __m128i r)
                 u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
 }
 
-static INLINE void u_print_epi16(const char *name, __m128i r)
+static inline void u_print_epi16(const char *name, __m128i r)
 {
    union { __m128i m; ushort us[8]; } u;
    u.m = r;
@@ -99,7 +99,7 @@ static INLINE void u_print_epi16(const char *name, __m128i r)
                 u.us[4],  u.us[5],  u.us[6],  u.us[7]);
 }
 
-static INLINE void u_print_epi32(const char *name, __m128i r)
+static inline void u_print_epi32(const char *name, __m128i r)
 {
    union { __m128i m; uint ui[4]; } u;
    u.m = r;
@@ -113,7 +113,7 @@ static INLINE void u_print_epi32(const char *name, __m128i r)
                 u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
 }
 
-static INLINE void u_print_ps(const char *name, __m128 r)
+static inline void u_print_ps(const char *name, __m128 r)
 {
    union { __m128 m; float f[4]; } u;
    u.m = r;
@@ -179,7 +179,7 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
  * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
  * dependency at this point.
  */
-static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 {
    __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
    __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
@@ -204,7 +204,7 @@ static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 }
 
 
-static INLINE void
+static inline void
 transpose4_epi32(const __m128i * restrict a,
                  const __m128i * restrict b,
                  const __m128i * restrict c,
diff --git a/src/gallium/auxiliary/util/u_string.h b/src/gallium/auxiliary/util/u_string.h
index dc89c4400bc..da3e2675d8e 100644
--- a/src/gallium/auxiliary/util/u_string.h
+++ b/src/gallium/auxiliary/util/u_string.h
@@ -54,7 +54,7 @@ extern "C" {
 
 #else
 
-static INLINE char *
+static inline char *
 util_strchrnul(const char *s, char c)
 {
    for (; *s && *s != c; ++s);
@@ -69,13 +69,13 @@ util_strchrnul(const char *s, char c)
 int util_vsnprintf(char *, size_t, const char *, va_list);
 int util_snprintf(char *str, size_t size, const char *format, ...);
 
-static INLINE void
+static inline void
 util_vsprintf(char *str, const char *format, va_list ap)
 {
    util_vsnprintf(str, (size_t)-1, format, ap);
 }
 
-static INLINE void
+static inline void
 util_sprintf(char *str, const char *format, ...)
 {
    va_list ap;
@@ -84,7 +84,7 @@ util_sprintf(char *str, const char *format, ...)
    va_end(ap);
 }
 
-static INLINE char *
+static inline char *
 util_strchr(const char *s, char c)
 {
    char *p = util_strchrnul(s, c);
@@ -92,7 +92,7 @@ util_strchr(const char *s, char c)
    return *p ? p : NULL;
 }
 
-static INLINE char*
+static inline char*
 util_strncat(char *dst, const char *src, size_t n)
 {
    char *p = dst + strlen(dst);
@@ -106,7 +106,7 @@ util_strncat(char *dst, const char *src, size_t n)
    return dst;
 }
 
-static INLINE int
+static inline int
 util_strcmp(const char *s1, const char *s2)
 {
    unsigned char u1, u2;
@@ -122,7 +122,7 @@ util_strcmp(const char *s1, const char *s2)
    return 0;
 }
 
-static INLINE int
+static inline int
 util_strncmp(const char *s1, const char *s2, size_t n)
 {
    unsigned char u1, u2;
@@ -138,7 +138,7 @@ util_strncmp(const char *s1, const char *s2, size_t n)
    return 0;
 }
 
-static INLINE char *
+static inline char *
 util_strstr(const char *haystack, const char *needle)
 {
    const char *p = haystack;
@@ -152,7 +152,7 @@ util_strstr(const char *haystack, const char *needle)
    return NULL;
 }
 
-static INLINE void *
+static inline void *
 util_memmove(void *dest, const void *src, size_t n)
 {
    char *p = (char *)dest;
@@ -199,7 +199,7 @@ struct util_strbuf
 };
 
 
-static INLINE void
+static inline void
 util_strbuf_init(struct util_strbuf *sbuf, char *str, size_t size) 
 {
    sbuf->str = str;
@@ -209,7 +209,7 @@ util_strbuf_init(struct util_strbuf *sbuf, char *str, size_t size)
 }
 
 
-static INLINE void
+static inline void
 util_strbuf_printf(struct util_strbuf *sbuf, const char *format, ...)
 {
    if(sbuf->left > 1) {
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
index 1605215cb88..b84694c540b 100644
--- a/src/gallium/auxiliary/util/u_surfaces.h
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -50,7 +50,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size,
                      struct pipe_surface **res);
 
 /* fast inline path for the very common case */
-static INLINE boolean
+static inline boolean
 util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size,
                   struct pipe_context *ctx, struct pipe_resource *pt,
                   unsigned level, unsigned layer,
@@ -70,7 +70,7 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size,
    return util_surfaces_do_get(us, surface_struct_size, ctx, pt, level, layer, res);
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned level, unsigned layer)
 {
    if(!us->u.pv)
@@ -84,7 +84,7 @@ util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned
 
 void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
 
-static INLINE void
+static inline void
 util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
    if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT))
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
index a33d7f7722b..dc1f568a8e5 100644
--- a/src/gallium/auxiliary/util/u_tile.h
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -42,7 +42,7 @@ struct pipe_transfer;
  *
  * \return TRUE if tile is totally clipped, FALSE otherwise
  */
-static INLINE boolean
+static inline boolean
 u_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_box *box)
 {
    if ((int) x >= box->width)
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index 2bee1e00014..a5017d6bce2 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -60,7 +60,7 @@ struct util_time
    
 
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_get(struct util_time *t)
 {
    t->counter = os_time_get();
@@ -71,7 +71,7 @@ util_time_get(struct util_time *t)
  * Return t2 = t1 + usecs
  */
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_add(const struct util_time *t1,
               int64_t usecs,
               struct util_time *t2)
@@ -84,7 +84,7 @@ util_time_add(const struct util_time *t1,
  * Return difference between times, in microseconds
  */
 PIPE_DEPRECATED
-static INLINE int64_t
+static inline int64_t
 util_time_diff(const struct util_time *t1, 
                const struct util_time *t2)
 {
@@ -98,7 +98,7 @@ util_time_diff(const struct util_time *t1,
  * Not publicly available because it does not take in account wrap-arounds.
  * Use util_time_timeout instead.
  */
-static INLINE int
+static inline int
 _util_time_compare(const struct util_time *t1,
                    const struct util_time *t2)
 {
@@ -115,7 +115,7 @@ _util_time_compare(const struct util_time *t1,
  * Returns non-zero when the timeout expires.
  */
 PIPE_DEPRECATED
-static INLINE boolean
+static inline boolean
 util_time_timeout(const struct util_time *start, 
                   const struct util_time *end,
                   const struct util_time *curr)
@@ -128,7 +128,7 @@ util_time_timeout(const struct util_time *start,
  * Return current time in microseconds
  */
 PIPE_DEPRECATED
-static INLINE int64_t
+static inline int64_t
 util_time_micros(void)
 {
    return os_time_get();
@@ -136,7 +136,7 @@ util_time_micros(void)
 
 
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_sleep(int64_t usecs)
 {
    os_time_sleep(usecs);
diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c
index 71da35d6d39..4cb524d5cb1 100644
--- a/src/gallium/auxiliary/util/u_transfer.c
+++ b/src/gallium/auxiliary/util/u_transfer.c
@@ -90,7 +90,7 @@ void u_default_transfer_unmap( struct pipe_context *pipe,
 }
 
 
-static INLINE struct u_resource *
+static inline struct u_resource *
 u_resource( struct pipe_resource *res )
 {
    return (struct u_resource *)res;
diff --git a/src/gallium/auxiliary/util/u_video.h b/src/gallium/auxiliary/util/u_video.h
index b4743d13fbf..99d0f6e775a 100644
--- a/src/gallium/auxiliary/util/u_video.h
+++ b/src/gallium/auxiliary/util/u_video.h
@@ -40,7 +40,7 @@ extern "C" {
 #include "util/u_debug.h"
 #include "util/u_math.h"
 
-static INLINE enum pipe_video_format
+static inline enum pipe_video_format
 u_reduce_video_profile(enum pipe_video_profile profile)
 {
    switch (profile)
@@ -73,7 +73,7 @@ u_reduce_video_profile(enum pipe_video_profile profile)
    }
 }
 
-static INLINE void
+static inline void
 u_copy_nv12_to_yv12(void *const *destination_data,
                     uint32_t const *destination_pitches,
                     int src_plane, int src_field,
@@ -99,7 +99,7 @@ u_copy_nv12_to_yv12(void *const *destination_data,
    }
 }
 
-static INLINE void
+static inline void
 u_copy_yv12_to_nv12(void *const *destination_data,
                     uint32_t const *destination_pitches,
                     int src_plane, int src_field,
@@ -122,7 +122,7 @@ u_copy_yv12_to_nv12(void *const *destination_data,
    }
 }
 
-static INLINE void
+static inline void
 u_copy_swap422_packed(void *const *destination_data,
                        uint32_t const *destination_pitches,
                        int src_plane, int src_field,
@@ -147,7 +147,7 @@ u_copy_swap422_packed(void *const *destination_data,
    }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 u_get_h264_level(uint32_t width, uint32_t height, uint32_t *max_reference)
 {
    uint32_t max_dpb_mbs;
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index 69839e61386..afe53063b48 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -538,7 +538,7 @@ cleanup_buffers(struct vl_compositor *c)
    pipe_resource_reference(&c->vertex_buf.buffer, NULL);
 }
 
-static INLINE struct u_rect
+static inline struct u_rect
 default_rect(struct vl_compositor_layer *layer)
 {
    struct pipe_resource *res = layer->sampler_views[0]->texture;
@@ -546,21 +546,21 @@ default_rect(struct vl_compositor_layer *layer)
    return rect;
 }
 
-static INLINE struct vertex2f
+static inline struct vertex2f
 calc_topleft(struct vertex2f size, struct u_rect rect)
 {
    struct vertex2f res = { rect.x0 / size.x, rect.y0 / size.y };
    return res;
 }
 
-static INLINE struct vertex2f
+static inline struct vertex2f
 calc_bottomright(struct vertex2f size, struct u_rect rect)
 {
    struct vertex2f res = { rect.x1 / size.x, rect.y1 / size.y };
    return res;
 }
 
-static INLINE void
+static inline void
 calc_src_and_dst(struct vl_compositor_layer *layer, unsigned width, unsigned height,
                  struct u_rect src, struct u_rect dst)
 {
@@ -658,7 +658,7 @@ gen_rect_verts(struct vertex2f *vb, struct vl_compositor_layer *layer)
    vb[19].y = layer->colors[3].w;
 }
 
-static INLINE struct u_rect
+static inline struct u_rect
 calc_drawn_area(struct vl_compositor_state *s, struct vl_compositor_layer *layer)
 {
    struct vertex2f tl, br;
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
index abb3780f61e..539a9913e02 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -533,7 +533,7 @@ static struct dct_coeff tbl_B14_DC[1 << 17];
 static struct dct_coeff tbl_B14_AC[1 << 17];
 static struct dct_coeff tbl_B15[1 << 17];
 
-static INLINE void
+static inline void
 init_dct_coeff_table(struct dct_coeff *dst, const struct dct_coeff_compressed *src,
                      unsigned size, bool is_DC)
 {
@@ -594,7 +594,7 @@ init_dct_coeff_table(struct dct_coeff *dst, const struct dct_coeff_compressed *s
    }
 }
 
-static INLINE void
+static inline void
 init_tables()
 {
    vl_vlc_init_table(tbl_B1, Elements(tbl_B1), macroblock_address_increment, Elements(macroblock_address_increment));
@@ -611,19 +611,19 @@ init_tables()
    init_dct_coeff_table(tbl_B15, dct_coeff_tbl_one, Elements(dct_coeff_tbl_one), false);
 }
 
-static INLINE int
+static inline int
 DIV2DOWN(int todiv)
 {
    return (todiv&~1)/2;
 }
 
-static INLINE int
+static inline int
 DIV2UP(int todiv)
 {
    return (todiv+1)/2;
 }
 
-static INLINE void
+static inline void
 motion_vector(struct vl_mpg12_bs *bs, int r, int s, int dmv, short delta[2], short dmvector[2])
 {
    int t;
@@ -647,7 +647,7 @@ motion_vector(struct vl_mpg12_bs *bs, int r, int s, int dmv, short delta[2], sho
    }
 }
 
-static INLINE int
+static inline int
 wrap(short f, int shift)
 {
    if (f < (-16 << shift))
@@ -658,7 +658,7 @@ wrap(short f, int shift)
       return f;
 }
 
-static INLINE void
+static inline void
 motion_vector_frame(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock *mb)
 {
    int dmv = mb->macroblock_modes.bits.frame_motion_type == PIPE_MPEG12_MO_TYPE_DUAL_PRIME;
@@ -682,7 +682,7 @@ motion_vector_frame(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock
    }
 }
 
-static INLINE void
+static inline void
 motion_vector_field(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock *mb)
 {
    int dmv = mb->macroblock_modes.bits.field_motion_type == PIPE_MPEG12_MO_TYPE_DUAL_PRIME;
@@ -701,12 +701,12 @@ motion_vector_field(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock
    }
 }
 
-static INLINE void
+static inline void
 reset_predictor(struct vl_mpg12_bs *bs) {
    bs->pred_dc[0] = bs->pred_dc[1] = bs->pred_dc[2] = 0;
 }
 
-static INLINE void
+static inline void
 decode_dct(struct vl_mpg12_bs *bs, struct pipe_mpeg12_macroblock *mb, int scale)
 {
    static const unsigned blk2cc[] = { 0, 0, 0, 0, 1, 2 };
@@ -805,7 +805,7 @@ entry:
       vl_vlc_eatbits(&bs->vlc, 1);
 }
 
-static INLINE void
+static inline void
 decode_slice(struct vl_mpg12_bs *bs, struct pipe_video_buffer *target)
 {
    struct pipe_mpeg12_macroblock mb;
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index a3ad6e6623a..b7009837293 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -317,7 +317,7 @@ cleanup_mc_buffer(struct vl_mpeg12_buffer *buf)
       vl_mc_cleanup_buffer(&buf->mc[i]);
 }
 
-static INLINE void
+static inline void
 MacroBlockTypeToPipeWeights(const struct pipe_mpeg12_macroblock *mb, unsigned weights[2])
 {
    assert(mb);
@@ -352,7 +352,7 @@ MacroBlockTypeToPipeWeights(const struct pipe_mpeg12_macroblock *mb, unsigned we
    }
 }
 
-static INLINE struct vl_motionvector
+static inline struct vl_motionvector
 MotionVectorToPipe(const struct pipe_mpeg12_macroblock *mb, unsigned vector,
                    unsigned field_select_mask, unsigned weight)
 {
@@ -403,7 +403,7 @@ MotionVectorToPipe(const struct pipe_mpeg12_macroblock *mb, unsigned vector,
    return mv;
 }
 
-static INLINE void
+static inline void
 UploadYcbcrBlocks(struct vl_mpeg12_decoder *dec,
                   struct vl_mpeg12_buffer *buf,
                   const struct pipe_mpeg12_macroblock *mb)
diff --git a/src/gallium/auxiliary/vl/vl_rbsp.h b/src/gallium/auxiliary/vl/vl_rbsp.h
index 2e3da8e1d28..7867238c49e 100644
--- a/src/gallium/auxiliary/vl/vl_rbsp.h
+++ b/src/gallium/auxiliary/vl/vl_rbsp.h
@@ -48,7 +48,7 @@ struct vl_rbsp {
 /**
  * Initialize the RBSP object
  */
-static INLINE void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsigned num_bits)
+static inline void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsigned num_bits)
 {
    unsigned bits_left = vl_vlc_bits_left(nal);
 
@@ -71,7 +71,7 @@ static INLINE void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsign
 /**
  * Make at least 16 more bits available
  */
-static INLINE void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
+static inline void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
 {
    unsigned valid = vl_vlc_valid_bits(&rbsp->nal);
    unsigned i, bits;
@@ -108,7 +108,7 @@ static INLINE void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
 /**
  * Return an unsigned integer from the first n bits
  */
-static INLINE unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
+static inline unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
 {
    if (n == 0)
       return 0;
@@ -120,7 +120,7 @@ static INLINE unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
 /**
  * Return an unsigned exponential Golomb encoded integer
  */
-static INLINE unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
+static inline unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
 {
    unsigned bits = 0;
 
@@ -134,7 +134,7 @@ static INLINE unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
 /**
  * Return an signed exponential Golomb encoded integer
  */
-static INLINE signed vl_rbsp_se(struct vl_rbsp *rbsp)
+static inline signed vl_rbsp_se(struct vl_rbsp *rbsp)
 {
    signed codeNum = vl_rbsp_ue(rbsp);
    if (codeNum & 1)
@@ -146,7 +146,7 @@ static INLINE signed vl_rbsp_se(struct vl_rbsp *rbsp)
 /**
  * Are more data available in the RBSP ?
  */
-static INLINE bool vl_rbsp_more_data(struct vl_rbsp *rbsp)
+static inline bool vl_rbsp_more_data(struct vl_rbsp *rbsp)
 {
    unsigned bits, value;
 
diff --git a/src/gallium/auxiliary/vl/vl_vlc.h b/src/gallium/auxiliary/vl/vl_vlc.h
index 2f905956dbf..7821b8be0a1 100644
--- a/src/gallium/auxiliary/vl/vl_vlc.h
+++ b/src/gallium/auxiliary/vl/vl_vlc.h
@@ -65,7 +65,7 @@ struct vl_vlc_compressed
 /**
  * initalize and decompress a lookup table
  */
-static INLINE void
+static inline void
 vl_vlc_init_table(struct vl_vlc_entry *dst, unsigned dst_size, const struct vl_vlc_compressed *src, unsigned src_size)
 {
    unsigned i, bits = util_logbase2(dst_size);
@@ -87,7 +87,7 @@ vl_vlc_init_table(struct vl_vlc_entry *dst, unsigned dst_size, const struct vl_v
 /**
  * switch over to next input buffer
  */
-static INLINE void
+static inline void
 vl_vlc_next_input(struct vl_vlc *vlc)
 {
    unsigned len = vlc->sizes[0];
@@ -112,7 +112,7 @@ vl_vlc_next_input(struct vl_vlc *vlc)
 /**
  * align the data pointer to the next dword
  */
-static INLINE void
+static inline void
 vl_vlc_align_data_ptr(struct vl_vlc *vlc)
 {
    /* align the data pointer */
@@ -126,7 +126,7 @@ vl_vlc_align_data_ptr(struct vl_vlc *vlc)
 /**
  * fill the bit buffer, so that at least 32 bits are valid
  */
-static INLINE void
+static inline void
 vl_vlc_fillbits(struct vl_vlc *vlc)
 {
    assert(vlc);
@@ -175,7 +175,7 @@ vl_vlc_fillbits(struct vl_vlc *vlc)
 /**
  * initialize vlc structure and start reading from first input buffer
  */
-static INLINE void
+static inline void
 vl_vlc_init(struct vl_vlc *vlc, unsigned num_inputs,
             const void *const *inputs, const unsigned *sizes)
 {
@@ -203,7 +203,7 @@ vl_vlc_init(struct vl_vlc *vlc, unsigned num_inputs,
 /**
  * number of bits still valid in bit buffer
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_valid_bits(struct vl_vlc *vlc)
 {
    return 32 - vlc->invalid_bits;
@@ -212,7 +212,7 @@ vl_vlc_valid_bits(struct vl_vlc *vlc)
 /**
  * number of bits left over all inbut buffers
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_bits_left(struct vl_vlc *vlc)
 {
    signed bytes_left = vlc->end - vlc->data;
@@ -223,7 +223,7 @@ vl_vlc_bits_left(struct vl_vlc *vlc)
 /**
  * get num_bits from bit buffer without removing them
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_peekbits(struct vl_vlc *vlc, unsigned num_bits)
 {
    assert(vl_vlc_valid_bits(vlc) >= num_bits || vlc->data >= vlc->end);
@@ -233,7 +233,7 @@ vl_vlc_peekbits(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * remove num_bits from bit buffer
  */
-static INLINE void
+static inline void
 vl_vlc_eatbits(struct vl_vlc *vlc, unsigned num_bits)
 {
    assert(vl_vlc_valid_bits(vlc) >= num_bits);
@@ -245,7 +245,7 @@ vl_vlc_eatbits(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * get num_bits from bit buffer with removing them
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_get_uimsbf(struct vl_vlc *vlc, unsigned num_bits)
 {
    unsigned value;
@@ -261,7 +261,7 @@ vl_vlc_get_uimsbf(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * treat num_bits as signed value and remove them from bit buffer
  */
-static INLINE signed
+static inline signed
 vl_vlc_get_simsbf(struct vl_vlc *vlc, unsigned num_bits)
 {
    signed value;
@@ -277,7 +277,7 @@ vl_vlc_get_simsbf(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * lookup a value and length in a decompressed table
  */
-static INLINE int8_t
+static inline int8_t
 vl_vlc_get_vlclbf(struct vl_vlc *vlc, const struct vl_vlc_entry *tbl, unsigned num_bits)
 {
    tbl += vl_vlc_peekbits(vlc, num_bits);
@@ -288,7 +288,7 @@ vl_vlc_get_vlclbf(struct vl_vlc *vlc, const struct vl_vlc_entry *tbl, unsigned n
 /**
  * fast forward search for a specific byte value
  */
-static INLINE boolean
+static inline boolean
 vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
 {
    /* make sure we are on a byte boundary */
@@ -345,7 +345,7 @@ vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
 /**
  * remove num_bits bits starting at pos from the bitbuffer
  */
-static INLINE void
+static inline void
 vl_vlc_removebits(struct vl_vlc *vlc, unsigned pos, unsigned num_bits)
 {
    uint64_t lo = (vlc->buffer & (~0UL >> (pos + num_bits))) << num_bits;
@@ -357,7 +357,7 @@ vl_vlc_removebits(struct vl_vlc *vlc, unsigned pos, unsigned num_bits)
 /**
  * limit the number of bits left for fetching
  */
-static INLINE void
+static inline void
 vl_vlc_limit(struct vl_vlc *vlc, unsigned bits_left)
 {
    assert(bits_left <= vl_vlc_bits_left(vlc));
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
index 7cafcd3747e..3c8d8f7c09f 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
@@ -39,7 +39,7 @@ struct fd2_blend_stateobj {
 	uint32_t rb_colormask;
 };
 
-static INLINE struct fd2_blend_stateobj *
+static inline struct fd2_blend_stateobj *
 fd2_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd2_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.h b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
index de845f07a85..74147107930 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
@@ -40,7 +40,7 @@ struct fd2_context {
 	struct pipe_resource *solid_vertexbuf;
 };
 
-static INLINE struct fd2_context *
+static inline struct fd2_context *
 fd2_context(struct fd_context *ctx)
 {
 	return (struct fd2_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
index adc0653132b..9e53cd3be75 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
@@ -43,7 +43,7 @@ struct fd2_rasterizer_stateobj {
 	uint32_t pa_su_sc_mode_cntl;
 };
 
-static INLINE struct fd2_rasterizer_stateobj *
+static inline struct fd2_rasterizer_stateobj *
 fd2_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd2_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
index 4fffa08b3c3..5c9236851bd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
@@ -42,7 +42,7 @@ struct fd2_sampler_stateobj {
 	uint32_t tex0, tex3, tex4, tex5;
 };
 
-static INLINE struct fd2_sampler_stateobj *
+static inline struct fd2_sampler_stateobj *
 fd2_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd2_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd2_pipe_sampler_view {
 	uint32_t tex0, tex2, tex3;
 };
 
-static INLINE struct fd2_pipe_sampler_view *
+static inline struct fd2_pipe_sampler_view *
 fd2_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd2_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
index dda1e552174..15609ad0267 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
@@ -44,7 +44,7 @@ struct fd2_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd2_zsa_stateobj *
+static inline struct fd2_zsa_stateobj *
 fd2_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd2_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
index 4f6eeb74481..0267001b0b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
@@ -45,7 +45,7 @@ struct fd3_blend_stateobj {
 	} rb_mrt[4];
 };
 
-static INLINE struct fd3_blend_stateobj *
+static inline struct fd3_blend_stateobj *
 fd3_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd3_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 77e4605e550..6e20b2ff9bc 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -112,7 +112,7 @@ struct fd3_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd3_context *
+static inline struct fd3_context *
 fd3_context(struct fd_context *ctx)
 {
 	return (struct fd3_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 6afc3015901..678343aaa11 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -41,7 +41,7 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
 		unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-static INLINE bool
+static inline bool
 fd3_half_precision(const struct pipe_surface *surface)
 {
 	enum pipe_format format;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
index 7e9c1f51f59..765d9719524 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
@@ -44,7 +44,7 @@ struct fd3_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd3_rasterizer_stateobj *
+static inline struct fd3_rasterizer_stateobj *
 fd3_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd3_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
index c38fd847f27..d5afb03cd7a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -43,7 +43,7 @@ struct fd3_sampler_stateobj {
 	bool saturate_s, saturate_t, saturate_r;
 };
 
-static INLINE struct fd3_sampler_stateobj *
+static inline struct fd3_sampler_stateobj *
 fd3_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd3_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd3_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3;
 };
 
-static INLINE struct fd3_pipe_sampler_view *
+static inline struct fd3_pipe_sampler_view *
 fd3_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd3_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
index 352c3dd5432..d4dc5954da5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
@@ -45,7 +45,7 @@ struct fd3_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd3_zsa_stateobj *
+static inline struct fd3_zsa_stateobj *
 fd3_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd3_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
index 33641da5e2c..821b3c82832 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
@@ -42,7 +42,7 @@ struct fd4_blend_stateobj {
 	uint32_t rb_fs_output;
 };
 
-static INLINE struct fd4_blend_stateobj *
+static inline struct fd4_blend_stateobj *
 fd4_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd4_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 53e1bf6a2e6..0b749916841 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -90,7 +90,7 @@ struct fd4_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd4_context *
+static inline struct fd4_context *
 fd4_context(struct fd_context *ctx)
 {
 	return (struct fd4_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
index 06c728f2f1f..64e81a9983b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
@@ -44,7 +44,7 @@ struct fd4_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd4_rasterizer_stateobj *
+static inline struct fd4_rasterizer_stateobj *
 fd4_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd4_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
index 579ed87f14b..84ee7ecb50c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
@@ -42,7 +42,7 @@ struct fd4_sampler_stateobj {
 	uint32_t texsamp0, texsamp1;
 };
 
-static INLINE struct fd4_sampler_stateobj *
+static inline struct fd4_sampler_stateobj *
 fd4_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd4_sampler_stateobj *)samp;
@@ -53,7 +53,7 @@ struct fd4_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3, textconst4;
 };
 
-static INLINE struct fd4_pipe_sampler_view *
+static inline struct fd4_pipe_sampler_view *
 fd4_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd4_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
index 033317cf620..6a92a9b6785 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
@@ -47,7 +47,7 @@ struct fd4_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd4_zsa_stateobj *
+static inline struct fd4_zsa_stateobj *
 fd4_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd4_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index e420f1e5bd9..c2d98345349 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -356,13 +356,13 @@ struct fd_context {
 			const union pipe_color_union *color, double depth, unsigned stencil);
 };
 
-static INLINE struct fd_context *
+static inline struct fd_context *
 fd_context(struct pipe_context *pctx)
 {
 	return (struct fd_context *)pctx;
 }
 
-static INLINE struct pipe_scissor_state *
+static inline struct pipe_scissor_state *
 fd_context_get_scissor(struct fd_context *ctx)
 {
 	if (ctx->rasterizer && ctx->rasterizer->scissor)
@@ -370,13 +370,13 @@ fd_context_get_scissor(struct fd_context *ctx)
 	return &ctx->disabled_scissor;
 }
 
-static INLINE bool
+static inline bool
 fd_supported_prim(struct fd_context *ctx, unsigned prim)
 {
 	return (1 << prim) & ctx->primtype_mask;
 }
 
-static INLINE void
+static inline void
 fd_reset_wfi(struct fd_context *ctx)
 {
 	ctx->needs_wfi = true;
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index 0634923fcb2..9833b30e17f 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -78,7 +78,7 @@ struct fd_resource {
 	struct list_head list;
 };
 
-static INLINE struct fd_resource *
+static inline struct fd_resource *
 fd_resource(struct pipe_resource *ptex)
 {
 	return (struct fd_resource *)ptex;
@@ -89,13 +89,13 @@ struct fd_transfer {
 	void *staging;
 };
 
-static INLINE struct fd_transfer *
+static inline struct fd_transfer *
 fd_transfer(struct pipe_transfer *ptrans)
 {
 	return (struct fd_transfer *)ptrans;
 }
 
-static INLINE struct fd_resource_slice *
+static inline struct fd_resource_slice *
 fd_resource_slice(struct fd_resource *rsc, unsigned level)
 {
 	assert(level <= rsc->base.b.last_level);
@@ -103,7 +103,7 @@ fd_resource_slice(struct fd_resource *rsc, unsigned level)
 }
 
 /* get offset for specified mipmap level and texture/array layer */
-static INLINE uint32_t
+static inline uint32_t
 fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer)
 {
 	struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index dbc2808262a..4b32d2d7d97 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -56,7 +56,7 @@ struct fd_screen {
 	int64_t cpu_gpu_time_delta;
 };
 
-static INLINE struct fd_screen *
+static inline struct fd_screen *
 fd_screen(struct pipe_screen *pscreen)
 {
 	return (struct fd_screen *)pscreen;
diff --git a/src/gallium/drivers/freedreno/freedreno_surface.h b/src/gallium/drivers/freedreno/freedreno_surface.h
index 3293f33dd84..2de37cee2dd 100644
--- a/src/gallium/drivers/freedreno/freedreno_surface.h
+++ b/src/gallium/drivers/freedreno/freedreno_surface.h
@@ -40,7 +40,7 @@ struct fd_surface {
 	uint16_t depth;
 };
 
-static INLINE struct fd_surface *
+static inline struct fd_surface *
 fd_surface(struct pipe_surface *psurf)
 {
 	return (struct fd_surface *)psurf;
diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h
index dcf63543219..6466fa594f9 100644
--- a/src/gallium/drivers/i915/i915_batchbuffer.h
+++ b/src/gallium/drivers/i915/i915_batchbuffer.h
@@ -33,20 +33,20 @@
 
 struct i915_context;
 
-static INLINE size_t
+static inline size_t
 i915_winsys_batchbuffer_space(struct i915_winsys_batchbuffer *batch)
 {
    return batch->size - (batch->ptr - batch->map);
 }
 
-static INLINE boolean
+static inline boolean
 i915_winsys_batchbuffer_check(struct i915_winsys_batchbuffer *batch,
                               size_t dwords)
 {
    return dwords * 4 <= i915_winsys_batchbuffer_space(batch);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch,
                                         unsigned dword)
 {
@@ -54,7 +54,7 @@ i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch,
    batch->ptr += 4;
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch,
                               float f)
 {
@@ -64,7 +64,7 @@ i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch,
    i915_winsys_batchbuffer_dword_unchecked(batch, uif.ui);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch,
                               unsigned dword)
 {
@@ -72,7 +72,7 @@ i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch,
    i915_winsys_batchbuffer_dword_unchecked(batch, dword);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
                               void *data,
                               size_t size)
@@ -83,7 +83,7 @@ i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
    batch->ptr += size;
 }
 
-static INLINE boolean
+static inline boolean
 i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch,
                              struct i915_winsys_buffer **buffers,
                              int num_of_buffers)
@@ -91,7 +91,7 @@ i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch,
    return batch->iws->validate_buffers(batch, buffers, num_of_buffers);
 }
 
-static INLINE int
+static inline int
 i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch,
                               struct i915_winsys_buffer *buffer,
                               enum i915_winsys_buffer_usage usage,
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 40abf3c577f..c8c7d64f5cb 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -339,7 +339,7 @@ struct i915_context {
 #define I915_DST_VARS                   4
 #define I915_DST_RECT                   8
 
-static INLINE
+static inline
 void i915_set_flush_dirty(struct i915_context *i915, unsigned flush)
 {
    i915->hardware_dirty |= I915_HW_FLUSH;
@@ -408,7 +408,7 @@ struct pipe_context *i915_create_context(struct pipe_screen *screen,
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
  */
-static INLINE struct i915_context *
+static inline struct i915_context *
 i915_context( struct pipe_context *pipe )
 {
    return (struct i915_context *)pipe;
diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h
index 079882c811f..0f12a592ae8 100644
--- a/src/gallium/drivers/i915/i915_debug.h
+++ b/src/gallium/drivers/i915/i915_debug.h
@@ -48,13 +48,13 @@ struct i915_winsys_batchbuffer;
 extern unsigned i915_debug;
 
 #ifdef DEBUG
-static INLINE boolean
+static inline boolean
 I915_DBG_ON(unsigned flags)
 {
    return i915_debug & flags;
 }
 
-static INLINE void
+static inline void
 I915_DBG(unsigned flags, const char *fmt, ...)
 {
    if (I915_DBG_ON(flags)) {
@@ -67,7 +67,7 @@ I915_DBG(unsigned flags, const char *fmt, ...)
 }
 #else
 #define I915_DBG_ON(flags) (0)
-static INLINE void I915_DBG(unsigned flags, const char *fmt, ...) {}
+static inline void I915_DBG(unsigned flags, const char *fmt, ...) {}
 #endif
 
 void i915_debug_init(struct i915_screen *i915);
diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index a4dbcb4d271..adc42542fea 100644
--- a/src/gallium/drivers/i915/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -136,7 +136,7 @@ struct i915_fp_compile {
 
 /* One neat thing about the UREG representation:  
  */
-static INLINE int
+static inline int
 swizzle(int reg, uint x, uint y, uint z, uint w)
 {
    assert(x <= SRC_ONE);
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 38a33888166..456be9d92ca 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -111,7 +111,7 @@ static const float cos_constants[4] = { 1.0,
 /**
  * component-wise negation of ureg
  */
-static INLINE int
+static inline int
 negate(int reg, int x, int y, int z, int w)
 {
    /* Another neat thing about the UREG representation */
diff --git a/src/gallium/drivers/i915/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c
index 248e21e02da..ea84efd1d17 100644
--- a/src/gallium/drivers/i915/i915_prim_emit.c
+++ b/src/gallium/drivers/i915/i915_prim_emit.c
@@ -53,7 +53,7 @@ struct setup_stage {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+static inline struct setup_stage *setup_stage( struct draw_stage *stage )
 {
    return (struct setup_stage *)stage;
 }
@@ -65,7 +65,7 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.
  */
-static INLINE void
+static inline void
 emit_hw_vertex( struct i915_context *i915,
                 const struct vertex_header *vertex)
 {
@@ -124,7 +124,7 @@ emit_hw_vertex( struct i915_context *i915,
 
 
 
-static INLINE void 
+static inline void 
 emit_prim( struct draw_stage *stage, 
 	   struct prim_header *prim,
 	   unsigned hwprim,
diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c
index d134dbb1620..8f61f151e0c 100644
--- a/src/gallium/drivers/i915/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -96,7 +96,7 @@ struct i915_vbuf_render {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct i915_vbuf_render *
+static inline struct i915_vbuf_render *
 i915_vbuf_render(struct vbuf_render *render)
 {
    assert(render);
diff --git a/src/gallium/drivers/i915/i915_resource.h b/src/gallium/drivers/i915/i915_resource.h
index ef99cfb5d3c..77fe8b70f79 100644
--- a/src/gallium/drivers/i915/i915_resource.h
+++ b/src/gallium/drivers/i915/i915_resource.h
@@ -94,14 +94,14 @@ void i915_init_resource_functions(struct i915_context *i915);
 extern struct u_resource_vtbl i915_buffer_vtbl;
 extern struct u_resource_vtbl i915_texture_vtbl;
 
-static INLINE struct i915_texture *i915_texture(struct pipe_resource *resource)
+static inline struct i915_texture *i915_texture(struct pipe_resource *resource)
 {
    struct i915_texture *tex = (struct i915_texture *)resource;
    assert(tex->b.vtbl == &i915_texture_vtbl);
    return tex;
 }
 
-static INLINE struct i915_buffer *i915_buffer(struct pipe_resource *resource)
+static inline struct i915_buffer *i915_buffer(struct pipe_resource *resource)
 {
    struct i915_buffer *tex = (struct i915_buffer *)resource;
    assert(tex->b.vtbl == &i915_buffer_vtbl);
diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c
index 8ef73d6f2c2..9a3279ccb75 100644
--- a/src/gallium/drivers/i915/i915_resource_texture.c
+++ b/src/gallium/drivers/i915/i915_resource_texture.c
@@ -89,25 +89,25 @@ static const int bottom_offsets[6] = {
    [PIPE_TEX_FACE_NEG_Z] = 16 + 5 * 8,
 };
 
-static INLINE unsigned
+static inline unsigned
 align_nblocksx(enum pipe_format format, unsigned width, unsigned align_to)
 {
    return align(util_format_get_nblocksx(format, width), align_to);
 }
 
-static INLINE unsigned
+static inline unsigned
 align_nblocksy(enum pipe_format format, unsigned width, unsigned align_to)
 {
    return align(util_format_get_nblocksy(format, width), align_to);
 }
 
-static INLINE unsigned
+static inline unsigned
 get_pot_stride(enum pipe_format format, unsigned width)
 {
    return util_next_power_of_two(util_format_get_stride(format, width));
 }
 
-static INLINE const char*
+static inline const char*
 get_tiling_string(enum i915_winsys_buffer_tile tile)
 {
    switch(tile) {
diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h
index 99d3ffd3af9..3be941a1561 100644
--- a/src/gallium/drivers/i915/i915_screen.h
+++ b/src/gallium/drivers/i915/i915_screen.h
@@ -59,7 +59,7 @@ struct i915_screen
  */
 
 
-static INLINE struct i915_screen *
+static inline struct i915_screen *
 i915_screen(struct pipe_screen *pscreen)
 {
    return (struct i915_screen *) pscreen;
diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
index 4050cd4ac44..1c29e8ae671 100644
--- a/src/gallium/drivers/i915/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
@@ -46,7 +46,7 @@
  * (active) state every time a 4kb boundary is crossed.
  */
 
-static INLINE void set_dynamic(struct i915_context *i915,
+static inline void set_dynamic(struct i915_context *i915,
                                unsigned offset,
                                const unsigned state)
 {
@@ -60,7 +60,7 @@ static INLINE void set_dynamic(struct i915_context *i915,
 
 
 
-static INLINE void set_dynamic_array(struct i915_context *i915,
+static inline void set_dynamic_array(struct i915_context *i915,
                                      unsigned offset,
                                      const unsigned *src,
                                      unsigned dwords)
diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
index d244a349fce..c4a6cae1beb 100644
--- a/src/gallium/drivers/i915/i915_state_immediate.c
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
@@ -39,7 +39,7 @@
 /* Convinience function to check immediate state.
  */
 
-static INLINE void set_immediate(struct i915_context *i915,
+static inline void set_immediate(struct i915_context *i915,
                                  unsigned offset,
                                  const unsigned state)
 {
diff --git a/src/gallium/drivers/i915/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h
index d4c5ab69555..015ea32933b 100644
--- a/src/gallium/drivers/i915/i915_state_inlines.h
+++ b/src/gallium/drivers/i915/i915_state_inlines.h
@@ -34,7 +34,7 @@
 #include "i915_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_compare_func(unsigned func)
 {
    switch (func) {
@@ -59,7 +59,7 @@ i915_translate_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_shadow_compare_func(unsigned func)
 {
    switch (func) {
@@ -84,7 +84,7 @@ i915_translate_shadow_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_stencil_op(unsigned op)
 {
    switch (op) {
@@ -109,7 +109,7 @@ i915_translate_stencil_op(unsigned op)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_blend_factor(unsigned factor)
 {
    switch (factor) {
@@ -148,7 +148,7 @@ i915_translate_blend_factor(unsigned factor)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_blend_func(unsigned mode)
 {
    switch (mode) {
@@ -168,7 +168,7 @@ i915_translate_blend_func(unsigned mode)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_logic_op(unsigned opcode)
 {
    switch (opcode) {
@@ -211,7 +211,7 @@ i915_translate_logic_op(unsigned opcode)
 
 
 
-static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr )
+static inline boolean i915_validate_vertices( unsigned hw_prim, unsigned nr )
 {
    boolean ok;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
index 1de43f77ee0..1feb415c9e5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
@@ -78,7 +78,7 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
 /**
  * Whether the blending factors are complementary of each other.
  */
-static INLINE boolean
+static inline boolean
 lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
 {
    return dst_factor == (src_factor ^ 0x10);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 0d47c0d517c..c273b25f096 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -169,7 +169,7 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen,
                             unsigned bind_flags);
 
 
-static INLINE struct llvmpipe_context *
+static inline struct llvmpipe_context *
 llvmpipe_context( struct pipe_context *pipe )
 {
    return (struct llvmpipe_context *)pipe;
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
index e0f7d8e1bc3..1038c5fe151 100644
--- a/src/gallium/drivers/llvmpipe/lp_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -71,7 +71,7 @@ extern int LP_DEBUG;
 
 void st_debug_init( void );
 
-static INLINE void
+static inline void
 LP_DBG( unsigned flag, const char *fmt, ... )
 {
     if (LP_DEBUG & flag)
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h
index 3c591187801..d7f0c153ec8 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -72,7 +72,7 @@ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
 void
 lp_fence_destroy(struct lp_fence *fence);
 
-static INLINE void
+static inline void
 lp_fence_reference(struct lp_fence **ptr,
                    struct lp_fence *f)
 {
@@ -85,7 +85,7 @@ lp_fence_reference(struct lp_fence **ptr,
    *ptr = f;
 }
 
-static INLINE boolean
+static inline boolean
 lp_fence_issued(const struct lp_fence *fence)
 {
    return fence->issued;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c209f47f0f5..c19f9318006 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -184,7 +184,7 @@ union lp_rast_cmd_arg {
 
 /* Cast wrappers.  Hopefully these compile to noops!
  */
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
 {
    union lp_rast_cmd_arg arg;
@@ -192,7 +192,7 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_triangle( const struct lp_rast_triangle *triangle,
                       unsigned plane_mask)
 {
@@ -208,7 +208,7 @@ lp_rast_arg_triangle( const struct lp_rast_triangle *triangle,
  * All planes are enabled, so instead of the plane mask we pass the upper
  * left coordinates of the a block that fully encloses the triangle.
  */
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle,
                                 unsigned x, unsigned y)
 {
@@ -218,7 +218,7 @@ lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle,
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_state( const struct lp_rast_state *state )
 {
    union lp_rast_cmd_arg arg;
@@ -226,7 +226,7 @@ lp_rast_arg_state( const struct lp_rast_state *state )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_fence( struct lp_fence *fence )
 {
    union lp_rast_cmd_arg arg;
@@ -235,7 +235,7 @@ lp_rast_arg_fence( struct lp_fence *fence )
 }
 
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_clearzs( uint64_t value, uint64_t mask )
 {
    union lp_rast_cmd_arg arg;
@@ -245,7 +245,7 @@ lp_rast_arg_clearzs( uint64_t value, uint64_t mask )
 }
 
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_query( struct llvmpipe_query *pq )
 {
    union lp_rast_cmd_arg arg;
@@ -253,7 +253,7 @@ lp_rast_arg_query( struct llvmpipe_query *pq )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_null( void )
 {
    union lp_rast_cmd_arg arg;
@@ -312,7 +312,7 @@ lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
-static INLINE __m128i
+static inline __m128i
 lp_plane_to_m128i(const struct lp_rast_plane *plane)
 {
    return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index e6ebbcd526d..9aa7e874657 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -145,7 +145,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
  * Get the pointer to a 4x4 color block (within a 64x64 tile).
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE uint8_t *
+static inline uint8_t *
 lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
                                 unsigned buf, unsigned x, unsigned y,
                                 unsigned layer)
@@ -186,7 +186,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
  * Get the pointer to a 4x4 depth block (within a 64x64 tile).
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE uint8_t *
+static inline uint8_t *
 lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task,
                                 unsigned x, unsigned y, unsigned layer)
 {
@@ -222,7 +222,7 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task,
  * triangle in/out tests.
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE void
+static inline void
 lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          const struct lp_rast_shader_inputs *inputs,
                          unsigned x, unsigned y )
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 41f6fbfa059..c9b9221d87c 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -63,7 +63,7 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-static INLINE unsigned
+static inline unsigned
 build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 {
    unsigned mask = 0;
@@ -94,7 +94,7 @@ build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 }
 
 
-static INLINE void
+static inline void
 build_masks(int64_t c,
             int64_t cdiff,
             int64_t dcdx,
@@ -167,7 +167,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #include "util/u_sse.h"
 
 
-static INLINE void
+static inline void
 build_masks_32(int c, 
                int cdiff,
                int dcdx,
@@ -213,7 +213,7 @@ build_masks_32(int c,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 build_mask_linear_32(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
@@ -239,7 +239,7 @@ build_mask_linear_32(int c, int dcdx, int dcdy)
    return _mm_movemask_epi8(result);
 }
 
-static INLINE unsigned
+static inline unsigned
 sign_bits4(const __m128i *cstep, int cdiff)
 {
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index a226ff0c485..b1464bb54c4 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -207,7 +207,7 @@ boolean lp_scene_is_resource_referenced(const struct lp_scene *scene,
  * Allocate space for a command/data in the bin's data buffer.
  * Grow the block list if needed.
  */
-static INLINE void *
+static inline void *
 lp_scene_alloc( struct lp_scene *scene, unsigned size)
 {
    struct data_block_list *list = &scene->data;
@@ -240,7 +240,7 @@ lp_scene_alloc( struct lp_scene *scene, unsigned size)
 /**
  * As above, but with specific alignment.
  */
-static INLINE void *
+static inline void *
 lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
 			unsigned alignment )
 {
@@ -272,7 +272,7 @@ lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
 
 /* Put back data if we decide not to use it, eg. culled triangles.
  */
-static INLINE void
+static inline void
 lp_scene_putback_data( struct lp_scene *scene, unsigned size)
 {
    struct data_block_list *list = &scene->data;
@@ -282,7 +282,7 @@ lp_scene_putback_data( struct lp_scene *scene, unsigned size)
 
 
 /** Return pointer to a particular tile's bin. */
-static INLINE struct cmd_bin *
+static inline struct cmd_bin *
 lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
 {
    return &scene->tile[x][y];
@@ -296,7 +296,7 @@ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);
 
 /* Add a command to bin[x][y].
  */
-static INLINE boolean
+static inline boolean
 lp_scene_bin_command( struct lp_scene *scene,
                       unsigned x, unsigned y,
                       unsigned cmd,
@@ -328,7 +328,7 @@ lp_scene_bin_command( struct lp_scene *scene,
 }
 
 
-static INLINE boolean
+static inline boolean
 lp_scene_bin_cmd_with_state( struct lp_scene *scene,
                              unsigned x, unsigned y,
                              const struct lp_rast_state *state,
@@ -354,7 +354,7 @@ lp_scene_bin_cmd_with_state( struct lp_scene *scene,
 
 /* Add a command to all active bins.
  */
-static INLINE boolean
+static inline boolean
 lp_scene_bin_everywhere( struct lp_scene *scene,
 			 unsigned cmd,
 			 const union lp_rast_cmd_arg arg )
@@ -371,7 +371,7 @@ lp_scene_bin_everywhere( struct lp_scene *scene,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 lp_scene_get_num_bins( const struct lp_scene *scene )
 {
    return scene->tiles_x * scene->tiles_y;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h
index 8b8ea1afac9..00bf20c8c5f 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.h
+++ b/src/gallium/drivers/llvmpipe/lp_screen.h
@@ -62,7 +62,7 @@ struct llvmpipe_screen
 
 
 
-static INLINE struct llvmpipe_screen *
+static inline struct llvmpipe_screen *
 llvmpipe_screen( struct pipe_screen *pipe )
 {
    return (struct llvmpipe_screen *)pipe;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index c944ad26756..a42df2dc9e0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -159,7 +159,7 @@ void
 lp_setup_end_query(struct lp_setup_context *setup,
                    struct llvmpipe_query *pq);
 
-static INLINE unsigned
+static inline unsigned
 lp_clamp_viewport_idx(int idx)
 {
    return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 6c05b90e64a..a190254d9df 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -233,7 +233,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
 
 
 
-static INLINE int subpixel_snap( float a )
+static inline int subpixel_snap( float a )
 {
    return util_iround(FIXED_ONE * a);
 }
@@ -262,14 +262,14 @@ print_line(struct lp_setup_context *setup,
 }
 
 
-static INLINE boolean sign(float x){
+static inline boolean sign(float x){
    return x >= 0;  
 }  
 
 
 /* Used on positive floats only:
  */
-static INLINE float fracf(float f)
+static inline float fracf(float f)
 {
    return f - floorf(f);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index f065676a7fb..75544b52493 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -296,7 +296,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 }
 
 
-static INLINE int
+static inline int
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index a2f55ed3a1e..98a9d4bc28b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -48,13 +48,13 @@
 #include <emmintrin.h>
 #endif
 
-static INLINE int
+static inline int
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
 }
 
-static INLINE float
+static inline float
 fixed_to_float(int a)
 {
    return a * (1.0f / FIXED_ONE);
@@ -579,7 +579,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
  *
  * Undefined if no bit set exists, so code should check against 0 first.
  */
-static INLINE uint32_t 
+static inline uint32_t 
 floor_pot(uint32_t n)
 {
 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
@@ -841,7 +841,7 @@ static void retry_triangle_ccw( struct lp_setup_context *setup,
 /**
  * Calculate fixed position data for a triangle
  */
-static INLINE void
+static inline void
 calc_fixed_position( struct lp_setup_context *setup,
                      struct fixed_position* position,
                      const float (*v0)[4],
@@ -873,7 +873,7 @@ calc_fixed_position( struct lp_setup_context *setup,
  * Rotate a triangle, flipping its clockwise direction,
  * Swaps values for xy[0] and xy[1]
  */
-static INLINE void
+static inline void
 rotate_fixed_position_01( struct fixed_position* position )
 {
    int x, y;
@@ -898,7 +898,7 @@ rotate_fixed_position_01( struct fixed_position* position )
  * Rotate a triangle, flipping its clockwise direction,
  * Swaps values for xy[1] and xy[2]
  */
-static INLINE void
+static inline void
 rotate_fixed_position_12( struct fixed_position* position )
 {
    int x, y;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 89992007849..534c5f48a64 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -122,7 +122,7 @@ lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim)
 
 typedef const float (*const_float4_ptr)[4];
 
-static INLINE const_float4_ptr get_vert( const void *vertex_buffer,
+static inline const_float4_ptr get_vert( const void *vertex_buffer,
                                          int index,
                                          int stride )
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b5ce8683f1a..fd6c49aacd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -840,7 +840,7 @@ store_unswizzled_block(struct gallivm_state *gallivm,
  *
  * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
  */
-static INLINE boolean
+static inline boolean
 is_arithmetic_format(const struct util_format_description *format_desc)
 {
    boolean arith = false;
@@ -860,7 +860,7 @@ is_arithmetic_format(const struct util_format_description *format_desc)
  * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
  * SoA conversion.
  */
-static INLINE boolean
+static inline boolean
 format_expands_to_float_soa(const struct util_format_description *format_desc)
 {
    if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
@@ -876,7 +876,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc)
  *
  * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
  */
-static INLINE void
+static inline void
 lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                              struct lp_type* type)
 {
@@ -924,7 +924,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
  *
  * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
  */
-static INLINE void
+static inline void
 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                                struct lp_type* type)
 {
@@ -996,7 +996,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
  *
  * but we try to avoid division and multiplication through shifts.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 scale_bits(struct gallivm_state *gallivm,
            int src_bits,
            int dst_bits,
@@ -1108,7 +1108,7 @@ scale_bits(struct gallivm_state *gallivm,
 /**
  * If RT is a smallfloat (needing denorms) format
  */
-static INLINE int
+static inline int
 have_smallfloat_format(struct lp_type dst_type,
                        enum pipe_format format)
 {
@@ -2880,7 +2880,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
 /**
  * Return the blend factor equivalent to a destination alpha of one.
  */
-static INLINE unsigned
+static inline unsigned
 force_dst_alpha_one(unsigned factor, boolean clamped_zero)
 {
    switch(factor) {
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 4b6c8a7a6a5..e1b51c9c9a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -77,7 +77,7 @@ unsigned __int64 __rdtsc();
 
 #elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
 
-static INLINE uint64_t
+static inline uint64_t
 rdtsc(void)
 {
    uint32_t hi, lo;
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
index 9fbd3a21648..3d315bb9a73 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -106,21 +106,21 @@ struct llvmpipe_transfer
 
 
 /** cast wrappers */
-static INLINE struct llvmpipe_resource *
+static inline struct llvmpipe_resource *
 llvmpipe_resource(struct pipe_resource *pt)
 {
    return (struct llvmpipe_resource *) pt;
 }
 
 
-static INLINE const struct llvmpipe_resource *
+static inline const struct llvmpipe_resource *
 llvmpipe_resource_const(const struct pipe_resource *pt)
 {
    return (const struct llvmpipe_resource *) pt;
 }
 
 
-static INLINE struct llvmpipe_transfer *
+static inline struct llvmpipe_transfer *
 llvmpipe_transfer(struct pipe_transfer *pt)
 {
    return (struct llvmpipe_transfer *) pt;
@@ -131,7 +131,7 @@ void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen);
 void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe);
 
 
-static INLINE boolean
+static inline boolean
 llvmpipe_resource_is_texture(const struct pipe_resource *resource)
 {
    switch (resource->target) {
@@ -153,7 +153,7 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource)
 }
 
 
-static INLINE boolean
+static inline boolean
 llvmpipe_resource_is_1d(const struct pipe_resource *resource)
 {
    switch (resource->target) {
@@ -175,7 +175,7 @@ llvmpipe_resource_is_1d(const struct pipe_resource *resource)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 llvmpipe_layer_stride(struct pipe_resource *resource,
                       unsigned level)
 {
@@ -185,7 +185,7 @@ llvmpipe_layer_stride(struct pipe_resource *resource,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 llvmpipe_resource_stride(struct pipe_resource *resource,
                          unsigned level)
 {
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index ad05fdc3e76..67e181e803a 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -22,13 +22,13 @@ struct nouveau_transfer {
    uint32_t offset;
 };
 
-static INLINE struct nouveau_transfer *
+static inline struct nouveau_transfer *
 nouveau_transfer(struct pipe_transfer *transfer)
 {
    return (struct nouveau_transfer *)transfer;
 }
 
-static INLINE bool
+static inline bool
 nouveau_buffer_malloc(struct nv04_resource *buf)
 {
    if (!buf->data)
@@ -36,7 +36,7 @@ nouveau_buffer_malloc(struct nv04_resource *buf)
    return !!buf->data;
 }
 
-static INLINE bool
+static inline bool
 nouveau_buffer_allocate(struct nouveau_screen *screen,
                         struct nv04_resource *buf, unsigned domain)
 {
@@ -69,7 +69,7 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
    return true;
 }
 
-static INLINE void
+static inline void
 release_allocation(struct nouveau_mm_allocation **mm,
                    struct nouveau_fence *fence)
 {
@@ -77,7 +77,7 @@ release_allocation(struct nouveau_mm_allocation **mm,
    (*mm) = NULL;
 }
 
-INLINE void
+inline void
 nouveau_buffer_release_gpu_storage(struct nv04_resource *buf)
 {
    nouveau_bo_ref(NULL, &buf->bo);
@@ -93,7 +93,7 @@ nouveau_buffer_release_gpu_storage(struct nv04_resource *buf)
    buf->domain = 0;
 }
 
-static INLINE bool
+static inline bool
 nouveau_buffer_reallocate(struct nouveau_screen *screen,
                           struct nv04_resource *buf, unsigned domain)
 {
@@ -219,7 +219,7 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
 /* Does a CPU wait for the buffer's backing data to become reliably accessible
  * for write/read by waiting on the buffer's relevant fences.
  */
-static INLINE bool
+static inline bool
 nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ) {
@@ -244,7 +244,7 @@ nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
    return true;
 }
 
-static INLINE bool
+static inline bool
 nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ)
@@ -253,7 +253,7 @@ nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw)
       return (buf->fence && !nouveau_fence_signalled(buf->fence));
 }
 
-static INLINE void
+static inline void
 nouveau_buffer_transfer_init(struct nouveau_transfer *tx,
                              struct pipe_resource *resource,
                              const struct pipe_box *box,
@@ -275,7 +275,7 @@ nouveau_buffer_transfer_init(struct nouveau_transfer *tx,
    tx->map = NULL;
 }
 
-static INLINE void
+static inline void
 nouveau_buffer_transfer_del(struct nouveau_context *nv,
                             struct nouveau_transfer *tx)
 {
@@ -330,7 +330,7 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
  * resource. This can be useful if we would otherwise have to wait for a read
  * operation to complete on this data.
  */
-static INLINE bool
+static inline bool
 nouveau_buffer_should_discard(struct nv04_resource *buf, unsigned usage)
 {
    if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE))
@@ -725,7 +725,7 @@ nouveau_user_buffer_create(struct pipe_screen *pscreen, void *ptr,
    return &buffer->base;
 }
 
-static INLINE bool
+static inline bool
 nouveau_buffer_data_fetch(struct nouveau_context *nv, struct nv04_resource *buf,
                           struct nouveau_bo *bo, unsigned offset, unsigned size)
 {
@@ -833,7 +833,7 @@ nouveau_user_buffer_upload(struct nouveau_context *nv,
 
 /* Scratch data allocation. */
 
-static INLINE int
+static inline int
 nouveau_scratch_bo_alloc(struct nouveau_context *nv, struct nouveau_bo **pbo,
                          unsigned size)
 {
@@ -870,7 +870,7 @@ nouveau_scratch_runout_release(struct nouveau_context *nv)
 /* Allocate an extra bo if we can't fit everything we need simultaneously.
  * (Could happen for very large user arrays.)
  */
-static INLINE bool
+static inline bool
 nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 {
    int ret;
@@ -904,7 +904,7 @@ nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 /* Continue to next scratch buffer, if available (no wrapping, large enough).
  * Allocate it if it has not yet been created.
  */
-static INLINE bool
+static inline bool
 nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
 {
    struct nouveau_bo *bo;
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h
index 4d05d3d2397..7e6a6cc804b 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.h
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.h
@@ -66,20 +66,20 @@ void *
 nouveau_resource_map_offset(struct nouveau_context *, struct nv04_resource *,
                             uint32_t offset, uint32_t flags);
 
-static INLINE void
+static inline void
 nouveau_resource_unmap(struct nv04_resource *res)
 {
    /* no-op */
 }
 
-static INLINE struct nv04_resource *
+static inline struct nv04_resource *
 nv04_resource(struct pipe_resource *resource)
 {
    return (struct nv04_resource *)resource;
 }
 
 /* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */
-static INLINE bool
+static inline bool
 nouveau_resource_mapped_by_gpu(struct pipe_resource *resource)
 {
    return nv04_resource(resource)->domain != 0;
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index d99e485bbc6..24deb7ee4c0 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -53,7 +53,7 @@ struct nouveau_context {
    } stats;
 };
 
-static INLINE struct nouveau_context *
+static inline struct nouveau_context *
 nouveau_context(struct pipe_context *pipe)
 {
    return (struct nouveau_context *)pipe;
@@ -69,7 +69,7 @@ nouveau_scratch_runout_release(struct nouveau_context *);
  * because we don't want to un-bo_ref each allocation every time. This is less
  * work, and we need the wrap index anyway for extreme situations.
  */
-static INLINE void
+static inline void
 nouveau_scratch_done(struct nouveau_context *nv)
 {
    nv->scratch.wrap = nv->scratch.id;
@@ -84,7 +84,7 @@ void *
 nouveau_scratch_get(struct nouveau_context *, unsigned size, uint64_t *gpu_addr,
                     struct nouveau_bo **);
 
-static INLINE void
+static inline void
 nouveau_context_destroy(struct nouveau_context *ctx)
 {
    int i;
@@ -96,7 +96,7 @@ nouveau_context_destroy(struct nouveau_context *ctx)
    FREE(ctx);
 }
 
-static INLINE  void
+static inline  void
 nouveau_context_update_frame_stats(struct nouveau_context *nv)
 {
    nv->stats.buf_cache_frame <<= 1;
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 904df0be7fa..a1587051b0f 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -37,7 +37,7 @@ void nouveau_fence_next(struct nouveau_screen *);
 bool nouveau_fence_wait(struct nouveau_fence *);
 bool nouveau_fence_signalled(struct nouveau_fence *);
 
-static INLINE void
+static inline void
 nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref)
 {
    if (fence)
@@ -51,7 +51,7 @@ nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref)
    *ref = fence;
 }
 
-static INLINE struct nouveau_fence *
+static inline struct nouveau_fence *
 nouveau_fence(struct pipe_fence_handle *fence)
 {
    return (struct nouveau_fence *)fence;
diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h
index ff97aaa9af0..1538c7b6e57 100644
--- a/src/gallium/drivers/nouveau/nouveau_gldefs.h
+++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h
@@ -1,7 +1,7 @@
 #ifndef __NOUVEAU_GLDEFS_H__
 #define __NOUVEAU_GLDEFS_H__
 
-static INLINE unsigned
+static inline unsigned
 nvgl_blend_func(unsigned factor)
 {
 	switch (factor) {
@@ -40,7 +40,7 @@ nvgl_blend_func(unsigned factor)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_blend_eqn(unsigned func)
 {
 	switch (func) {
@@ -59,7 +59,7 @@ nvgl_blend_eqn(unsigned func)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_logicop_func(unsigned func)
 {
 	switch (func) {
@@ -100,7 +100,7 @@ nvgl_logicop_func(unsigned func)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_comparison_op(unsigned op)
 {
 	switch (op) {
@@ -125,7 +125,7 @@ nvgl_comparison_op(unsigned op)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_polygon_mode(unsigned mode)
 {
 	switch (mode) {
@@ -140,7 +140,7 @@ nvgl_polygon_mode(unsigned mode)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_stencil_op(unsigned op)
 {
 	switch (op) {
@@ -165,7 +165,7 @@ nvgl_stencil_op(unsigned op)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_primitive(unsigned prim) {
 	switch (prim) {
 	case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/nouveau/nouveau_mm.c b/src/gallium/drivers/nouveau/nouveau_mm.c
index 9c454c56db0..43b3d99f48a 100644
--- a/src/gallium/drivers/nouveau/nouveau_mm.c
+++ b/src/gallium/drivers/nouveau/nouveau_mm.c
@@ -70,7 +70,7 @@ mm_slab_alloc(struct mm_slab *slab)
    return -1;
 }
 
-static INLINE void
+static inline void
 mm_slab_free(struct mm_slab *slab, int i)
 {
    assert(i < slab->count);
@@ -79,7 +79,7 @@ mm_slab_free(struct mm_slab *slab, int i)
    assert(slab->free <= slab->count);
 }
 
-static INLINE int
+static inline int
 mm_get_order(uint32_t size)
 {
    int s = __builtin_clz(size) ^ 31;
@@ -104,7 +104,7 @@ mm_bucket_by_size(struct nouveau_mman *cache, unsigned size)
 }
 
 /* size of bo allocation for slab with chunks of (1 << chunk_order) bytes */
-static INLINE uint32_t
+static inline uint32_t
 mm_default_slab_size(unsigned chunk_order)
 {
    static const int8_t slab_order[MM_MAX_ORDER - MM_MIN_ORDER + 1] =
@@ -263,7 +263,7 @@ nouveau_mm_create(struct nouveau_device *dev, uint32_t domain,
    return cache;
 }
 
-static INLINE void
+static inline void
 nouveau_mm_free_slabs(struct list_head *head)
 {
    struct mm_slab *slab, *next;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 05f56ed5842..4fdde9fbf3d 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -112,7 +112,7 @@ struct nouveau_screen {
 # define NOUVEAU_DRV_STAT_IFD(x)
 #endif
 
-static INLINE struct nouveau_screen *
+static inline struct nouveau_screen *
 nouveau_screen(struct pipe_screen *pscreen)
 {
 	return (struct nouveau_screen *)pscreen;
diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h
index 4f8bd7bdf16..f38014091ba 100644
--- a/src/gallium/drivers/nouveau/nouveau_statebuf.h
+++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h
@@ -20,7 +20,7 @@ struct nouveau_statebuf_builder
 #define sb_data(sb, v) *(sb).p++ = (v)
 #endif
 
-static INLINE uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
+static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
 {
 	return (size << 18) | (subc << 13) | mthd;
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c
index 30f1d9dcb25..e414a534418 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -100,7 +100,7 @@ nouveau_vpe_fini(struct nouveau_decoder *dec) {
    dec->current = dec->future = dec->past = 8;
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb)
 {
    int cbb;
@@ -125,7 +125,7 @@ nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb)
 {
    int cbb;
@@ -143,7 +143,7 @@ nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec,
                           const struct pipe_mpeg12_macroblock *mb,
                           bool luma)
@@ -187,7 +187,7 @@ nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec,
                      x | (y << NV17_MPEG_CMD_MB_COORDS_Y__SHIFT));
 }
 
-static INLINE unsigned int
+static inline unsigned int
 nouveau_vpe_mb_mv_flags(bool luma, int mv_h, int mv_v, bool forward, bool first, bool vert)
 {
    unsigned mc_header = 0;
@@ -228,7 +228,7 @@ static int div_up(int val, int mult) {
    return val / mult;
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_mv(struct nouveau_decoder *dec, unsigned mc_header,
                    bool luma, bool frame, bool forward, bool vert,
                    int x, int y, const short motions[2],
diff --git a/src/gallium/drivers/nouveau/nouveau_video.h b/src/gallium/drivers/nouveau/nouveau_video.h
index 08d48b371fd..fd1bd527deb 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_video.h
@@ -45,7 +45,7 @@ struct nouveau_decoder {
 #define NV31_VIDEO_BIND_CMD     NV31_MPEG_IMAGE_Y_OFFSET__LEN
 #define NV31_VIDEO_BIND_COUNT  (NV31_MPEG_IMAGE_Y_OFFSET__LEN + 1)
 
-static INLINE void
+static inline void
 nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) {
    dec->cmds[dec->ofs++] = data;
 }
@@ -54,33 +54,33 @@ nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) {
 #define NV31_MPEG(mthd) SUBC_MPEG(NV31_MPEG_##mthd)
 #define NV84_MPEG(mthd) SUBC_MPEG(NV84_MPEG_##mthd)
 
-static INLINE uint32_t
+static inline uint32_t
 NV04_FIFO_PKHDR(int subc, int mthd, unsigned size)
 {
    return 0x00000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV04_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x40000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, NV04_FIFO_PKHDR(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, NV04_FIFO_PKHDR_NI(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd,
            struct nouveau_bo *bo, uint32_t offset,
 	   struct nouveau_bufctx *ctx, int bin, uint32_t rw)
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 279a1ce18ef..33e3bef3df3 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -135,22 +135,22 @@ struct comm {
 	uint32_t parse_endpos[0x10]; // 1c0
 };
 
-static INLINE uint32_t nouveau_vp3_video_align(uint32_t h)
+static inline uint32_t nouveau_vp3_video_align(uint32_t h)
 {
    return ((h+0x3f)&~0x3f);
 };
 
-static INLINE uint32_t mb(uint32_t coord)
+static inline uint32_t mb(uint32_t coord)
 {
    return (coord + 0xf)>>4;
 }
 
-static INLINE uint32_t mb_half(uint32_t coord)
+static inline uint32_t mb_half(uint32_t coord)
 {
    return (coord + 0x1f)>>5;
 }
 
-static INLINE uint64_t
+static inline uint64_t
 nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target)
 {
    uint64_t ret;
@@ -161,7 +161,7 @@ nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video
    return dec->ref_bo->offset + ret;
 }
 
-static INLINE void
+static inline void
 nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2,
                           uint32_t *cbcr, uint32_t *cbcr2)
 {
@@ -182,7 +182,7 @@ nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2,
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vp3_inter_sizes(struct nouveau_vp3_decoder *dec, uint32_t slice_count,
                         uint32_t *slice_size, uint32_t *bucket_size,
                         uint32_t *ring_size)
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index a5bb489980b..389a229eb78 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -15,13 +15,13 @@
 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN      64
 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK (NOUVEAU_MIN_BUFFER_MAP_ALIGN - 1)
 
-static INLINE uint32_t
+static inline uint32_t
 PUSH_AVAIL(struct nouveau_pushbuf *push)
 {
    return push->end - push->cur;
 }
 
-static INLINE bool
+static inline bool
 PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size)
 {
    if (PUSH_AVAIL(push) < size)
@@ -29,20 +29,20 @@ PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size)
    return true;
 }
 
-static INLINE void
+static inline void
 PUSH_DATA(struct nouveau_pushbuf *push, uint32_t data)
 {
    *push->cur++ = data;
 }
 
-static INLINE void
+static inline void
 PUSH_DATAp(struct nouveau_pushbuf *push, const void *data, uint32_t size)
 {
    memcpy(push->cur, data, size * 4);
    push->cur += size;
 }
 
-static INLINE void
+static inline void
 PUSH_DATAf(struct nouveau_pushbuf *push, float f)
 {
    union { float f; uint32_t i; } u;
@@ -50,7 +50,7 @@ PUSH_DATAf(struct nouveau_pushbuf *push, float f)
    PUSH_DATA(push, u.i);
 }
 
-static INLINE void
+static inline void
 PUSH_KICK(struct nouveau_pushbuf *push)
 {
    nouveau_pushbuf_kick(push, push->channel);
@@ -60,7 +60,7 @@ PUSH_KICK(struct nouveau_pushbuf *push)
 #define NOUVEAU_RESOURCE_FLAG_LINEAR   (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define NOUVEAU_RESOURCE_FLAG_DRV_PRIV (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 
-static INLINE uint32_t
+static inline uint32_t
 nouveau_screen_transfer_flags(unsigned pipe)
 {
 	uint32_t flags = 0;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
index c92e3b4e7ea..118cac77277 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -32,7 +32,7 @@
 #include "nv30/nv30_context.h"
 #include "nv30/nv30_format.h"
 
-static INLINE uint32_t
+static inline uint32_t
 pack_rgba(enum pipe_format format, const float *rgba)
 {
    union util_color uc;
@@ -40,7 +40,7 @@ pack_rgba(enum pipe_format format, const float *rgba)
    return uc.ui[0];
 }
 
-static INLINE uint32_t
+static inline uint32_t
 pack_zeta(enum pipe_format format, double depth, unsigned stencil)
 {
    uint32_t zuint = (uint32_t)(depth * 4294967295.0);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 92e9284d52c..d5c18bb62dc 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -125,7 +125,7 @@ struct nv30_context {
    bool render_cond_cond;
 };
 
-static INLINE struct nv30_context *
+static inline struct nv30_context *
 nv30_context(struct pipe_context *pipe)
 {
    return (struct nv30_context *)pipe;
@@ -214,7 +214,7 @@ nv30_state_release(struct nv30_context *nv30);
 #define NV30_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV30_3D_VERTEX_BEGIN_END_##n
 
-static INLINE unsigned
+static inline unsigned
 nv30_prim_gl(unsigned prim)
 {
    switch (prim) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index adfa9f11d66..098d6e499fa 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -52,7 +52,7 @@ struct nv30_render {
    uint32_t prim;
 };
 
-static INLINE struct nv30_render *
+static inline struct nv30_render *
 nv30_render(struct vbuf_render *render)
 {
    return (struct nv30_render *)render;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.h b/src/gallium/drivers/nouveau/nv30/nv30_format.h
index 8bf4a37299f..fa1e922fb65 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_format.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_format.h
@@ -27,28 +27,28 @@ struct nv30_texfmt {
 };
 
 extern const struct nv30_format_info nv30_format_info_table[];
-static INLINE const struct nv30_format_info *
+static inline const struct nv30_format_info *
 nv30_format_info(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_format_info_table[format];
 }
 
 extern const struct nv30_format nv30_format_table[];
-static INLINE const struct nv30_format *
+static inline const struct nv30_format *
 nv30_format(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_format_table[format];
 }
 
 extern const struct nv30_vtxfmt nv30_vtxfmt_table[];
-static INLINE const struct nv30_vtxfmt *
+static inline const struct nv30_vtxfmt *
 nv30_vtxfmt(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_vtxfmt_table[format];
 }
 
 extern const struct nv30_texfmt nv30_texfmt_table[];
-static INLINE const struct nv30_texfmt *
+static inline const struct nv30_texfmt *
 nv30_texfmt(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_texfmt_table[format];
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
index ea654fb2617..c75b4b95fd8 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
@@ -33,7 +33,7 @@
 #include "nv30/nv30_resource.h"
 #include "nv30/nv30_transfer.h"
 
-static INLINE unsigned
+static inline unsigned
 layer_offset(struct pipe_resource *pt, unsigned level, unsigned layer)
 {
    struct nv30_miptree *mt = nv30_miptree(pt);
@@ -78,13 +78,13 @@ struct nv30_transfer {
    unsigned nblocksy;
 };
 
-static INLINE struct nv30_transfer *
+static inline struct nv30_transfer *
 nv30_transfer(struct pipe_transfer *ptx)
 {
    return (struct nv30_transfer *)ptx;
 }
 
-static INLINE void
+static inline void
 define_rect(struct pipe_resource *pt, unsigned level, unsigned z,
             unsigned x, unsigned y, unsigned w, unsigned h,
             struct nv30_rect *rect)
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c
index 011f1e437dc..67ab0508c17 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_push.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c
@@ -52,7 +52,7 @@ struct push_context {
    uint32_t restart_index;
 };
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -62,7 +62,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -72,7 +72,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
index dfe2059e6ea..3980be9579a 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_query.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -98,7 +98,7 @@ struct nv30_query {
    uint64_t result;
 };
 
-static INLINE struct nv30_query *
+static inline struct nv30_query *
 nv30_query(struct pipe_query *pipe)
 {
    return (struct nv30_query *)pipe;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
index 7ea7b42faa7..8dac7795c9d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
@@ -15,7 +15,7 @@ struct nv30_surface {
    uint16_t depth;
 };
 
-static INLINE struct nv30_surface *
+static inline struct nv30_surface *
 nv30_surface(struct pipe_surface *ps)
 {
    return (struct nv30_surface *)ps;
@@ -38,7 +38,7 @@ struct nv30_miptree {
    unsigned ms_y:1;
 };
 
-static INLINE struct nv30_miptree *
+static inline struct nv30_miptree *
 nv30_miptree(struct pipe_resource *pt)
 {
    return (struct nv30_miptree *)pt;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.h b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
index 3f2e47fec99..7b17b88097c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
@@ -40,7 +40,7 @@ struct nv30_screen {
    struct nouveau_heap *vp_data_heap;
 };
 
-static INLINE struct nv30_screen *
+static inline struct nv30_screen *
 nv30_screen(struct pipe_screen *pscreen)
 {
    return (struct nv30_screen *)pscreen;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_texture.c b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
index c3567217442..bfe21cceaa2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_texture.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
@@ -37,7 +37,7 @@
 #define NV40_WRAP(n) \
    case PIPE_TEX_WRAP_##n: ret = NV40_3D_TEX_WRAP_S_##n; break
 
-static INLINE unsigned
+static inline unsigned
 wrap_mode(unsigned pipe)
 {
    unsigned ret = NV30_3D_TEX_WRAP_S_REPEAT;
@@ -58,7 +58,7 @@ wrap_mode(unsigned pipe)
    return ret >> NV30_3D_TEX_WRAP_S__SHIFT;
 }
 
-static INLINE unsigned
+static inline unsigned
 filter_mode(const struct pipe_sampler_state *cso)
 {
    unsigned filter;
@@ -104,7 +104,7 @@ filter_mode(const struct pipe_sampler_state *cso)
    return filter;
 }
 
-static INLINE unsigned
+static inline unsigned
 compare_mode(const struct pipe_sampler_state *cso)
 {
    if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE)
@@ -201,7 +201,7 @@ nv30_bind_sampler_states(struct pipe_context *pipe,
    }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 swizzle(const struct nv30_texfmt *fmt, unsigned cmp, unsigned swz)
 {
    uint32_t data = fmt->swz[swz].src << 8;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
index d76b73ab4a4..214da6568c3 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -41,7 +41,7 @@
  * of different ways.
  */
 
-static INLINE bool
+static inline bool
 nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst)
 {
    if (src->x1 - src->x0 != dst->x1 - dst->x0)
@@ -51,7 +51,7 @@ nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst)
    return false;
 }
 
-static INLINE bool
+static inline bool
 nv30_transfer_blit(XFER_ARGS)
 {
    if (nv30->screen->eng3d->oclass < NV40_3D_CLASS)
@@ -67,7 +67,7 @@ nv30_transfer_blit(XFER_ARGS)
    return true;
 }
 
-static INLINE struct nouveau_heap *
+static inline struct nouveau_heap *
 nv30_transfer_rect_vertprog(struct nv30_context *nv30)
 {
    struct nouveau_heap *heap = nv30->screen->vp_exec_heap;
@@ -108,7 +108,7 @@ nv30_transfer_rect_vertprog(struct nv30_context *nv30)
 }
 
 
-static INLINE struct nv04_resource *
+static inline struct nv04_resource *
 nv30_transfer_rect_fragprog(struct nv30_context *nv30)
 {
    struct nv04_resource *fp = nv04_resource(nv30->blit_fp);
@@ -554,7 +554,7 @@ linear_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
    return base + (y * rect->pitch) + (x * rect->cpp);
 }
 
-static INLINE unsigned
+static inline unsigned
 swizzle2d(unsigned v, unsigned s)
 {
    v = (v | (v << 8)) & 0x00ff00ff;
@@ -614,7 +614,7 @@ swizzle3d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
 
 typedef char *(*get_ptr_t)(struct nv30_rect *, char *, int, int, int);
 
-static INLINE get_ptr_t
+static inline get_ptr_t
 get_ptr(struct nv30_rect *rect)
 {
    if (rect->pitch)
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index b0b1b54046d..8494549e9b1 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -79,7 +79,7 @@ nv30_emit_vtxattr(struct nv30_context *nv30, struct pipe_vertex_buffer *vb,
    }
 }
 
-static INLINE void
+static inline void
 nv30_vbuf_range(struct nv30_context *nv30, int vbi,
                 uint32_t *base, uint32_t *size)
 {
@@ -163,7 +163,7 @@ nv30_update_user_vbufs(struct nv30_context *nv30)
    nv30->base.vbo_dirty = true;
 }
 
-static INLINE void
+static inline void
 nv30_release_user_vbufs(struct nv30_context *nv30)
 {
    uint32_t vbo_user = nv30->vbo_user;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
index 5cee5df60ce..2324b517c44 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
@@ -19,34 +19,34 @@
 #define NV40_3D_PRIM_RESTART_ENABLE 0x1dac
 #define NV40_3D_PRIM_RESTART_INDEX  0x1db0
 
-static INLINE void
+static inline void
 PUSH_RELOC(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t offset,
       uint32_t flags, uint32_t vor, uint32_t tor)
 {
    nouveau_pushbuf_reloc(push, bo, offset, flags, vor, tor);
 }
 
-static INLINE struct nouveau_bufctx *
+static inline struct nouveau_bufctx *
 bufctx(struct nouveau_pushbuf *push)
 {
    struct nouveau_bufctx **pctx = push->user_priv;
    return *pctx;
 }
 
-static INLINE void
+static inline void
 PUSH_RESET(struct nouveau_pushbuf *push, int bin)
 {
    nouveau_bufctx_reset(bufctx(push), bin);
 }
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, int bin,
      struct nouveau_bo *bo, uint32_t access)
 {
    nouveau_bufctx_refn(bufctx(push), bin, bo, access);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t offset, uint32_t access)
 {
@@ -55,7 +55,7 @@ PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
    PUSH_DATA(push, bo->offset + offset);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t access, uint32_t vor, uint32_t tor)
 {
@@ -67,7 +67,7 @@ PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       PUSH_DATA(push, tor);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t data, uint32_t access,
       uint32_t vor, uint32_t tor)
@@ -80,7 +80,7 @@ PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       PUSH_DATA(push, data | tor);
 }
 
-static INLINE struct nouveau_bufref *
+static inline struct nouveau_bufref *
 PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
      struct nouveau_bo *bo, uint32_t data, uint32_t access,
      uint32_t vor, uint32_t tor)
@@ -99,7 +99,7 @@ PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
    return bref;
 }
 
-static INLINE void
+static inline void
 PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
            struct nv04_resource *r, uint32_t data, uint32_t access,
            uint32_t vor, uint32_t tor)
@@ -108,14 +108,14 @@ PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
              r->domain | access, vor, tor)->priv = r;
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, 0x00000000 | (size << 18) | (subc << 13) | mthd);
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
 {
    PUSH_SPACE(push, size + 1);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index 9a28e742f3a..e68d23e5587 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -44,7 +44,7 @@ struct nvfx_fpc {
    struct util_dynarray label_relocs;
 };
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 temp(struct nvfx_fpc *fpc)
 {
    int idx = __builtin_ctzll(~fpc->r_temps);
@@ -60,7 +60,7 @@ temp(struct nvfx_fpc *fpc)
    return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
-static INLINE void
+static inline void
 release_temps(struct nvfx_fpc *fpc)
 {
    fpc->r_temps &= ~fpc->r_temps_discard;
@@ -373,7 +373,7 @@ nv40_fp_brk(struct nvfx_fpc *fpc)
    hw[3] = 0;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 {
    struct nvfx_src src;
@@ -415,7 +415,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
    return src;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
    switch (fdst->Register.File) {
    case TGSI_FILE_OUTPUT:
@@ -430,7 +430,7 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
    }
 }
 
-static INLINE int
+static inline int
 tgsi_mask(uint tgsi)
 {
    int mask = 0;
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
index b0538911960..e66d8af7620 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
@@ -448,7 +448,7 @@ struct nvfx_insn
 	struct nvfx_src src[3];
 };
 
-static INLINE struct nvfx_insn
+static inline struct nvfx_insn
 nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
 {
 	struct nvfx_insn insn = {
@@ -468,7 +468,7 @@ nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, s
 	return insn;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 nvfx_reg(int type, int index)
 {
 	struct nvfx_reg temp = {
@@ -478,7 +478,7 @@ nvfx_reg(int type, int index)
 	return temp;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src(struct nvfx_reg reg)
 {
 	struct nvfx_src temp = {
@@ -491,7 +491,7 @@ nvfx_src(struct nvfx_reg reg)
 	return temp;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
 {
 	struct nvfx_src dst = src;
@@ -503,14 +503,14 @@ nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
 	return dst;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_neg(struct nvfx_src src)
 {
 	src.negate = !src.negate;
 	return src;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_abs(struct nvfx_src src)
 {
 	src.abs = 1;
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index dfa12b0ec08..5757eb1fb16 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -416,7 +416,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
    return src;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
    struct nvfx_reg dst;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
index de490af4557..0ccec568d3a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
@@ -37,7 +37,7 @@ nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *);
 #define NV50_BLIT_TEXTURE_2D_ARRAY  5
 #define NV50_BLIT_MAX_TEXTURE_TYPES 6
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_texture_type(enum pipe_texture_target target)
 {
    switch (target) {
@@ -52,7 +52,7 @@ nv50_blit_texture_type(enum pipe_texture_target target)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target)
 {
    switch (target) {
@@ -67,7 +67,7 @@ nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target)
    }
 }
 
-static INLINE enum pipe_texture_target
+static inline enum pipe_texture_target
 nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target)
 {
    switch (target) {
@@ -81,7 +81,7 @@ nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_get_filter(const struct pipe_blit_info *info)
 {
    if (info->dst.resource->nr_samples < info->src.resource->nr_samples)
@@ -102,7 +102,7 @@ nv50_blit_get_filter(const struct pipe_blit_info *info)
 /* Since shaders cannot export stencil, we cannot copy stencil values when
  * rendering to ZETA, so we attach the ZS surface to a colour render target.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 nv50_blit_zeta_to_colour_format(enum pipe_format format)
 {
    switch (format) {
@@ -127,7 +127,7 @@ nv50_blit_zeta_to_colour_format(enum pipe_format format)
 }
 
 
-static INLINE uint16_t
+static inline uint16_t
 nv50_blit_derive_color_mask(const struct pipe_blit_info *info)
 {
    const unsigned mask = info->mask;
@@ -162,7 +162,7 @@ nv50_blit_derive_color_mask(const struct pipe_blit_info *info)
    return color_mask;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
 {
    uint32_t mask = 0;
@@ -192,7 +192,7 @@ nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
 #endif
 
 /* return true for formats that can be converted among each other by NVC0_2D */
-static INLINE bool
+static inline bool
 nv50_2d_dst_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -201,7 +201,7 @@ nv50_2d_dst_format_faithful(enum pipe_format format)
    uint8_t id = nv50_format_table[format].rt;
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
-static INLINE bool
+static inline bool
 nv50_2d_src_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -211,7 +211,7 @@ nv50_2d_src_format_faithful(enum pipe_format format)
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
 
-static INLINE bool
+static inline bool
 nv50_2d_format_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
@@ -219,7 +219,7 @@ nv50_2d_format_supported(enum pipe_format format)
       (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0)));
 }
 
-static INLINE bool
+static inline bool
 nv50_2d_dst_format_ops_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 5949b4debb5..ce12e714774 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -164,14 +164,14 @@ struct nv50_context {
    struct nv50_blitctx *blit;
 };
 
-static INLINE struct nv50_context *
+static inline struct nv50_context *
 nv50_context(struct pipe_context *pipe)
 {
    return (struct nv50_context *)pipe;
 }
 
 /* return index used in nv50_context arrays for a specific shader type */
-static INLINE unsigned
+static inline unsigned
 nv50_context_shader_stage(unsigned pipe)
 {
    switch (pipe) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index 98de9664e12..92d49e49ff2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -204,7 +204,7 @@ const struct u_resource_vtbl nv50_miptree_vtbl =
    u_default_transfer_inline_write  /* transfer_inline_write */
 };
 
-static INLINE bool
+static inline bool
 nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -438,7 +438,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen,
 
 
 /* Offset of zslice @z from start of level @l. */
-INLINE unsigned
+inline unsigned
 nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
 {
    const struct pipe_resource *pt = &mt->base.base;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 5d3fb608371..02dc3677259 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -25,7 +25,7 @@
 
 #include "codegen/nv50_ir_driver.h"
 
-static INLINE unsigned
+static inline unsigned
 bitcount4(const uint32_t val)
 {
    static const uint8_t cnt[16]
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index 2d5ac605284..f31eaa0e314 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -29,7 +29,7 @@ struct push_context {
    uint32_t instance_id;
 };
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -39,7 +39,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -49,7 +49,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
@@ -179,7 +179,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 #define NV50_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_prim_gl(unsigned prim)
 {
    switch (prim) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index f4c0639f233..f4adbf8c653 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -56,7 +56,7 @@ struct nv50_query {
 
 #define NV50_QUERY_ALLOC_SPACE 256
 
-static INLINE struct nv50_query *
+static inline struct nv50_query *
 nv50_query(struct pipe_query *pipe)
 {
    return (struct nv50_query *)pipe;
@@ -281,7 +281,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
 }
 
-static INLINE void
+static inline void
 nv50_query_update(struct nv50_query *q)
 {
    if (q->is64bit) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index 19b1a391c4f..a46e622c597 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -56,7 +56,7 @@ struct nv50_miptree {
    uint8_t ms_mode;
 };
 
-static INLINE struct nv50_miptree *
+static inline struct nv50_miptree *
 nv50_miptree(struct pipe_resource *pt)
 {
    return (struct nv50_miptree *)pt;
@@ -98,13 +98,13 @@ struct nv50_surface {
    uint16_t depth;
 };
 
-static INLINE struct nv50_surface *
+static inline struct nv50_surface *
 nv50_surface(struct pipe_surface *ps)
 {
    return (struct nv50_surface *)ps;
 }
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 nv50_zs_to_s_format(enum pipe_format format)
 {
    switch (format) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index d9199e4478a..ce51f0fc254 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -97,7 +97,7 @@ struct nv50_screen {
    struct nouveau_object *m2mf;
 };
 
-static INLINE struct nv50_screen *
+static inline struct nv50_screen *
 nv50_screen(struct pipe_screen *screen)
 {
    return (struct nv50_screen *)screen;
@@ -109,7 +109,7 @@ void nv50_blitter_destroy(struct nv50_screen *);
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
 int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
 
-static INLINE void
+static inline void
 nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
    struct nv50_screen *screen = nv50_screen(res->base.screen);
@@ -121,7 +121,7 @@ nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
    }
 }
 
-static INLINE void
+static inline void
 nv50_resource_validate(struct nv04_resource *res, uint32_t flags)
 {
    if (likely(res->bo)) {
@@ -144,21 +144,21 @@ struct nv50_format {
 
 extern const struct nv50_format nv50_format_table[];
 
-static INLINE void
+static inline void
 nv50_screen_tic_unlock(struct nv50_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0)
       screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
 }
 
-static INLINE void
+static inline void
 nv50_screen_tsc_unlock(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0)
       screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
 }
 
-static INLINE void
+static inline void
 nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0) {
@@ -167,7 +167,7 @@ nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic)
    }
 }
 
-static INLINE void
+static inline void
 nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 9369093614d..b033ce5c6dc 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -127,7 +127,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
    return nv50_program_upload_code(nv50, prog);
 }
 
-static INLINE void
+static inline void
 nv50_program_update_context_state(struct nv50_context *nv50,
                                   struct nv50_program *prog, int stage)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 2bc8f4c9ae0..186d126305a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -62,7 +62,7 @@
  *     in advance to maintain elegant separate shader objects.)
  */
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_colormask(unsigned mask)
 {
    uint32_t ret = 0;
@@ -82,7 +82,7 @@ nv50_colormask(unsigned mask)
 #define NV50_BLEND_FACTOR_CASE(a, b) \
    case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_blend_fac(unsigned factor)
 {
    switch (factor) {
@@ -439,7 +439,7 @@ nv50_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
 #define NV50_TSC_WRAP_CASE(n) \
     case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_tsc_wrap_mode(unsigned wrap)
 {
    switch (wrap) {
@@ -572,7 +572,7 @@ nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
    FREE(hwcso);
 }
 
-static INLINE void
+static inline void
 nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
                                unsigned nr, void **hwcso)
 {
@@ -650,7 +650,7 @@ nv50_sampler_view_destroy(struct pipe_context *pipe,
    FREE(nv50_tic_entry(view));
 }
 
-static INLINE void
+static inline void
 nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
                              unsigned nr,
                              struct pipe_sampler_view **views)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index eeec0fb6562..985603df5fa 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -2,7 +2,7 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_defs.xml.h"
 
-static INLINE void
+static inline void
 nv50_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i)
 {
    BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(i)), 4);
@@ -275,7 +275,7 @@ nv50_validate_viewport(struct nv50_context *nv50)
    nv50->viewports_dirty = 0;
 }
 
-static INLINE void
+static inline void
 nv50_check_program_ucps(struct nv50_context *nv50,
                         struct nv50_program *vp, uint8_t mask)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index 6d1d846ca3d..853df5497ec 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -69,7 +69,7 @@ struct nv50_so_target {
    bool clean;
 };
 
-static INLINE struct nv50_so_target *
+static inline struct nv50_so_target *
 nv50_so_target(struct pipe_stream_output_target *ptarg)
 {
    return (struct nv50_so_target *)ptarg;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
index 99548cbdb42..e0793bb6ec4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
@@ -9,7 +9,7 @@ struct nv50_tsc_entry {
    uint32_t tsc[8];
 };
 
-static INLINE struct nv50_tsc_entry *
+static inline struct nv50_tsc_entry *
 nv50_tsc_entry(void *hwcso)
 {
    return (struct nv50_tsc_entry *)hwcso;
@@ -21,7 +21,7 @@ struct nv50_tic_entry {
    uint32_t tic[8];
 };
 
-static INLINE struct nv50_tic_entry *
+static inline struct nv50_tic_entry *
 nv50_tic_entry(struct pipe_sampler_view *view)
 {
    return (struct nv50_tic_entry *)view;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 90106e7ac03..b1ae01692cb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -49,7 +49,7 @@
 #define NOUVEAU_DRIVER 0x50
 #include "nv50/nv50_blit.h"
 
-static INLINE uint8_t
+static inline uint8_t
 nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nv50_format_table[format].rt;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index f6396fba9d1..fc6374d1b1b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -31,7 +31,7 @@
    (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK |   \
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 95c79ef864b..6324726acec 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -188,7 +188,7 @@ nv50_emit_vtxattr(struct nv50_context *nv50, struct pipe_vertex_buffer *vb,
    }
 }
 
-static INLINE void
+static inline void
 nv50_user_vbuf_range(struct nv50_context *nv50, unsigned vbi,
                      uint32_t *base, uint32_t *size)
 {
@@ -278,7 +278,7 @@ nv50_update_user_vbufs(struct nv50_context *nv50)
    nv50->base.vbo_dirty = true;
 }
 
-static INLINE void
+static inline void
 nv50_release_user_vbufs(struct nv50_context *nv50)
 {
    if (nv50->vbo_user) {
@@ -423,7 +423,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
 #define NV50_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_prim_gl(unsigned prim)
 {
    switch (prim) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index e8578c8be6f..76f1b41ea70 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -16,14 +16,14 @@
 #endif
 
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
                             unsigned flags, struct nouveau_bo *bo)
 {
    nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
 }
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin,
                          struct nv04_resource *res, unsigned flags)
 {
@@ -39,7 +39,7 @@ nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin,
 #define BCTX_REFN(bctx, bin, res, acc) \
    nv50_add_bufctx_resident(bctx, NV50_BIND_##bin, res, NOUVEAU_BO_##acc)
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 {
    struct nouveau_pushbuf_refn ref = { bo, flags };
@@ -61,39 +61,39 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n)
 
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR(int subc, int mthd, unsigned size)
 {
    return 0x00000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x40000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR_L(int subc, int mthd)
 {
    return 0x00030000 | (subc << 13) | mthd;
 }
 
 
-static INLINE uint32_t
+static inline uint32_t
 nouveau_bo_memtype(const struct nouveau_bo *bo)
 {
    return bo->config.nv50.memtype;
 }
 
 
-static INLINE void
+static inline void
 PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
 {
    *push->cur++ = (uint32_t)(data >> 32);
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
@@ -102,7 +102,7 @@ BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NV50_FIFO_PKHDR(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
@@ -112,7 +112,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 }
 
 /* long, non-incremental, nv50-only */
-static INLINE void
+static inline void
 BEGIN_NL50(struct nouveau_pushbuf *push, int subc, int mthd, uint32_t size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.h b/src/gallium/drivers/nouveau/nv50/nv84_video.h
index 2edba389dbf..09773c12974 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video.h
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.h
@@ -102,12 +102,12 @@ struct nv84_decoder {
    uint8_t mpeg12_non_intra_matrix[64];
 };
 
-static INLINE uint32_t mb(uint32_t coord)
+static inline uint32_t mb(uint32_t coord)
 {
    return (coord + 0xf)>>4;
 }
 
-static INLINE uint32_t mb_half(uint32_t coord)
+static inline uint32_t mb_half(uint32_t coord)
 {
    return (coord + 0x1f)>>5;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
index f3480b2e00e..8b121477a37 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
@@ -221,7 +221,7 @@ nv84_decoder_vp_h264(struct nv84_decoder *dec,
    PUSH_KICK (push);
 }
 
-static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
+static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
    int16_t ret = val * quant / 16;
    if (mpeg1 && ret) {
       if (ret > 0)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 7665991d1fd..bfa6504770c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -184,13 +184,13 @@ struct nvc0_context {
    struct util_dynarray global_residents;
 };
 
-static INLINE struct nvc0_context *
+static inline struct nvc0_context *
 nvc0_context(struct pipe_context *pipe)
 {
    return (struct nvc0_context *)pipe;
 }
 
-static INLINE unsigned
+static inline unsigned
 nvc0_shader_stage(unsigned pipe)
 {
    switch (pipe) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index 75859eea63d..15991c3d2bd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -133,7 +133,7 @@ nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
    return tile_flags;
 }
 
-static INLINE bool
+static inline bool
 nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -325,7 +325,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
 }
 
 /* Offset of zslice @z from start of level @l. */
-INLINE unsigned
+inline unsigned
 nvc0_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
 {
    const struct pipe_resource *pt = &mt->base.base;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index ccf3ecc3c5f..2410b832335 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -193,7 +193,7 @@ nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    return ret;
 }
 
-static INLINE void
+static inline void
 nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
 {
    uint8_t min = (vp->hdr[4] >> 12) & 0xff;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 2652a4acde3..f7b85a8e931 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -62,7 +62,7 @@ static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
 static boolean nvc0_mp_pm_query_result(struct nvc0_context *,
                                        struct nvc0_query *, void *, boolean);
 
-static INLINE struct nvc0_query *
+static inline struct nvc0_query *
 nvc0_query(struct pipe_query *pipe)
 {
    return (struct nvc0_query *)pipe;
@@ -422,7 +422,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
 }
 
-static INLINE void
+static inline void
 nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
 {
    if (q->is64bit) {
@@ -1249,7 +1249,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    }
 }
 
-static INLINE bool
+static inline bool
 nvc0_mp_pm_query_read_data(uint32_t count[32][4],
                            struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
@@ -1274,7 +1274,7 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4],
    return true;
 }
 
-static INLINE bool
+static inline bool
 nve4_mp_pm_query_read_data(uint32_t count[32][4],
                            struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 81b641b1bf3..8ecfe66d92f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -105,7 +105,7 @@ struct nvc0_screen {
    struct nouveau_object *nvsw;
 };
 
-static INLINE struct nvc0_screen *
+static inline struct nvc0_screen *
 nvc0_screen(struct pipe_screen *screen)
 {
    return (struct nvc0_screen *)screen;
@@ -290,7 +290,7 @@ int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
 bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
                                  uint32_t lneg, uint32_t cstack);
 
-static INLINE void
+static inline void
 nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
    struct nvc0_screen *screen = nvc0_screen(res->base.screen);
@@ -302,7 +302,7 @@ nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_resource_validate(struct nv04_resource *res, uint32_t flags)
 {
    if (likely(res->bo)) {
@@ -325,21 +325,21 @@ struct nvc0_format {
 
 extern const struct nvc0_format nvc0_format_table[];
 
-static INLINE void
+static inline void
 nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0)
       screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0)
       screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0) {
@@ -348,7 +348,7 @@ nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 450e0d52a5b..d7eeb56d2ca 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -27,7 +27,7 @@
 
 #include "nvc0/nvc0_context.h"
 
-static INLINE void
+static inline void
 nvc0_program_update_context_state(struct nvc0_context *nvc0,
                                   struct nvc0_program *prog, int stage)
 {
@@ -63,7 +63,7 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
    }
 }
 
-static INLINE bool
+static inline bool
 nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    if (prog->mem)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 1d54151ab8d..8b3da47c76c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -35,7 +35,7 @@
 
 #include "nouveau_gldefs.h"
 
-static INLINE uint32_t
+static inline uint32_t
 nvc0_colormask(unsigned mask)
 {
     uint32_t ret = 0;
@@ -55,7 +55,7 @@ nvc0_colormask(unsigned mask)
 #define NVC0_BLEND_FACTOR_CASE(a, b) \
    case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b
 
-static INLINE uint32_t
+static inline uint32_t
 nvc0_blend_fac(unsigned factor)
 {
    switch (factor) {
@@ -428,7 +428,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
    FREE(hwcso);
 }
 
-static INLINE void
+static inline void
 nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s,
                                unsigned nr, void **hwcso)
 {
@@ -537,7 +537,7 @@ nvc0_sampler_view_destroy(struct pipe_context *pipe,
    FREE(nv50_tic_entry(view));
 }
 
-static INLINE void
+static inline void
 nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
                              unsigned nr,
                              struct pipe_sampler_view **views)
@@ -1136,7 +1136,7 @@ nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader,
 #endif
 }
 
-static INLINE void
+static inline void
 nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
 {
    struct nv04_resource *buf = nv04_resource(res);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 242831461de..82004e7ca31 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -55,7 +55,7 @@ nvc0_validate_zcull(struct nvc0_context *nvc0)
 }
 #endif
 
-static INLINE void
+static inline void
 nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i)
 {
    BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6);
@@ -309,7 +309,7 @@ nvc0_validate_viewport(struct nvc0_context *nvc0)
    nvc0->viewports_dirty = 0;
 }
 
-static INLINE void
+static inline void
 nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -324,7 +324,7 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
    PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
 }
 
-static INLINE void
+static inline void
 nvc0_check_program_ucps(struct nvc0_context *nvc0,
                         struct nvc0_program *vp, uint8_t mask)
 {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 4bc4780ae17..9b69bf091d0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -68,7 +68,7 @@ struct nvc0_so_target {
    bool clean;
 };
 
-static INLINE struct nvc0_so_target *
+static inline struct nvc0_so_target *
 nvc0_so_target(struct pipe_stream_output_target *ptarg)
 {
    return (struct nvc0_so_target *)ptarg;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 2376e45c48e..51a6f93f891 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -47,7 +47,7 @@
 #define NOUVEAU_DRIVER 0xc0
 #include "nv50/nv50_blit.h"
 
-static INLINE uint8_t
+static inline uint8_t
 nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nvc0_format_table[format].rt;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 7c041cbfe34..dc790f4d4c2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -34,7 +34,7 @@
    (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK |   \
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
@@ -645,13 +645,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
    }
 }
 
-static INLINE void
+static inline void
 nvc0_update_surface_bindings(struct nvc0_context *nvc0)
 {
    /* TODO */
 }
 
-static INLINE void
+static inline void
 nve4_update_surface_bindings(struct nvc0_context *nvc0)
 {
    /* TODO */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index d21c7cc64fb..7cc5b4b1f48 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -329,7 +329,7 @@ nve4_m2mf_copy_linear(struct nouveau_context *nv,
 }
 
 
-static INLINE bool
+static inline bool
 nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt)
 {
    if (mt->base.domain == NOUVEAU_BO_VRAM)
@@ -339,7 +339,7 @@ nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt)
    return !nouveau_bo_memtype(mt->base.bo);
 }
 
-static INLINE bool
+static inline bool
 nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
 {
    if (!mt->base.mm) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 8458ff07b27..d8c19f111c1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -196,7 +196,7 @@ nvc0_set_constant_vertex_attrib(struct nvc0_context *nvc0, const unsigned a)
    push->cur += 5;
 }
 
-static INLINE void
+static inline void
 nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi,
                      uint32_t *base, uint32_t *size)
 {
@@ -214,7 +214,7 @@ nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi,
    }
 }
 
-static INLINE void
+static inline void
 nvc0_release_user_vbufs(struct nvc0_context *nvc0)
 {
    if (nvc0->vbo_user) {
@@ -529,7 +529,7 @@ nvc0_idxbuf_validate(struct nvc0_context *nvc0)
 #define NVC0_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nvc0_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -835,7 +835,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
                         buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }
 
-static INLINE void
+static inline void
 nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index ff9a66e63c2..685ada9c61b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -55,7 +55,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
    ctx->edgeflag.stride = 0;
 }
 
-static INLINE void
+static inline void
 nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
 {
    struct translate *translate = nvc0->vertex->translate;
@@ -78,7 +78,7 @@ nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
 {
    if (nvc0->idxbuf.buffer) {
@@ -90,7 +90,7 @@ nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
                        int32_t index_bias)
 {
@@ -112,7 +112,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
       ctx->edgeflag.data += (intptr_t)index_bias * vb->stride;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -120,7 +120,7 @@ prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -128,7 +128,7 @@ prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
@@ -136,21 +136,21 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
    return i;
 }
 
-static INLINE bool
+static inline bool
 ef_value(const struct push_context *ctx, uint32_t index)
 {
    float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
    return *pf ? true : false;
 }
 
-static INLINE bool
+static inline bool
 ef_toggle(struct push_context *ctx)
 {
    ctx->edgeflag.value = !ctx->edgeflag.value;
    return ctx->edgeflag.value;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
 {
    unsigned i;
@@ -158,7 +158,7 @@ ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
 {
    unsigned i;
@@ -166,7 +166,7 @@ ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
 {
    unsigned i;
@@ -174,7 +174,7 @@ ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
 {
    unsigned i;
@@ -182,7 +182,7 @@ ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
    return i;
 }
 
-static INLINE void *
+static inline void *
 nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -409,7 +409,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 #define NVC0_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nvc0_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -560,7 +560,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1);
 }
 
-static INLINE void
+static inline void
 copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
@@ -568,7 +568,7 @@ copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n)
       dst[i] = elts[i] + bias;
 }
 
-static INLINE void
+static inline void
 copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
@@ -576,7 +576,7 @@ copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n)
       dst[i] = elts[i] + bias;
 }
 
-static INLINE void
+static inline void
 copy_indices_u32(uint32_t *dst, const uint32_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
index 725e889683f..4ea8ca3cfa2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -15,14 +15,14 @@
 #endif
 
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
                             unsigned flags, struct nouveau_bo *bo)
 {
    nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
 }
 
-static INLINE void
+static inline void
 nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin,
                   struct nv04_resource *res, unsigned flags)
 {
@@ -38,7 +38,7 @@ nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin,
 #define BCTX_REFN(bctx, bin, res, acc) \
    nvc0_add_resident(bctx, NVC0_BIND_##bin, res, NOUVEAU_BO_##acc)
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 {
    struct nouveau_pushbuf_refn ref = { bo, flags };
@@ -69,46 +69,46 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
 {
    return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x60000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_IL(int subc, int mthd, uint16_t data)
 {
    assert(data < 0x2000);
    return 0x80000000 | (data << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_1I(int subc, int mthd, unsigned size)
 {
    return 0xa0000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
 
-static INLINE uint8_t
+static inline uint8_t
 nouveau_bo_memtype(const struct nouveau_bo *bo)
 {
    return bo->config.nvc0.memtype;
 }
 
 
-static INLINE void
+static inline void
 PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
 {
    *push->cur++ = (uint32_t)(data >> 32);
 }
 
-static INLINE void
+static inline void
 BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -117,7 +117,7 @@ BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -126,7 +126,7 @@ BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_NI(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -135,7 +135,7 @@ BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_1I(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 IMMED_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, uint16_t data)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 2933d9b24db..d3e5676873e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -364,7 +364,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 }
 
-static INLINE uint8_t
+static inline uint8_t
 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 {
    if (shared_size > (32 << 10))
@@ -413,7 +413,7 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
 }
 
-static INLINE struct nve4_cp_launch_desc *
+static inline struct nve4_cp_launch_desc *
 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 4d7af54d860..7364a68a579 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -68,7 +68,7 @@ struct nve4_cp_launch_desc
    u32 unk48[16];
 };
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
 {
    memset(desc, 0, sizeof(*desc));
@@ -78,7 +78,7 @@ nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
    desc->unk47_20 = 0x300;
 }
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                            unsigned index,
                            struct nouveau_bo *bo,
@@ -96,7 +96,7 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
    desc->cb_mask |= 1 << index;
 }
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
                                unsigned index,
                                const struct nvc0_constbuf *cb)
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 3873c9a31c1..5a58500b074 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -647,32 +647,32 @@ struct r300_context {
     for (atom = r300->first_dirty; atom != r300->last_dirty; atom++)
 
 /* Convenience cast wrappers. */
-static INLINE struct r300_query* r300_query(struct pipe_query* q)
+static inline struct r300_query* r300_query(struct pipe_query* q)
 {
     return (struct r300_query*)q;
 }
 
-static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf)
+static inline struct r300_surface* r300_surface(struct pipe_surface* surf)
 {
     return (struct r300_surface*)surf;
 }
 
-static INLINE struct r300_resource* r300_resource(struct pipe_resource* tex)
+static inline struct r300_resource* r300_resource(struct pipe_resource* tex)
 {
     return (struct r300_resource*)tex;
 }
 
-static INLINE struct r300_context* r300_context(struct pipe_context* context)
+static inline struct r300_context* r300_context(struct pipe_context* context)
 {
     return (struct r300_context*)context;
 }
 
-static INLINE struct r300_fragment_shader *r300_fs(struct r300_context *r300)
+static inline struct r300_fragment_shader *r300_fs(struct r300_context *r300)
 {
     return (struct r300_fragment_shader*)r300->fs.state;
 }
 
-static INLINE void r300_mark_atom_dirty(struct r300_context *r300,
+static inline void r300_mark_atom_dirty(struct r300_context *r300,
                                         struct r300_atom *atom)
 {
     atom->dirty = TRUE;
@@ -688,7 +688,7 @@ static INLINE void r300_mark_atom_dirty(struct r300_context *r300,
     }
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 r300_get_nonnull_cb(struct pipe_framebuffer_state *fb, unsigned i)
 {
     if (fb->cbufs[i])
@@ -777,12 +777,12 @@ void r300_update_derived_state(struct r300_context* r300);
 void r500_dump_rs_block(struct r300_rs_block *rs);
 
 
-static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
+static inline boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
 {
     return SCREEN_DBG_ON(ctx->screen, flags);
 }
 
-static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags,
+static inline void CTX_DBG(struct r300_context * ctx, unsigned flags,
                        const char * fmt, ...)
 {
     if (CTX_DBG_ON(ctx, flags)) {
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 39eb73da65d..b39624dad5f 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -77,14 +77,14 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
 /* Return TRUE if the shader was switched and should be re-emitted. */
 boolean r300_pick_fragment_shader(struct r300_context* r300);
 
-static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
+static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
 {
     if (!fs)
         return FALSE;
     return (fs->shader->code.writes_depth) ? TRUE : FALSE;
 }
 
-static INLINE boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs)
+static inline boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs)
 {
     if (!fs)
         return FALSE;
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 4c951d14f10..9dffa4935a3 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -871,7 +871,7 @@ struct r300_render {
     uint8_t *vbo_ptr;
 };
 
-static INLINE struct r300_render*
+static inline struct r300_render*
 r300_render(struct vbuf_render* render)
 {
     return (struct r300_render*)render;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 6292c2c4a03..f1df79ccb21 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -428,7 +428,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
  * Whether the format matches:
  *   PIPE_FORMAT_?10?10?10?2_UNORM
  */
-static INLINE boolean
+static inline boolean
 util_format_is_rgba1010102_variant(const struct util_format_description *desc)
 {
    static const unsigned size[4] = {10, 10, 10, 2};
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 7bba39bf12b..e15c3c7de0c 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -51,11 +51,11 @@ struct r300_screen {
 
 
 /* Convenience cast wrappers. */
-static INLINE struct r300_screen* r300_screen(struct pipe_screen* screen) {
+static inline struct r300_screen* r300_screen(struct pipe_screen* screen) {
     return (struct r300_screen*)screen;
 }
 
-static INLINE struct radeon_winsys *
+static inline struct radeon_winsys *
 radeon_winsys(struct pipe_screen *screen) {
     return r300_screen(screen)->rws;
 }
@@ -102,12 +102,12 @@ radeon_winsys(struct pipe_screen *screen) {
 #define DBG_P_STAT      (1 << 25)
 /*@}*/
 
-static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
+static inline boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
 {
     return (screen->debug & flags) ? TRUE : FALSE;
 }
 
-static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
+static inline void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
                               const char * fmt, ...)
 {
     if (SCREEN_DBG_ON(screen, flags)) {
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h
index b4c8520039b..14b849c8c93 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.h
+++ b/src/gallium/drivers/r300/r300_screen_buffer.h
@@ -46,7 +46,7 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
 
 /* Inline functions. */
 
-static INLINE struct r300_buffer *r300_buffer(struct pipe_resource *buffer)
+static inline struct r300_buffer *r300_buffer(struct pipe_resource *buffer)
 {
     return (struct r300_buffer *)buffer;
 }
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
index b756048c6c7..93bbc9d4a96 100644
--- a/src/gallium/drivers/r300/r300_shader_semantics.h
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -46,7 +46,7 @@ struct r300_shader_semantics {
     int num_generic;
 };
 
-static INLINE void r300_shader_semantics_reset(
+static inline void r300_shader_semantics_reset(
     struct r300_shader_semantics* info)
 {
     int i;
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index feec494c4dc..fbd91cda9fe 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -32,13 +32,13 @@
 
 /* Some maths. These should probably find their way to u_math, if needed. */
 
-static INLINE int pack_float_16_6x(float f) {
+static inline int pack_float_16_6x(float f) {
     return ((int)(f * 6.0) & 0xffff);
 }
 
 /* Blend state. */
 
-static INLINE uint32_t r300_translate_blend_function(int blend_func,
+static inline uint32_t r300_translate_blend_function(int blend_func,
                                                      boolean clamp)
 {
     switch (blend_func) {
@@ -60,7 +60,7 @@ static INLINE uint32_t r300_translate_blend_function(int blend_func,
     return 0;
 }
 
-static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
+static inline uint32_t r300_translate_blend_factor(int blend_fact)
 {
     switch (blend_fact) {
         case PIPE_BLENDFACTOR_ONE:
@@ -113,7 +113,7 @@ static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
 
 /* DSA state. */
 
-static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
+static inline uint32_t r300_translate_depth_stencil_function(int zs_func)
 {
     switch (zs_func) {
         case PIPE_FUNC_NEVER:
@@ -141,7 +141,7 @@ static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
     return 0;
 }
 
-static INLINE uint32_t r300_translate_stencil_op(int s_op)
+static inline uint32_t r300_translate_stencil_op(int s_op)
 {
     switch (s_op) {
         case PIPE_STENCIL_OP_KEEP:
@@ -168,7 +168,7 @@ static INLINE uint32_t r300_translate_stencil_op(int s_op)
     return 0;
 }
 
-static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
+static inline uint32_t r300_translate_alpha_function(int alpha_func)
 {
     switch (alpha_func) {
         case PIPE_FUNC_NEVER:
@@ -195,7 +195,7 @@ static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
     return 0;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 r300_translate_polygon_mode_front(unsigned mode) {
     switch (mode)
     {
@@ -213,7 +213,7 @@ r300_translate_polygon_mode_front(unsigned mode) {
     }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 r300_translate_polygon_mode_back(unsigned mode) {
     switch (mode)
     {
@@ -233,7 +233,7 @@ r300_translate_polygon_mode_back(unsigned mode) {
 
 /* Texture sampler state. */
 
-static INLINE uint32_t r300_translate_wrap(int wrap)
+static inline uint32_t r300_translate_wrap(int wrap)
 {
     switch (wrap) {
         case PIPE_TEX_WRAP_REPEAT:
@@ -259,7 +259,7 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
     }
 }
 
-static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+static inline uint32_t r300_translate_tex_filters(int min, int mag, int mip,
                                                   boolean is_anisotropic)
 {
     uint32_t retval = 0;
@@ -308,7 +308,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
     return retval;
 }
 
-static INLINE uint32_t r300_anisotropy(unsigned max_aniso)
+static inline uint32_t r300_anisotropy(unsigned max_aniso)
 {
     if (max_aniso >= 16) {
         return R300_TX_MAX_ANISO_16_TO_1;
@@ -323,7 +323,7 @@ static INLINE uint32_t r300_anisotropy(unsigned max_aniso)
     }
 }
 
-static INLINE uint32_t r500_anisotropy(unsigned max_aniso)
+static inline uint32_t r500_anisotropy(unsigned max_aniso)
 {
     if (!max_aniso) {
         return 0;
@@ -336,7 +336,7 @@ static INLINE uint32_t r500_anisotropy(unsigned max_aniso)
 }
 
 /* Translate pipe_formats into PSC vertex types. */
-static INLINE uint16_t
+static inline uint16_t
 r300_translate_vertex_data_type(enum pipe_format format) {
     uint32_t result = 0;
     const struct util_format_description *desc;
@@ -410,7 +410,7 @@ r300_translate_vertex_data_type(enum pipe_format format) {
     return result;
 }
 
-static INLINE uint16_t
+static inline uint16_t
 r300_translate_vertex_data_swizzle(enum pipe_format format) {
     const struct util_format_description *desc;
     unsigned i, swizzle = 0;
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index b87164ba836..bc0835d1875 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -41,7 +41,7 @@ struct r300_transfer {
 };
 
 /* Convenience cast wrapper. */
-static INLINE struct r300_transfer*
+static inline struct r300_transfer*
 r300_transfer(struct pipe_transfer* transfer)
 {
     return (struct r300_transfer*)transfer;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 4ddbc0beba5..7065af94e7c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -32,7 +32,7 @@
 #include "evergreen_compute.h"
 #include "util/u_math.h"
 
-static INLINE unsigned evergreen_array_mode(unsigned mode)
+static inline unsigned evergreen_array_mode(unsigned mode)
 {
 	switch (mode) {
 	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_028C70_ARRAY_LINEAR_ALIGNED;
diff --git a/src/gallium/drivers/r600/r600_formats.h b/src/gallium/drivers/r600/r600_formats.h
index fa374d92e6f..9533aaa1378 100644
--- a/src/gallium/drivers/r600/r600_formats.h
+++ b/src/gallium/drivers/r600/r600_formats.h
@@ -64,7 +64,7 @@
 #define     ENDIAN_8IN32                    2
 #define     ENDIAN_8IN64                    3
 
-static INLINE unsigned r600_endian_swap(unsigned size)
+static inline unsigned r600_endian_swap(unsigned size)
 {
 	if (R600_BIG_ENDIAN) {
 		switch (size) {
@@ -82,7 +82,7 @@ static INLINE unsigned r600_endian_swap(unsigned size)
 	}
 }
 
-static INLINE bool r600_is_vertex_format_supported(enum pipe_format format)
+static inline bool r600_is_vertex_format_supported(enum pipe_format format)
 {
 	const struct util_format_description *desc = util_format_description(format);
 	unsigned i;
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 4ea270d3839..991fda894ac 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -490,7 +490,7 @@ struct r600_context {
 	struct r600_isa		*isa;
 };
 
-static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
+static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
 					    struct r600_command_buffer *cb)
 {
 	assert(cs->cdw + cb->num_dw <= RADEON_MAX_CMDBUF_DWORDS);
@@ -500,7 +500,7 @@ static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
 
 void r600_trace_emit(struct r600_context *rctx);
 
-static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
+static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
 {
 	atom->emit(&rctx->b, atom);
 	atom->dirty = false;
@@ -509,13 +509,13 @@ static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *a
 	}
 }
 
-static INLINE void r600_set_cso_state(struct r600_cso_state *state, void *cso)
+static inline void r600_set_cso_state(struct r600_cso_state *state, void *cso)
 {
 	state->cso = cso;
 	state->atom.dirty = cso != NULL;
 }
 
-static INLINE void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso,
+static inline void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso,
 					      struct r600_command_buffer *cb)
 {
 	state->cb = cb;
@@ -719,19 +719,19 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 /*Evergreen Compute packet3*/
 #define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE)
 
-static INLINE void r600_store_value(struct r600_command_buffer *cb, unsigned value)
+static inline void r600_store_value(struct r600_command_buffer *cb, unsigned value)
 {
 	cb->buf[cb->num_dw++] = value;
 }
 
-static INLINE void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr)
+static inline void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr)
 {
 	assert(cb->num_dw+num <= cb->max_num_dw);
 	memcpy(&cb->buf[cb->num_dw], ptr, num * sizeof(ptr[0]));
 	cb->num_dw += num;
 }
 
-static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -743,7 +743,7 @@ static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, uns
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET && reg < R600_CTL_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -755,7 +755,7 @@ static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, un
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -763,7 +763,7 @@ static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsi
 	cb->buf[cb->num_dw++] = (reg - R600_CTL_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_LOOP_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -775,7 +775,7 @@ static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, uns
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= EG_LOOP_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -783,31 +783,31 @@ static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsig
 	cb->buf[cb->num_dw++] = (reg - EG_LOOP_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_config_reg_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_context_reg_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_ctl_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_loop_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	eg_store_loop_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
@@ -816,14 +816,14 @@ static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned
 void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw);
 void r600_release_command_buffer(struct r600_command_buffer *cb);
 
-static INLINE void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	r600_write_context_reg_seq(cs, reg, num);
 	/* Set the compute bit on the packet header */
 	cs->buf[cs->cdw - 2] |= RADEON_CP_PACKET3_COMPUTE_MODE;
 }
 
-static INLINE void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
 	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
@@ -831,13 +831,13 @@ static INLINE void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigne
 	cs->buf[cs->cdw++] = (reg - R600_CTL_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_compute_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag)
+static inline void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag)
 {
 	if (flag & RADEON_CP_PACKET3_COMPUTE_MODE) {
 		r600_write_compute_context_reg(cs, reg, value);
@@ -846,7 +846,7 @@ static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsi
 	}
 }
 
-static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_ctl_const_seq(cs, reg, 1);
 	radeon_emit(cs, value);
@@ -855,21 +855,21 @@ static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned re
 /*
  * common helpers
  */
-static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits)
+static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
 	return value * (1 << frac_bits);
 }
 #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
-static INLINE unsigned r600_pack_float_12p4(float x)
+static inline unsigned r600_pack_float_12p4(float x)
 {
 	return x <= 0    ? 0 :
 	       x >= 4096 ? 0xffff : x * 16;
 }
 
 /* Return if the depth format can be read without the DB->CB copy on r6xx-r7xx. */
-static INLINE bool r600_can_read_depth(struct r600_texture *rtex)
+static inline bool r600_can_read_depth(struct r600_texture *rtex)
 {
 	return rtex->resource.b.b.nr_samples <= 1 &&
 	       (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM ||
@@ -880,7 +880,7 @@ static INLINE bool r600_can_read_depth(struct r600_texture *rtex)
 #define     V_028A6C_OUTPRIM_TYPE_LINESTRIP            1
 #define     V_028A6C_OUTPRIM_TYPE_TRISTRIP             2
 
-static INLINE unsigned r600_conv_prim_to_gs_out(unsigned mode)
+static inline unsigned r600_conv_prim_to_gs_out(unsigned mode)
 {
 	static const int prim_conv[] = {
 		V_028A6C_OUTPRIM_TYPE_POINTLIST,
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 960dfcedfef..c28a1e14f0f 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2800,7 +2800,7 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 	}
 }
 
-static INLINE unsigned r600_array_mode(unsigned mode)
+static inline unsigned r600_array_mode(unsigned mode)
 {
 	switch (mode) {
 	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_0280A0_ARRAY_LINEAR_ALIGNED;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 13dc9ee8c10..26184e0b517 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -694,7 +694,7 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 }
 
 /* Compute the key for the hw shader variant */
-static INLINE struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
+static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
 		struct r600_pipe_shader_selector * sel)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index b51eebbc68e..d6d4c88fbf6 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -33,7 +33,7 @@
 #include "r600_pipe_common.h"
 #include "r600d_common.h"
 
-static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
+static inline unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
 					     struct r600_ring *ring,
 					     struct r600_resource *rbo,
 					     enum radeon_bo_usage usage,
@@ -59,7 +59,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
 				      rbo->domains, priority) * 4;
 }
 
-static INLINE void r600_emit_reloc(struct r600_common_context *rctx,
+static inline void r600_emit_reloc(struct r600_common_context *rctx,
 				   struct r600_ring *ring, struct r600_resource *rbo,
 				   enum radeon_bo_usage usage,
 				   enum radeon_bo_priority priority)
@@ -74,7 +74,7 @@ static INLINE void r600_emit_reloc(struct r600_common_context *rctx,
 	}
 }
 
-static INLINE void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
 	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
@@ -82,13 +82,13 @@ static INLINE void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsign
 	radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2);
 }
 
-static INLINE void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_config_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET);
 	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
@@ -96,13 +96,13 @@ static INLINE void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsig
 	radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2);
 }
 
-static INLINE void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
 	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
@@ -110,13 +110,13 @@ static INLINE void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
-static INLINE void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	si_write_sh_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
 	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
@@ -124,7 +124,7 @@ static INLINE void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsign
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
-static INLINE void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	cik_write_uconfig_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a471426ea27..f6d0380c35f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -550,12 +550,12 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 
 /* Inline helpers. */
 
-static INLINE struct r600_resource *r600_resource(struct pipe_resource *r)
+static inline struct r600_resource *r600_resource(struct pipe_resource *r)
 {
 	return (struct r600_resource*)r;
 }
 
-static INLINE void
+static inline void
 r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
 {
 	pipe_resource_reference((struct pipe_resource **)ptr,
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 48342c063c1..dabee534fe3 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -669,12 +669,12 @@ struct radeon_winsys {
 };
 
 
-static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
+static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
 {
     cs->buf[cs->cdw++] = value;
 }
 
-static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs,
+static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
 				     const uint32_t *values, unsigned count)
 {
     memcpy(cs->buf+cs->cdw, values, count * 4);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 67cb0357231..266eeef61e9 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -294,7 +294,7 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
  * common helpers
  */
 
-static INLINE struct r600_resource *
+static inline struct r600_resource *
 si_resource_create_custom(struct pipe_screen *screen,
 			  unsigned usage, unsigned size)
 {
@@ -303,7 +303,7 @@ si_resource_create_custom(struct pipe_screen *screen,
 		PIPE_BIND_CUSTOM, usage, size));
 }
 
-static INLINE void
+static inline void
 si_invalidate_draw_sh_constants(struct si_context *sctx)
 {
 	sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index a842d9db798..78be4d9baf2 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -335,7 +335,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 }
 
 /* Compute the key for the hw shader variant */
-static INLINE void si_shader_selector_key(struct pipe_context *ctx,
+static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
 					  union si_shader_key *key)
 {
diff --git a/src/gallium/drivers/rbug/rbug_context.h b/src/gallium/drivers/rbug/rbug_context.h
index 5e7b9d4dee4..e99f6edc523 100644
--- a/src/gallium/drivers/rbug/rbug_context.h
+++ b/src/gallium/drivers/rbug/rbug_context.h
@@ -79,7 +79,7 @@ struct rbug_context {
    struct rbug_list shaders;
 };
 
-static INLINE struct rbug_context *
+static inline struct rbug_context *
 rbug_context(struct pipe_context *pipe)
 {
    return (struct rbug_context *)pipe;
diff --git a/src/gallium/drivers/rbug/rbug_objects.h b/src/gallium/drivers/rbug/rbug_objects.h
index 3fba3334228..02973e07996 100644
--- a/src/gallium/drivers/rbug/rbug_objects.h
+++ b/src/gallium/drivers/rbug/rbug_objects.h
@@ -93,7 +93,7 @@ struct rbug_transfer
 };
 
 
-static INLINE struct rbug_resource *
+static inline struct rbug_resource *
 rbug_resource(struct pipe_resource *_resource)
 {
    if (!_resource)
@@ -102,7 +102,7 @@ rbug_resource(struct pipe_resource *_resource)
    return (struct rbug_resource *)_resource;
 }
 
-static INLINE struct rbug_sampler_view *
+static inline struct rbug_sampler_view *
 rbug_sampler_view(struct pipe_sampler_view *_sampler_view)
 {
    if (!_sampler_view)
@@ -111,7 +111,7 @@ rbug_sampler_view(struct pipe_sampler_view *_sampler_view)
    return (struct rbug_sampler_view *)_sampler_view;
 }
 
-static INLINE struct rbug_surface *
+static inline struct rbug_surface *
 rbug_surface(struct pipe_surface *_surface)
 {
    if (!_surface)
@@ -120,7 +120,7 @@ rbug_surface(struct pipe_surface *_surface)
    return (struct rbug_surface *)_surface;
 }
 
-static INLINE struct rbug_transfer *
+static inline struct rbug_transfer *
 rbug_transfer(struct pipe_transfer *_transfer)
 {
    if (!_transfer)
@@ -129,7 +129,7 @@ rbug_transfer(struct pipe_transfer *_transfer)
    return (struct rbug_transfer *)_transfer;
 }
 
-static INLINE struct rbug_shader *
+static inline struct rbug_shader *
 rbug_shader(void *_state)
 {
    if (!_state)
@@ -137,7 +137,7 @@ rbug_shader(void *_state)
    return (struct rbug_shader *)_state;
 }
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 rbug_resource_unwrap(struct pipe_resource *_resource)
 {
    if (!_resource)
@@ -145,7 +145,7 @@ rbug_resource_unwrap(struct pipe_resource *_resource)
    return rbug_resource(_resource)->resource;
 }
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
 {
    if (!_sampler_view)
@@ -153,7 +153,7 @@ rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
    return rbug_sampler_view(_sampler_view)->sampler_view;
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 rbug_surface_unwrap(struct pipe_surface *_surface)
 {
    if (!_surface)
@@ -161,7 +161,7 @@ rbug_surface_unwrap(struct pipe_surface *_surface)
    return rbug_surface(_surface)->surface;
 }
 
-static INLINE struct pipe_transfer *
+static inline struct pipe_transfer *
 rbug_transfer_unwrap(struct pipe_transfer *_transfer)
 {
    if (!_transfer)
@@ -169,7 +169,7 @@ rbug_transfer_unwrap(struct pipe_transfer *_transfer)
    return rbug_transfer(_transfer)->transfer;
 }
 
-static INLINE void *
+static inline void *
 rbug_shader_unwrap(void *_state)
 {
    struct rbug_shader *shader;
diff --git a/src/gallium/drivers/rbug/rbug_screen.h b/src/gallium/drivers/rbug/rbug_screen.h
index a53afac05e9..fd92374beda 100644
--- a/src/gallium/drivers/rbug/rbug_screen.h
+++ b/src/gallium/drivers/rbug/rbug_screen.h
@@ -60,7 +60,7 @@ struct rbug_screen
    struct rbug_list transfers;
 };
 
-static INLINE struct rbug_screen *
+static inline struct rbug_screen *
 rbug_screen(struct pipe_screen *screen)
 {
    return (struct rbug_screen *)screen;
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 50a73369c1d..577df814b29 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -203,7 +203,7 @@ struct softpipe_context {
 };
 
 
-static INLINE struct softpipe_context *
+static inline struct softpipe_context *
 softpipe_context( struct pipe_context *pipe )
 {
    return (struct softpipe_context *)pipe;
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 369ab6ed8d4..89411777ec9 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -52,7 +52,7 @@ struct sp_exec_fragment_shader
 
 
 /** cast wrapper */
-static INLINE struct sp_exec_fragment_shader *
+static inline struct sp_exec_fragment_shader *
 sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
 {
    return (struct sp_exec_fragment_shader *) var;
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 18eca611669..f8a3eacdb37 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -145,7 +145,7 @@ sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 }
 
 
-static INLINE cptrf4 get_vert( const void *vertex_buffer,
+static inline cptrf4 get_vert( const void *vertex_buffer,
                                int index,
                                int stride )
 {
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index a32bd7fd241..5b458450cd8 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -63,7 +63,7 @@ struct blend_quad_stage
 
 
 /** cast wrapper */
-static INLINE struct blend_quad_stage *
+static inline struct blend_quad_stage *
 blend_quad_stage(struct quad_stage *stage)
 {
    return (struct blend_quad_stage *) stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 82c58d04527..395bc70f2cf 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -56,7 +56,7 @@ struct quad_shade_stage
 
 
 /** cast wrapper */
-static INLINE struct quad_shade_stage *
+static inline struct quad_shade_stage *
 quad_shade_stage(struct quad_stage *qs)
 {
    return (struct quad_shade_stage *) qs;
@@ -67,7 +67,7 @@ quad_shade_stage(struct quad_stage *qs)
  * Execute fragment shader for the four fragments in the quad.
  * \return TRUE if quad is alive, FALSE if all four pixels are killed
  */
-static INLINE boolean
+static inline boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct softpipe_context *softpipe = qs->softpipe;
diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h
index d39e9f48e80..f0e929111c2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.h
+++ b/src/gallium/drivers/softpipe/sp_screen.h
@@ -49,7 +49,7 @@ struct softpipe_screen {
    boolean use_llvm;
 };
 
-static INLINE struct softpipe_screen *
+static inline struct softpipe_screen *
 softpipe_screen( struct pipe_screen *pipe )
 {
    return (struct softpipe_screen *)pipe;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 6704015112b..ff3cb9fe5e1 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -125,7 +125,7 @@ struct setup_context {
 /**
  * Clip setup->quad against the scissor/surface bounds.
  */
-static INLINE void
+static inline void
 quad_clip(struct setup_context *setup, struct quad_header *quad)
 {
    const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
@@ -156,7 +156,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad)
 /**
  * Emit a quad (pass to next stage) with clipping.
  */
-static INLINE void
+static inline void
 clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
 {
    quad_clip( setup, quad );
@@ -178,14 +178,14 @@ clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int
+static inline int
 block(int x)
 {
    return x & ~(2-1);
 }
 
 
-static INLINE int
+static inline int
 block_x(int x)
 {
    return x & ~(16-1);
@@ -1039,7 +1039,7 @@ setup_line_coefficients(struct setup_context *setup,
 /**
  * Plot a pixel in a line segment.
  */
-static INLINE void
+static inline void
 plot(struct setup_context *setup, int x, int y)
 {
    const int iy = y & 1;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 1010b63de2c..565fca632c6 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -58,7 +58,7 @@
  * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
  */
-static INLINE float
+static inline float
 frac(float f)
 {
    return f - floorf(f);
@@ -69,7 +69,7 @@ frac(float f)
 /**
  * Linear interpolation macro
  */
-static INLINE float
+static inline float
 lerp(float a, float v0, float v1)
 {
    return v0 + a * (v1 - v0);
@@ -84,7 +84,7 @@ lerp(float a, float v0, float v1)
  * optimization!  If we find that's not true on some systems, convert
  * to a macro.
  */
-static INLINE float
+static inline float
 lerp_2d(float a, float b,
         float v00, float v10, float v01, float v11)
 {
@@ -97,7 +97,7 @@ lerp_2d(float a, float b,
 /**
  * As above, but 3D interpolation of 8 values.
  */
-static INLINE float
+static inline float
 lerp_3d(float a, float b, float c,
         float v000, float v100, float v010, float v110,
         float v001, float v101, float v011, float v111)
@@ -115,7 +115,7 @@ lerp_3d(float a, float b, float c,
  * value.  To avoid that problem we add a large multiple of the size
  * (rather than using a conditional).
  */
-static INLINE int
+static inline int
 repeat(int coord, unsigned size)
 {
    return (coord + size * 1024) % size;
@@ -486,7 +486,7 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset,
 /**
  * Do coordinate to array index conversion.  For array textures.
  */
-static INLINE int
+static inline int
 coord_to_layer(float coord, unsigned first_layer, unsigned last_layer)
 {
    int c = util_ifloor(coord + 0.5F);
@@ -587,7 +587,7 @@ compute_lambda_vert(const struct sp_sampler_view *sview,
 
 
 
-static INLINE const float *
+static inline const float *
 get_texel_2d_no_border(const struct sp_sampler_view *sp_sview,
                        union tex_tile_address addr, int x, int y)
 {
@@ -603,7 +603,7 @@ get_texel_2d_no_border(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_2d(const struct sp_sampler_view *sp_sview,
              const struct sp_sampler *sp_samp,
              union tex_tile_address addr, int x, int y)
@@ -695,7 +695,7 @@ static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = {
      PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y }
 };
 
-static INLINE unsigned
+static inline unsigned
 get_next_face(unsigned face, int idx)
 {
    return face_array[face][idx];
@@ -705,7 +705,7 @@ get_next_face(unsigned face, int idx)
  * return a new xcoord based on old face, old coords, cube size
  * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+)
  */
-static INLINE int
+static inline int
 get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 {
    if ((face == 0 && fall_off_index != 1) ||
@@ -743,7 +743,7 @@ get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
  * return a new ycoord based on old face, old coords, cube size
  * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+)
  */
-static INLINE int
+static inline int
 get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 {
    if ((fall_off_index <= 1) && (face <= 1 || face >= 4)) {
@@ -771,7 +771,7 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 
 /* Gather a quad of adjacent texels within a tile:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview,
                                         union tex_tile_address addr,
                                         unsigned x, unsigned y,
@@ -795,7 +795,7 @@ get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview,
 
 /* Gather a quad of potentially non-adjacent texels:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview,
                             union tex_tile_address addr,
                             int x0, int y0,
@@ -810,7 +810,7 @@ get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview,
 
 /* Can involve a lot of unnecessary checks for border color:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d(const struct sp_sampler_view *sp_sview,
                   const struct sp_sampler *sp_samp,
                   union tex_tile_address addr,
@@ -828,7 +828,7 @@ get_texel_quad_2d(const struct sp_sampler_view *sp_sview,
 
 /* 3d variants:
  */
-static INLINE const float *
+static inline const float *
 get_texel_3d_no_border(const struct sp_sampler_view *sp_sview,
                        union tex_tile_address addr, int x, int y, int z)
 {
@@ -846,7 +846,7 @@ get_texel_3d_no_border(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_3d(const struct sp_sampler_view *sp_sview,
              const struct sp_sampler *sp_samp,
              union tex_tile_address addr, int x, int y, int z)
@@ -866,7 +866,7 @@ get_texel_3d(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for 1D array texture */
-static INLINE const float *
+static inline const float *
 get_texel_1d_array(const struct sp_sampler_view *sp_sview,
                    const struct sp_sampler *sp_samp,
                    union tex_tile_address addr, int x, int y)
@@ -884,7 +884,7 @@ get_texel_1d_array(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for 2D array texture */
-static INLINE const float *
+static inline const float *
 get_texel_2d_array(const struct sp_sampler_view *sp_sview,
                    const struct sp_sampler *sp_samp,
                    union tex_tile_address addr, int x, int y, int layer)
@@ -905,7 +905,7 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
                         union tex_tile_address addr, int x, int y,
                         float *corner, int layer, unsigned face)
@@ -960,7 +960,7 @@ get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for cube array texture */
-static INLINE const float *
+static inline const float *
 get_texel_cube_array(const struct sp_sampler_view *sp_sview,
                      const struct sp_sampler *sp_samp,
                      union tex_tile_address addr, int x, int y, int layer)
@@ -986,7 +986,7 @@ get_texel_cube_array(const struct sp_sampler_view *sp_sview,
  * If level = 2, then we'll return 64 (the width at level=2).
  * Return 1 if level > base_pot.
  */
-static INLINE unsigned
+static inline unsigned
 pot_level_size(unsigned base_pot, unsigned level)
 {
    return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
@@ -1016,7 +1016,7 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ
 
 /* Some image-filter fastpaths:
  */
-static INLINE void
+static inline void
 img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
                                 const struct img_filter_args *args,
@@ -1070,7 +1070,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE void
+static inline void
 img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
                                  struct sp_sampler *sp_samp,
                                  const struct img_filter_args *args,
@@ -1104,7 +1104,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE void
+static inline void
 img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
                                 const struct img_filter_args *args,
@@ -1819,7 +1819,7 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview,
  * \param lod_in per-fragment lod_bias or explicit_lod.
  * \param lod returns the per-fragment lod.
  */
-static INLINE void
+static inline void
 compute_lod(const struct pipe_sampler_state *sampler,
             enum tgsi_sampler_control control,
             const float biased_lambda,
@@ -1859,7 +1859,7 @@ compute_lod(const struct pipe_sampler_state *sampler,
  * \param lod_in per-fragment lod_bias or explicit_lod.
  * \param lod results per-fragment lod.
  */
-static INLINE void
+static inline void
 compute_lambda_lod(struct sp_sampler_view *sp_sview,
                    struct sp_sampler *sp_samp,
                    const float s[TGSI_QUAD_SIZE],
@@ -1906,7 +1906,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 get_gather_component(const float lod_in[TGSI_QUAD_SIZE])
 {
    /* gather component is stored in lod_in slot as unsigned */
@@ -2789,7 +2789,7 @@ get_linear_wrap(unsigned mode)
 /**
  * Is swizzling needed for the given state key?
  */
-static INLINE bool
+static inline bool
 any_swizzle(const struct pipe_sampler_view *view)
 {
    return (view->swizzle_r != PIPE_SWIZZLE_RED ||
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index 4a421a8f882..21f38b2f859 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -185,7 +185,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
  * This is basically a direct-map cache.
  * XXX There's probably lots of ways in which we can improve this.
  */
-static INLINE uint
+static inline uint
 tex_cache_pos( union tex_tile_address addr )
 {
    uint entry = (addr.bits.x + 
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
index 2233effc439..b7ad222d715 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -127,7 +127,7 @@ extern const struct softpipe_tex_cached_tile *
 sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
                         union tex_tile_address addr );
 
-static INLINE union tex_tile_address
+static inline union tex_tile_address
 tex_tile_address( unsigned x,
                   unsigned y,
                   unsigned z,
@@ -147,7 +147,7 @@ tex_tile_address( unsigned x,
 
 /* Quickly retrieve tile if it matches last lookup.
  */
-static INLINE const struct softpipe_tex_cached_tile *
+static inline const struct softpipe_tex_cached_tile *
 sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
                        union tex_tile_address addr )
 {
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 1701bf574d9..fbf741a9c72 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -81,13 +81,13 @@ struct softpipe_transfer
 
 
 /** cast wrappers */
-static INLINE struct softpipe_resource *
+static inline struct softpipe_resource *
 softpipe_resource(struct pipe_resource *pt)
 {
    return (struct softpipe_resource *) pt;
 }
 
-static INLINE struct softpipe_transfer *
+static inline struct softpipe_transfer *
 softpipe_transfer(struct pipe_transfer *pt)
 {
    return (struct softpipe_transfer *) pt;
@@ -99,7 +99,7 @@ softpipe_transfer(struct pipe_transfer *pt)
  * This is a short-cut instead of using map()/unmap(), which should
  * probably be fixed.
  */
-static INLINE void *
+static inline void *
 softpipe_resource_data(struct pipe_resource *pt)
 {
    if (!pt)
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index b763f526e61..9cc8ac12525 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -52,7 +52,7 @@ sp_alloc_tile(struct softpipe_tile_cache *tc);
    (((x) + (y) * 5 + (l) * 10) % NUM_ENTRIES)
 
 
-static INLINE int addr_to_clear_pos(union tile_address addr)
+static inline int addr_to_clear_pos(union tile_address addr)
 {
    int pos;
    pos = addr.bits.layer * (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE);
@@ -63,7 +63,7 @@ static INLINE int addr_to_clear_pos(union tile_address addr)
 /**
  * Is the tile at (x,y) in cleared state?
  */
-static INLINE uint
+static inline uint
 is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max)
 {
    int pos, bit;
@@ -77,7 +77,7 @@ is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max)
 /**
  * Mark the tile at (x,y) as not cleared.
  */
-static INLINE void
+static inline void
 clear_clear_flag(uint *bitvec, union tile_address addr, unsigned max)
 {
    int pos;
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 167e1ffcada..2c0bafad651 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -128,7 +128,7 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc,
                     union tile_address addr );
 
 
-static INLINE union tile_address
+static inline union tile_address
 tile_address( unsigned x,
               unsigned y, unsigned layer )
 {
@@ -143,7 +143,7 @@ tile_address( unsigned x,
 
 /* Quickly retrieve tile if it matches last lookup.
  */
-static INLINE struct softpipe_cached_tile *
+static inline struct softpipe_cached_tile *
 sp_get_cached_tile(struct softpipe_tile_cache *tc, 
                    int x, int y, int layer )
 {
diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
index 355edfdb702..5e00906ce36 100644
--- a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
+++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
@@ -507,7 +507,7 @@ static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1;
  *----------------------------------------------------------------------
  */
 
-static INLINE SVGA3dShaderRegType
+static inline SVGA3dShaderRegType
 SVGA3dShaderGetRegType(uint32 token)
 {
    SVGA3dShaderSrcToken src;
diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h
index 0f242dd402c..ccbf7912e6d 100644
--- a/src/gallium/drivers/svga/include/svga_overlay.h
+++ b/src/gallium/drivers/svga/include/svga_overlay.h
@@ -133,7 +133,7 @@ struct {
  *----------------------------------------------------------------------
  */
 
-static INLINE Bool
+static inline Bool
 VMwareVideoGetAttributes(const SVGAOverlayFormat format,    // IN
                          uint32 *width,                     // IN / OUT
                          uint32 *height,                    // IN / OUT
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 474b75c3c86..b271832171d 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -57,7 +57,7 @@
  *----------------------------------------------------------------------
  */
 
-static INLINE void
+static inline void
 surface_to_surfaceid(struct svga_winsys_context *swc, // IN
                      struct pipe_surface *surface,    // IN
                      SVGA3dSurfaceImageId *id,        // OUT
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 630f5f77d66..71f038df8c1 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -485,20 +485,20 @@ svga_context_create(struct pipe_screen *screen,
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
  */
-static INLINE struct svga_context *
+static inline struct svga_context *
 svga_context( struct pipe_context *pipe )
 {
    return (struct svga_context *)pipe;
 }
 
 
-static INLINE boolean
+static inline boolean
 svga_have_gb_objects(const struct svga_context *svga)
 {
    return svga_screen(svga->pipe.screen)->sws->have_gb_objects;
 }
 
-static INLINE boolean
+static inline boolean
 svga_have_gb_dma(const struct svga_context *svga)
 {
    return svga_screen(svga->pipe.screen)->sws->have_gb_dma;
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
index 3a3fcd8fae2..82c9b602d5d 100644
--- a/src/gallium/drivers/svga/svga_debug.h
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -53,7 +53,7 @@ extern int SVGA_DEBUG;
 #define DBSTR(x) ""
 #endif
 
-static INLINE void
+static inline void
 SVGA_DBG( unsigned flag, const char *fmt, ... )
 {
 #ifdef DEBUG 
diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h
index 1b054038e9f..9ab87e8259a 100644
--- a/src/gallium/drivers/svga/svga_draw_private.h
+++ b/src/gallium/drivers/svga/svga_draw_private.h
@@ -57,7 +57,7 @@ static const unsigned svga_hw_prims =
  * PIPE_PRIM_QUADS, PIPE_PRIM_QUAD_STRIP or PIPE_PRIM_POLYGON.  We convert
  * those to other types of primitives with index/translation code.
  */
-static INLINE unsigned
+static inline unsigned
 svga_translate_prim(unsigned mode, unsigned vcount,unsigned *prim_count)
 {
    switch (mode) {
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index 594eec7166e..2890516c0cf 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -33,7 +33,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_blend_factor(unsigned factor)
 {
    switch (factor) {
@@ -58,7 +58,7 @@ svga_translate_blend_factor(unsigned factor)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_blend_func(unsigned mode)
 {
    switch (mode) {
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
index cb07dbe09a3..8db21fd7476 100644
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -32,7 +32,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_compare_func(unsigned func)
 {
    switch (func) {
@@ -50,7 +50,7 @@ svga_translate_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_stencil_op(unsigned op)
 {
    switch (op) {
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index a97a9c46cf8..208a2cd14bf 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -59,7 +59,7 @@ struct svga_query {
 
 
 /** cast wrapper */
-static INLINE struct svga_query *
+static inline struct svga_query *
 svga_query( struct pipe_query *q )
 {
    return (struct svga_query *)q;
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 8a87bb467aa..effd490dd22 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -35,7 +35,7 @@
 
 #include "svga_debug.h"
 
-static INLINE unsigned
+static inline unsigned
 translate_wrap_mode(unsigned wrap)
 {
    switch (wrap) {
@@ -68,7 +68,7 @@ translate_wrap_mode(unsigned wrap)
    }
 }
 
-static INLINE unsigned translate_img_filter( unsigned filter )
+static inline unsigned translate_img_filter( unsigned filter )
 {
    switch (filter) {
    case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST;
@@ -79,7 +79,7 @@ static INLINE unsigned translate_img_filter( unsigned filter )
    }
 }
 
-static INLINE unsigned translate_mip_filter( unsigned filter )
+static inline unsigned translate_mip_filter( unsigned filter )
 {
    switch (filter) {
    case PIPE_TEX_MIPFILTER_NONE:    return SVGA3D_TEX_FILTER_NONE;
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index d2c7762e7ff..13f85cddbd5 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -45,7 +45,7 @@
  * Vertex and index buffers need hardware backing.  Constant buffers
  * do not.  No other types of buffers currently supported.
  */
-static INLINE boolean
+static inline boolean
 svga_buffer_needs_hw_storage(unsigned usage)
 {
    return usage & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER);
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
index 83b3d342aec..e838beb6661 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -190,7 +190,7 @@ struct svga_buffer
 };
 
 
-static INLINE struct svga_buffer *
+static inline struct svga_buffer *
 svga_buffer(struct pipe_resource *buffer)
 {
    if (buffer) {
@@ -205,7 +205,7 @@ svga_buffer(struct pipe_resource *buffer)
  * Returns TRUE for user buffers.  We may
  * decide to use an alternate upload path for these buffers.
  */
-static INLINE boolean 
+static inline boolean 
 svga_buffer_is_user_buffer( struct pipe_resource *buffer )
 {
    if (buffer) {
@@ -219,7 +219,7 @@ svga_buffer_is_user_buffer( struct pipe_resource *buffer )
  * Returns a pointer to a struct svga_winsys_screen given a
  * struct svga_buffer.
  */
-static INLINE struct svga_winsys_screen *
+static inline struct svga_winsys_screen *
 svga_buffer_winsys_screen(struct svga_buffer *sbuf)
 {
    return svga_screen(sbuf->b.b.screen)->sws;
@@ -230,7 +230,7 @@ svga_buffer_winsys_screen(struct svga_buffer *sbuf)
  * Returns whether a buffer has hardware storage that is
  * visible to the GPU.
  */
-static INLINE boolean
+static inline boolean
 svga_buffer_has_hw_storage(struct svga_buffer *sbuf)
 {
    if (svga_buffer_winsys_screen(sbuf)->have_gb_objects)
@@ -242,7 +242,7 @@ svga_buffer_has_hw_storage(struct svga_buffer *sbuf)
 /**
  * Map the hardware storage of a buffer.
  */
-static INLINE void *
+static inline void *
 svga_buffer_hw_storage_map(struct svga_context *svga,
                            struct svga_buffer *sbuf,
                            unsigned flags, boolean *retry)
@@ -259,7 +259,7 @@ svga_buffer_hw_storage_map(struct svga_context *svga,
 /**
  * Unmap the hardware storage of a buffer.
  */
-static INLINE void
+static inline void
 svga_buffer_hw_storage_unmap(struct svga_context *svga,
                              struct svga_buffer *sbuf)
 {
diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h
index 1ff42fabab9..19dadfb8828 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.h
+++ b/src/gallium/drivers/svga/svga_resource_texture.h
@@ -106,7 +106,7 @@ struct svga_transfer
 };
 
 
-static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource )
+static inline struct svga_texture *svga_texture( struct pipe_resource *resource )
 {
    struct svga_texture *tex = (struct svga_texture *)resource;
    assert(tex == NULL || tex->b.vtbl == &svga_texture_vtbl);
@@ -114,7 +114,7 @@ static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource
 }
 
 
-static INLINE struct svga_transfer *
+static inline struct svga_transfer *
 svga_transfer(struct pipe_transfer *transfer)
 {
    assert(transfer);
@@ -127,7 +127,7 @@ svga_transfer(struct pipe_transfer *transfer)
  * This is used to track updates to textures when we draw into
  * them via a surface.
  */
-static INLINE void
+static inline void
 svga_age_texture_view(struct svga_texture *tex, unsigned level)
 {
    assert(level < Elements(tex->view_age));
@@ -138,7 +138,7 @@ svga_age_texture_view(struct svga_texture *tex, unsigned level)
 /**
  * Mark the given texture face/level as being defined.
  */
-static INLINE void
+static inline void
 svga_define_texture_level(struct svga_texture *tex,
                           unsigned face,unsigned level)
 {
@@ -148,7 +148,7 @@ svga_define_texture_level(struct svga_texture *tex,
 }
 
 
-static INLINE bool
+static inline bool
 svga_is_texture_level_defined(const struct svga_texture *tex,
                               unsigned face, unsigned level)
 {
@@ -177,7 +177,7 @@ check_face_level(const struct svga_texture *tex,
 }
 
 
-static INLINE void
+static inline void
 svga_set_texture_rendered_to(struct svga_texture *tex,
                              unsigned face, unsigned level)
 {
@@ -186,7 +186,7 @@ svga_set_texture_rendered_to(struct svga_texture *tex,
 }
 
 
-static INLINE void
+static inline void
 svga_clear_texture_rendered_to(struct svga_texture *tex,
                                unsigned face, unsigned level)
 {
@@ -195,7 +195,7 @@ svga_clear_texture_rendered_to(struct svga_texture *tex,
 }
 
 
-static INLINE boolean
+static inline boolean
 svga_was_texture_rendered_to(const struct svga_texture *tex,
                              unsigned face, unsigned level)
 {
diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h
index 2087c1be85e..7f14323f84f 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.h
+++ b/src/gallium/drivers/svga/svga_sampler_view.h
@@ -86,7 +86,7 @@ svga_destroy_sampler_view_priv(struct svga_sampler_view *v);
 void
 svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv);
 
-static INLINE void
+static inline void
 svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v)
 {
    struct svga_sampler_view *old = *ptr;
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index b85191c4b26..ea1e743dfe5 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -82,7 +82,7 @@ struct svga_screen
 
 #ifndef DEBUG
 /** cast wrapper */
-static INLINE struct svga_screen *
+static inline struct svga_screen *
 svga_screen(struct pipe_screen *pscreen)
 {
    return (struct svga_screen *) pscreen;
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
index f63f7836187..3c765394a88 100644
--- a/src/gallium/drivers/svga/svga_screen_cache.c
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -76,7 +76,7 @@ surface_size(const struct svga_host_surface_cache_key *key)
 /**
  * Compute the bucket for this key.
  */
-static INLINE unsigned
+static inline unsigned
 svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key)
 {
    return util_hash_crc32(key, sizeof *key) % SVGA_HOST_SURFACE_CACHE_BUCKETS;
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index fd500ae4401..5102159b96a 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -44,7 +44,7 @@ svga_destroy_shader_variant(struct svga_context *svga,
 /**
  * Check if a shader's bytecode exceeds the device limits.
  */
-static INLINE boolean
+static inline boolean
 svga_shader_too_large(const struct svga_context *svga,
                       const struct svga_shader_variant *variant)
 {
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 566a79407e5..8cdce742b3b 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -41,7 +41,7 @@
 
 
 
-static INLINE int
+static inline int
 compare_fs_keys(const struct svga_fs_compile_key *a,
                 const struct svga_fs_compile_key *b)
 {
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
index fb56b3d36ba..ebb98373e2b 100644
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -61,7 +61,7 @@ do {                                                            \
 } while (0)
 
 
-static INLINE void
+static inline void
 svga_queue_rs( struct rs_queue *q,
                unsigned rss,
                unsigned value )
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 0ab571c0588..41334bd7cb9 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -274,7 +274,7 @@ do {                                                                    \
 } while (0)
 
 
-static INLINE void 
+static inline void 
 svga_queue_tss( struct ts_queue *q,
                 unsigned unit,
                 unsigned tss,
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 545c9d7420f..c2a0f1ee6b1 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -41,7 +41,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE int
+static inline int
 compare_vs_keys(const struct svga_vs_compile_key *a,
                 const struct svga_vs_compile_key *b)
 {
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
index 7b8f6f018d2..2fa72a1c8f0 100644
--- a/src/gallium/drivers/svga/svga_surface.h
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -84,7 +84,7 @@ svga_texture_copy_handle(struct svga_context *svga,
                          unsigned width, unsigned height, unsigned depth);
 
 
-static INLINE struct svga_surface *
+static inline struct svga_surface *
 svga_surface(struct pipe_surface *surface)
 {
    assert(surface);
@@ -92,7 +92,7 @@ svga_surface(struct pipe_surface *surface)
 }
 
 
-static INLINE const struct svga_surface *
+static inline const struct svga_surface *
 svga_surface_const(const struct pipe_surface *surface)
 {
    assert(surface);
diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h
index 608950d7af6..e2106e1e8e6 100644
--- a/src/gallium/drivers/svga/svga_swtnl_private.h
+++ b/src/gallium/drivers/svga/svga_swtnl_private.h
@@ -76,7 +76,7 @@ struct svga_vbuf_render {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct svga_vbuf_render *
+static inline struct svga_vbuf_render *
 svga_vbuf_render( struct vbuf_render *render )
 {
    assert(render);
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index 9aafd851264..2e2ff5e4673 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -84,7 +84,7 @@ svga_shader_expand(struct svga_shader_emitter *emit)
 }
 
 
-static INLINE boolean
+static inline boolean
 reserve(struct svga_shader_emitter *emit, unsigned nr_dwords)
 {
    if (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) {
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index e7a2a134ca5..5c47a4ad39f 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -124,7 +124,7 @@ struct svga_shader_variant
  * The real use of this information is matching vertex elements to
  * fragment shader inputs in the case where vertex shader is disabled.
  */
-static INLINE void svga_generate_vdecl_semantics( unsigned idx,
+static inline void svga_generate_vdecl_semantics( unsigned idx,
                                                   unsigned *usage,
                                                   unsigned *usage_index )
 {
@@ -140,12 +140,12 @@ static INLINE void svga_generate_vdecl_semantics( unsigned idx,
 
 
 
-static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
+static inline unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
 {
    return sizeof *key;
 }
 
-static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
+static inline unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
 {
    return (const char *)&key->tex[key->num_textures] - (const char *)key;
 }
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 1894296e6d7..1a1dac23507 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -167,7 +167,7 @@ svga_translate_decl_sm30(struct svga_shader_emitter *emit,
 
 
 /** Emit the given SVGA3dShaderInstToken opcode */
-static INLINE boolean
+static inline boolean
 emit_instruction(struct svga_shader_emitter *emit,
                  SVGA3dShaderInstToken opcode)
 {
@@ -176,7 +176,7 @@ emit_instruction(struct svga_shader_emitter *emit,
 
 
 /** Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token(unsigned opcode)
 {
    SVGA3dShaderInstToken inst;
@@ -192,7 +192,7 @@ inst_token(unsigned opcode)
  * Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode
  * with the predication flag set.
  */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token_predicated(unsigned opcode)
 {
    SVGA3dShaderInstToken inst;
@@ -209,7 +209,7 @@ inst_token_predicated(unsigned opcode)
  * Generate a SVGA3dShaderInstToken for a SETP instruction (set predicate)
  * using the given comparison operator (one of SVGA3DOPCOMP_xx).
  */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token_setp(unsigned operator)
 {
    SVGA3dShaderInstToken inst;
@@ -227,7 +227,7 @@ inst_token_setp(unsigned operator)
  * Note that this function is used to create tokens for output registers,
  * temp registers AND constants (see emit_def_const()).
  */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 dst_register(unsigned file, int number)
 {
    SVGA3dShaderDestToken dest;
@@ -255,7 +255,7 @@ dst_register(unsigned file, int number)
  * Apply a writemask to the given SVGA3dShaderDestToken, returning a
  * new SVGA3dShaderDestToken.
  */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 writemask(SVGA3dShaderDestToken dest, unsigned mask)
 {
    assert(dest.mask & mask);
@@ -265,7 +265,7 @@ writemask(SVGA3dShaderDestToken dest, unsigned mask)
 
 
 /** Create a SVGA3dShaderSrcToken given a register file and number */
-static INLINE SVGA3dShaderSrcToken
+static inline SVGA3dShaderSrcToken
 src_token(unsigned file, int number)
 {
    SVGA3dShaderSrcToken src;
@@ -289,7 +289,7 @@ src_token(unsigned file, int number)
 
 
 /** Create a src_register given a register file and register number */
-static INLINE struct src_register
+static inline struct src_register
 src_register(unsigned file, int number)
 {
    struct src_register src;
@@ -301,7 +301,7 @@ src_register(unsigned file, int number)
 }
 
 /** Translate src_register into SVGA3dShaderDestToken */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 dst(struct src_register src)
 {
    return dst_register(SVGA3dShaderGetRegType(src.base.value), src.base.num);
@@ -309,7 +309,7 @@ dst(struct src_register src)
 
 
 /** Translate SVGA3dShaderDestToken to a src_register */
-static INLINE struct src_register
+static inline struct src_register
 src(SVGA3dShaderDestToken dst)
 {
    return src_register(SVGA3dShaderGetRegType(dst.value), dst.num);
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h
index 5db64bf135b..0a2e3d5f345 100644
--- a/src/gallium/drivers/svga/svgadump/svga_shader.h
+++ b/src/gallium/drivers/svga/svgadump/svga_shader.h
@@ -56,7 +56,7 @@ struct sh_reg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_reg_type( struct sh_reg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
@@ -138,7 +138,7 @@ struct sh_dstreg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_dstreg_type( struct sh_dstreg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
@@ -169,7 +169,7 @@ struct sh_srcreg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_srcreg_type( struct sh_srcreg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 0013c963e7a..7f6d0645112 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -49,13 +49,13 @@ struct trace_query
 };
 
 
-static INLINE struct trace_query *
+static inline struct trace_query *
 trace_query(struct pipe_query *query) {
    return (struct trace_query *)query;
 }
 
 
-static INLINE struct pipe_query *
+static inline struct pipe_query *
 trace_query_unwrap(struct pipe_query *query)
 {
    if (query) {
@@ -66,7 +66,7 @@ trace_query_unwrap(struct pipe_query *query)
 }
 
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 trace_resource_unwrap(struct trace_context *tr_ctx,
                      struct pipe_resource *resource)
 {
@@ -82,7 +82,7 @@ trace_resource_unwrap(struct trace_context *tr_ctx,
 }
 
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 trace_surface_unwrap(struct trace_context *tr_ctx,
                      struct pipe_surface *surface)
 {
@@ -105,7 +105,7 @@ trace_surface_unwrap(struct trace_context *tr_ctx,
 }
 
 
-static INLINE void
+static inline void
 trace_context_draw_vbo(struct pipe_context *_pipe,
                        const struct pipe_draw_info *info)
 {
@@ -125,7 +125,7 @@ trace_context_draw_vbo(struct pipe_context *_pipe,
 }
 
 
-static INLINE struct pipe_query *
+static inline struct pipe_query *
 trace_context_create_query(struct pipe_context *_pipe,
                            unsigned query_type,
                            unsigned index)
@@ -163,7 +163,7 @@ trace_context_create_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_destroy_query(struct pipe_context *_pipe,
                             struct pipe_query *_query)
 {
@@ -185,7 +185,7 @@ trace_context_destroy_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE boolean
+static inline boolean
 trace_context_begin_query(struct pipe_context *_pipe,
                           struct pipe_query *query)
 {
@@ -207,7 +207,7 @@ trace_context_begin_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_end_query(struct pipe_context *_pipe,
                         struct pipe_query *query)
 {
@@ -227,7 +227,7 @@ trace_context_end_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE boolean
+static inline boolean
 trace_context_get_query_result(struct pipe_context *_pipe,
                                struct pipe_query *_query,
                                boolean wait,
@@ -262,7 +262,7 @@ trace_context_get_query_result(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_blend_state(struct pipe_context *_pipe,
                                  const struct pipe_blend_state *state)
 {
@@ -285,7 +285,7 @@ trace_context_create_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_blend_state(struct pipe_context *_pipe,
                                void *state)
 {
@@ -303,7 +303,7 @@ trace_context_bind_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_blend_state(struct pipe_context *_pipe,
                                  void *state)
 {
@@ -321,7 +321,7 @@ trace_context_delete_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_sampler_state(struct pipe_context *_pipe,
                                    const struct pipe_sampler_state *state)
 {
@@ -344,7 +344,7 @@ trace_context_create_sampler_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_sampler_states(struct pipe_context *_pipe,
                                   unsigned shader,
                                   unsigned start,
@@ -371,7 +371,7 @@ trace_context_bind_sampler_states(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_sampler_state(struct pipe_context *_pipe,
                                    void *state)
 {
@@ -389,7 +389,7 @@ trace_context_delete_sampler_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_rasterizer_state(struct pipe_context *_pipe,
                                       const struct pipe_rasterizer_state *state)
 {
@@ -412,7 +412,7 @@ trace_context_create_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
                                     void *state)
 {
@@ -430,7 +430,7 @@ trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
                                       void *state)
 {
@@ -448,7 +448,7 @@ trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                                const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -471,7 +471,7 @@ trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                              void *state)
 {
@@ -489,7 +489,7 @@ trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                                void *state)
 {
@@ -508,7 +508,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
 
 
 #define TRACE_SHADER_STATE(shader_type) \
-   static INLINE void * \
+   static inline void * \
    trace_context_create_##shader_type##_state(struct pipe_context *_pipe, \
                                  const struct pipe_shader_state *state) \
    { \
@@ -524,7 +524,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
       return result; \
    } \
     \
-   static INLINE void \
+   static inline void \
    trace_context_bind_##shader_type##_state(struct pipe_context *_pipe, \
                                void *state) \
    { \
@@ -537,7 +537,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
       trace_dump_call_end(); \
    } \
     \
-   static INLINE void \
+   static inline void \
    trace_context_delete_##shader_type##_state(struct pipe_context *_pipe, \
                                  void *state) \
    { \
@@ -559,7 +559,7 @@ TRACE_SHADER_STATE(tes)
 #undef TRACE_SHADER_STATE
 
 
-static INLINE void *
+static inline void *
 trace_context_create_vertex_elements_state(struct pipe_context *_pipe,
                                            unsigned num_elements,
                                            const struct  pipe_vertex_element *elements)
@@ -587,7 +587,7 @@ trace_context_create_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_vertex_elements_state(struct pipe_context *_pipe,
                                          void *state)
 {
@@ -605,7 +605,7 @@ trace_context_bind_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_vertex_elements_state(struct pipe_context *_pipe,
                                            void *state)
 {
@@ -623,7 +623,7 @@ trace_context_delete_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_blend_color(struct pipe_context *_pipe,
                               const struct pipe_blend_color *state)
 {
@@ -641,7 +641,7 @@ trace_context_set_blend_color(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_stencil_ref(struct pipe_context *_pipe,
                               const struct pipe_stencil_ref *state)
 {
@@ -659,7 +659,7 @@ trace_context_set_stencil_ref(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_clip_state(struct pipe_context *_pipe,
                              const struct pipe_clip_state *state)
 {
@@ -676,7 +676,7 @@ trace_context_set_clip_state(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_set_sample_mask(struct pipe_context *_pipe,
                               unsigned sample_mask)
 {
@@ -693,7 +693,7 @@ trace_context_set_sample_mask(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_set_constant_buffer(struct pipe_context *_pipe,
                                   uint shader, uint index,
                                   struct pipe_constant_buffer *constant_buffer)
@@ -721,7 +721,7 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_framebuffer_state(struct pipe_context *_pipe,
                                     const struct pipe_framebuffer_state *state)
 {
@@ -751,7 +751,7 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_polygon_stipple(struct pipe_context *_pipe,
                                   const struct pipe_poly_stipple *state)
 {
@@ -769,7 +769,7 @@ trace_context_set_polygon_stipple(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_scissor_states(struct pipe_context *_pipe,
                                  unsigned start_slot,
                                  unsigned num_scissors,
@@ -791,7 +791,7 @@ trace_context_set_scissor_states(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_viewport_states(struct pipe_context *_pipe,
                                   unsigned start_slot,
                                   unsigned num_viewports,
@@ -938,7 +938,7 @@ trace_context_surface_destroy(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_sampler_views(struct pipe_context *_pipe,
                                 unsigned shader,
                                 unsigned start,
@@ -974,7 +974,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_vertex_buffers(struct pipe_context *_pipe,
                                  unsigned start_slot, unsigned num_buffers,
                                  const struct pipe_vertex_buffer *buffers)
@@ -1008,7 +1008,7 @@ trace_context_set_vertex_buffers(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_index_buffer(struct pipe_context *_pipe,
                                const struct pipe_index_buffer *ib)
 {
@@ -1033,7 +1033,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe,
 }
 
 
-static INLINE struct pipe_stream_output_target *
+static inline struct pipe_stream_output_target *
 trace_context_create_stream_output_target(struct pipe_context *_pipe,
                                           struct pipe_resource *res,
                                           unsigned buffer_offset,
@@ -1063,7 +1063,7 @@ trace_context_create_stream_output_target(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_stream_output_target_destroy(
    struct pipe_context *_pipe,
    struct pipe_stream_output_target *target)
@@ -1082,7 +1082,7 @@ trace_context_stream_output_target_destroy(
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_stream_output_targets(struct pipe_context *_pipe,
                                         unsigned num_targets,
                                         struct pipe_stream_output_target **tgs,
@@ -1104,7 +1104,7 @@ trace_context_set_stream_output_targets(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_resource_copy_region(struct pipe_context *_pipe,
                                    struct pipe_resource *dst,
                                    unsigned dst_level,
@@ -1139,7 +1139,7 @@ trace_context_resource_copy_region(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_blit(struct pipe_context *_pipe,
                    const struct pipe_blit_info *_info)
 {
@@ -1181,7 +1181,7 @@ trace_context_flush_resource(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_clear(struct pipe_context *_pipe,
                     unsigned buffers,
                     const union pipe_color_union *color,
@@ -1210,7 +1210,7 @@ trace_context_clear(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_clear_render_target(struct pipe_context *_pipe,
                                   struct pipe_surface *dst,
                                   const union pipe_color_union *color,
@@ -1237,7 +1237,7 @@ trace_context_clear_render_target(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_clear_depth_stencil(struct pipe_context *_pipe,
                                   struct pipe_surface *dst,
                                   unsigned clear_flags,
@@ -1269,7 +1269,7 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_flush(struct pipe_context *_pipe,
                     struct pipe_fence_handle **fence,
                     unsigned flags)
@@ -1291,7 +1291,7 @@ trace_context_flush(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_destroy(struct pipe_context *_pipe)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
index 1e5ad88d034..ad57d9d5243 100644
--- a/src/gallium/drivers/trace/tr_context.h
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -54,7 +54,7 @@ void
 trace_context_check(const struct pipe_context *pipe);
 
 
-static INLINE struct trace_context *
+static inline struct trace_context *
 trace_context(struct pipe_context *pipe)
 {
    assert(pipe);
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 753b92d8b54..601e2cbbec5 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -64,7 +64,7 @@ static long unsigned call_no = 0;
 static boolean dumping = FALSE;
 
 
-static INLINE void
+static inline void
 trace_dump_write(const char *buf, size_t size)
 {
    if (stream) {
@@ -73,14 +73,14 @@ trace_dump_write(const char *buf, size_t size)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_writes(const char *s)
 {
    trace_dump_write(s, strlen(s));
 }
 
 
-static INLINE void
+static inline void
 trace_dump_writef(const char *format, ...)
 {
    static char buf[1024];
@@ -93,7 +93,7 @@ trace_dump_writef(const char *format, ...)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_escape(const char *str)
 {
    const unsigned char *p = (const unsigned char *)str;
@@ -117,7 +117,7 @@ trace_dump_escape(const char *str)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_indent(unsigned level)
 {
    unsigned i;
@@ -126,14 +126,14 @@ trace_dump_indent(unsigned level)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_newline(void)
 {
    trace_dump_writes("\n");
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag(const char *name)
 {
    trace_dump_writes("<");
@@ -142,7 +142,7 @@ trace_dump_tag(const char *name)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin(const char *name)
 {
    trace_dump_writes("<");
@@ -150,7 +150,7 @@ trace_dump_tag_begin(const char *name)
    trace_dump_writes(">");
 }
 
-static INLINE void
+static inline void
 trace_dump_tag_begin1(const char *name,
                       const char *attr1, const char *value1)
 {
@@ -164,7 +164,7 @@ trace_dump_tag_begin1(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin2(const char *name,
                       const char *attr1, const char *value1,
                       const char *attr2, const char *value2)
@@ -183,7 +183,7 @@ trace_dump_tag_begin2(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin3(const char *name,
                       const char *attr1, const char *value1,
                       const char *attr2, const char *value2,
@@ -207,7 +207,7 @@ trace_dump_tag_begin3(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_end(const char *name)
 {
    trace_dump_writes("</");
diff --git a/src/gallium/drivers/trace/tr_dump_defines.h b/src/gallium/drivers/trace/tr_dump_defines.h
index 0c83c2b68f1..b38d63eac59 100644
--- a/src/gallium/drivers/trace/tr_dump_defines.h
+++ b/src/gallium/drivers/trace/tr_dump_defines.h
@@ -34,7 +34,7 @@
 #include "tr_dump.h"
 
 
-static INLINE void
+static inline void
 trace_dump_format(enum pipe_format format)
 {
    if (!trace_dumping_enabled_locked())
@@ -44,7 +44,7 @@ trace_dump_format(enum pipe_format format)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_query_type(unsigned value)
 {
    if (!trace_dumping_enabled_locked())
diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h
index 5e45c3c2f8f..e48b7b39e24 100644
--- a/src/gallium/drivers/trace/tr_texture.h
+++ b/src/gallium/drivers/trace/tr_texture.h
@@ -85,7 +85,7 @@ struct trace_transfer
 };
 
 
-static INLINE struct trace_resource *
+static inline struct trace_resource *
 trace_resource(struct pipe_resource *texture)
 {
    if(!texture)
@@ -95,7 +95,7 @@ trace_resource(struct pipe_resource *texture)
 }
 
 
-static INLINE struct trace_surface *
+static inline struct trace_surface *
 trace_surface(struct pipe_surface *surface)
 {
    if(!surface)
@@ -105,7 +105,7 @@ trace_surface(struct pipe_surface *surface)
 }
 
 
-static INLINE struct trace_sampler_view *
+static inline struct trace_sampler_view *
 trace_sampler_view(struct pipe_sampler_view *sampler_view)
 {
    if (!sampler_view)
@@ -114,7 +114,7 @@ trace_sampler_view(struct pipe_sampler_view *sampler_view)
 }
 
 
-static INLINE struct trace_transfer *
+static inline struct trace_transfer *
 trace_transfer(struct pipe_transfer *transfer)
 {
    if(!transfer)
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index ab8f5d3cd55..87571b75e8b 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -82,19 +82,19 @@ struct vc4_resource {
         struct pipe_resource *shadow_parent;
 };
 
-static INLINE struct vc4_resource *
+static inline struct vc4_resource *
 vc4_resource(struct pipe_resource *prsc)
 {
         return (struct vc4_resource *)prsc;
 }
 
-static INLINE struct vc4_surface *
+static inline struct vc4_surface *
 vc4_surface(struct pipe_surface *psurf)
 {
         return (struct vc4_surface *)psurf;
 }
 
-static INLINE struct vc4_transfer *
+static inline struct vc4_transfer *
 vc4_transfer(struct pipe_transfer *ptrans)
 {
         return (struct vc4_transfer *)ptrans;
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 0e953695b52..7eed57018b7 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -94,11 +94,6 @@ typedef unsigned char boolean;
 #endif
 #endif
 
-/* XXX: Use standard `inline` keyword instead */
-#ifndef INLINE
-#  define INLINE inline
-#endif
-
 /* Forced function inlining */
 #ifndef ALWAYS_INLINE
 #  ifdef __GNUC__
@@ -106,7 +101,7 @@ typedef unsigned char boolean;
 #  elif defined(_MSC_VER)
 #    define ALWAYS_INLINE __forceinline
 #  else
-#    define ALWAYS_INLINE INLINE
+#    define ALWAYS_INLINE inline
 #  endif
 #endif
 
diff --git a/src/gallium/include/state_tracker/st_api.h b/src/gallium/include/state_tracker/st_api.h
index ecf1c07fb98..356863d8531 100644
--- a/src/gallium/include/state_tracker/st_api.h
+++ b/src/gallium/include/state_tracker/st_api.h
@@ -533,7 +533,7 @@ struct st_api
 /**
  * Return true if the visual has the specified buffers.
  */
-static INLINE boolean
+static inline boolean
 st_visual_have_buffers(const struct st_visual *visual, unsigned mask)
 {
    return ((visual->buffer_mask & mask) == mask);
diff --git a/src/gallium/state_trackers/dri/dri2_buffer.h b/src/gallium/state_trackers/dri/dri2_buffer.h
index e8e474ddb76..0cee4e906e6 100644
--- a/src/gallium/state_trackers/dri/dri2_buffer.h
+++ b/src/gallium/state_trackers/dri/dri2_buffer.h
@@ -11,7 +11,7 @@ struct dri2_buffer
    struct pipe_resource *resource;
 };
 
-static INLINE struct dri2_buffer *
+static inline struct dri2_buffer *
 dri2_buffer(__DRIbuffer * driBufferPriv)
 {
    return (struct dri2_buffer *) driBufferPriv;
diff --git a/src/gallium/state_trackers/dri/dri_context.h b/src/gallium/state_trackers/dri/dri_context.h
index 56dfa2ccc70..96f06442fa0 100644
--- a/src/gallium/state_trackers/dri/dri_context.h
+++ b/src/gallium/state_trackers/dri/dri_context.h
@@ -59,7 +59,7 @@ struct dri_context
    struct hud_context *hud;
 };
 
-static INLINE struct dri_context *
+static inline struct dri_context *
 dri_context(__DRIcontext * driContextPriv)
 {
    if (!driContextPriv)
diff --git a/src/gallium/state_trackers/dri/dri_drawable.h b/src/gallium/state_trackers/dri/dri_drawable.h
index c5142181e89..1f9842ea541 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/dri_drawable.h
@@ -87,7 +87,7 @@ struct dri_drawable
                              struct pipe_resource *res);
 };
 
-static INLINE struct dri_drawable *
+static inline struct dri_drawable *
 dri_drawable(__DRIdrawable * driDrawPriv)
 {
    return (struct dri_drawable *) (driDrawPriv)
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index 173f4038cdb..6d46fea9684 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -97,7 +97,7 @@ struct dri_screen
 };
 
 /** cast wrapper */
-static INLINE struct dri_screen *
+static inline struct dri_screen *
 dri_screen(__DRIscreen * sPriv)
 {
    return (struct dri_screen *)sPriv->driverPrivate;
@@ -124,7 +124,7 @@ struct __DRIimageRec {
 
 #ifndef __NOT_HAVE_DRM_H
 
-static INLINE boolean
+static inline boolean
 dri_with_format(__DRIscreen * sPriv)
 {
    const __DRIdri2LoaderExtension *loader = sPriv->dri2.loader;
@@ -136,7 +136,7 @@ dri_with_format(__DRIscreen * sPriv)
 
 #else
 
-static INLINE boolean
+static inline boolean
 dri_with_format(__DRIscreen * sPriv)
 {
    return TRUE;
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index 4a2c1bbc2ee..4ec6992643a 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -50,7 +50,7 @@
 DEBUG_GET_ONCE_BOOL_OPTION(swrast_no_present, "SWRAST_NO_PRESENT", FALSE);
 static boolean swrast_no_present = FALSE;
 
-static INLINE void
+static inline void
 get_drawable_info(__DRIdrawable *dPriv, int *x, int *y, int *w, int *h)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -61,7 +61,7 @@ get_drawable_info(__DRIdrawable *dPriv, int *x, int *y, int *w, int *h)
                            dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 put_image(__DRIdrawable *dPriv, void *data, unsigned width, unsigned height)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -72,7 +72,7 @@ put_image(__DRIdrawable *dPriv, void *data, unsigned width, unsigned height)
                     data, dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 put_image2(__DRIdrawable *dPriv, void *data, int x, int y,
            unsigned width, unsigned height, unsigned stride)
 {
@@ -84,7 +84,7 @@ put_image2(__DRIdrawable *dPriv, void *data, int x, int y,
                      data, dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 get_image(__DRIdrawable *dPriv, int x, int y, int width, int height, void *data)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -123,7 +123,7 @@ drisw_put_image2(struct dri_drawable *drawable,
    put_image2(dPriv, data, x, y, width, height, stride);
 }
 
-static INLINE void
+static inline void
 drisw_present_texture(__DRIdrawable *dPriv,
                       struct pipe_resource *ptex, struct pipe_box *sub_box)
 {
@@ -136,7 +136,7 @@ drisw_present_texture(__DRIdrawable *dPriv,
    screen->base.screen->flush_frontbuffer(screen->base.screen, ptex, 0, 0, drawable, sub_box);
 }
 
-static INLINE void
+static inline void
 drisw_invalidate_drawable(__DRIdrawable *dPriv)
 {
    struct dri_drawable *drawable = dri_drawable(dPriv);
@@ -146,7 +146,7 @@ drisw_invalidate_drawable(__DRIdrawable *dPriv)
    p_atomic_inc(&drawable->base.stamp);
 }
 
-static INLINE void
+static inline void
 drisw_copy_to_front(__DRIdrawable * dPriv,
                     struct pipe_resource *ptex)
 {
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h
index 6d0bc3f4d81..ffdffc0940f 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.h
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.h
@@ -378,13 +378,13 @@ xmesa_check_buffer_size(XMesaBuffer b);
 extern void
 xmesa_destroy_buffers_on_display(Display *dpy);
 
-static INLINE GLuint
+static inline GLuint
 xmesa_buffer_width(XMesaBuffer b)
 {
    return b->width;
 }
 
-static INLINE GLuint
+static inline GLuint
 xmesa_buffer_height(XMesaBuffer b)
 {
    return b->height;
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index 7f73a3a44fe..9d0f2d25025 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -46,7 +46,7 @@ struct xmesa_st_framebuffer {
 };
 
 
-static INLINE struct xmesa_st_framebuffer *
+static inline struct xmesa_st_framebuffer *
 xmesa_st_framebuffer(struct st_framebuffer_iface *stfbi)
 {
    return (struct xmesa_st_framebuffer *) stfbi->st_manager_private;
diff --git a/src/gallium/state_trackers/hgl/hgl.c b/src/gallium/state_trackers/hgl/hgl.c
index 1e804c07e6b..0e122fe86ae 100644
--- a/src/gallium/state_trackers/hgl/hgl.c
+++ b/src/gallium/state_trackers/hgl/hgl.c
@@ -32,7 +32,7 @@
 
 
 // Perform a safe void to hgl_context cast
-static INLINE struct hgl_context*
+static inline struct hgl_context*
 hgl_st_context(struct st_context_iface *stctxi)
 {
 	struct hgl_context* context;
@@ -44,7 +44,7 @@ hgl_st_context(struct st_context_iface *stctxi)
 
 
 // Perform a safe void to hgl_buffer cast
-static INLINE struct hgl_buffer*
+static inline struct hgl_buffer*
 hgl_st_framebuffer(struct st_framebuffer_iface *stfbi)
 {
 	struct hgl_buffer* buffer;
diff --git a/src/gallium/state_trackers/nine/adapter9.c b/src/gallium/state_trackers/nine/adapter9.c
index 9d6d6590e00..c5ffcb15a18 100644
--- a/src/gallium/state_trackers/nine/adapter9.c
+++ b/src/gallium/state_trackers/nine/adapter9.c
@@ -163,7 +163,7 @@ NineAdapter9_GetAdapterIdentifier( struct NineAdapter9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 backbuffer_format( D3DFORMAT dfmt,
                    D3DFORMAT bfmt,
                    boolean win )
@@ -220,7 +220,7 @@ NineAdapter9_CheckDeviceType( struct NineAdapter9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 display_format( D3DFORMAT fmt,
                 boolean win )
 {
diff --git a/src/gallium/state_trackers/nine/adapter9.h b/src/gallium/state_trackers/nine/adapter9.h
index df85b2dcc28..2129ec8edc0 100644
--- a/src/gallium/state_trackers/nine/adapter9.h
+++ b/src/gallium/state_trackers/nine/adapter9.h
@@ -49,7 +49,7 @@ struct NineAdapter9
     
     struct d3dadapter9_context *ctx;
 };
-static INLINE struct NineAdapter9 *
+static inline struct NineAdapter9 *
 NineAdapter9( void *data )
 {
     return (struct NineAdapter9 *)data;
diff --git a/src/gallium/state_trackers/nine/authenticatedchannel9.h b/src/gallium/state_trackers/nine/authenticatedchannel9.h
index 7d374f67fca..63cb2269db4 100644
--- a/src/gallium/state_trackers/nine/authenticatedchannel9.h
+++ b/src/gallium/state_trackers/nine/authenticatedchannel9.h
@@ -29,7 +29,7 @@ struct NineAuthenticatedChannel9
 {
     struct NineUnknown base;
 };
-static INLINE struct NineAuthenticatedChannel9 *
+static inline struct NineAuthenticatedChannel9 *
 NineAuthenticatedChannel9( void *data )
 {
     return (struct NineAuthenticatedChannel9 *)data;
diff --git a/src/gallium/state_trackers/nine/basetexture9.h b/src/gallium/state_trackers/nine/basetexture9.h
index c803280decd..9d6fb0c002a 100644
--- a/src/gallium/state_trackers/nine/basetexture9.h
+++ b/src/gallium/state_trackers/nine/basetexture9.h
@@ -53,7 +53,7 @@ struct NineBaseTexture9
         DWORD lod_resident;
     } managed;
 };
-static INLINE struct NineBaseTexture9 *
+static inline struct NineBaseTexture9 *
 NineBaseTexture9( void *data )
 {
     return (struct NineBaseTexture9 *)data;
@@ -107,7 +107,7 @@ HRESULT
 NineBaseTexture9_UpdateSamplerView( struct NineBaseTexture9 *This,
                                     const int sRGB );
 
-static INLINE void
+static inline void
 NineBaseTexture9_Validate( struct NineBaseTexture9 *This )
 {
     DBG_FLAG(DBG_BASETEXTURE, "This=%p dirty=%i dirty_mip=%i lod=%u/%u\n",
@@ -119,7 +119,7 @@ NineBaseTexture9_Validate( struct NineBaseTexture9 *This )
         NineBaseTexture9_GenerateMipSubLevels(This);
 }
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 NineBaseTexture9_GetSamplerView( struct NineBaseTexture9 *This, const int sRGB )
 {
     if (!This->view[sRGB])
@@ -131,7 +131,7 @@ NineBaseTexture9_GetSamplerView( struct NineBaseTexture9 *This, const int sRGB )
 void
 NineBaseTexture9_Dump( struct NineBaseTexture9 *This );
 #else
-static INLINE void
+static inline void
 NineBaseTexture9_Dump( struct NineBaseTexture9 *This ) { }
 #endif
 
diff --git a/src/gallium/state_trackers/nine/cryptosession9.h b/src/gallium/state_trackers/nine/cryptosession9.h
index 660d246bfa0..d1eab72eb37 100644
--- a/src/gallium/state_trackers/nine/cryptosession9.h
+++ b/src/gallium/state_trackers/nine/cryptosession9.h
@@ -29,7 +29,7 @@ struct NineCryptoSession9
 {
     struct NineUnknown base;
 };
-static INLINE struct NineCryptoSession9 *
+static inline struct NineCryptoSession9 *
 NineCryptoSession9( void *data )
 {
     return (struct NineCryptoSession9 *)data;
diff --git a/src/gallium/state_trackers/nine/cubetexture9.h b/src/gallium/state_trackers/nine/cubetexture9.h
index ee7e275e4d8..999715c0a74 100644
--- a/src/gallium/state_trackers/nine/cubetexture9.h
+++ b/src/gallium/state_trackers/nine/cubetexture9.h
@@ -33,7 +33,7 @@ struct NineCubeTexture9
     struct pipe_box dirty_rect[6]; /* covers all mip levels */
     uint8_t *managed_buffer;
 };
-static INLINE struct NineCubeTexture9 *
+static inline struct NineCubeTexture9 *
 NineCubeTexture9( void *data )
 {
     return (struct NineCubeTexture9 *)data;
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 466b9376ce5..55948cbb67f 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -510,7 +510,7 @@ NineDevice9_GetCaps( struct NineDevice9 *This )
     return &This->caps;
 }
 
-static INLINE void
+static inline void
 NineDevice9_PauseRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -519,7 +519,7 @@ NineDevice9_PauseRecording( struct NineDevice9 *This )
     }
 }
 
-static INLINE void
+static inline void
 NineDevice9_ResumeRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -2697,7 +2697,7 @@ NineDevice9_GetNPatchMode( struct NineDevice9 *This )
     STUB(0);
 }
 
-static INLINE void
+static inline void
 init_draw_info(struct pipe_draw_info *info,
                struct NineDevice9 *dev, D3DPRIMITIVETYPE type, UINT count)
 {
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index c66a273bf2e..74607451c5f 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -132,7 +132,7 @@ struct NineDevice9
      * is not bound to anything by the vertex declaration */
     struct pipe_resource *dummy_vbo;
 };
-static INLINE struct NineDevice9 *
+static inline struct NineDevice9 *
 NineDevice9( void *data )
 {
     return (struct NineDevice9 *)data;
diff --git a/src/gallium/state_trackers/nine/device9ex.h b/src/gallium/state_trackers/nine/device9ex.h
index a31c720553a..8375622d8a1 100644
--- a/src/gallium/state_trackers/nine/device9ex.h
+++ b/src/gallium/state_trackers/nine/device9ex.h
@@ -29,7 +29,7 @@ struct NineDevice9Ex
 {
     struct NineDevice9 base;
 };
-static INLINE struct NineDevice9Ex *
+static inline struct NineDevice9Ex *
 NineDevice9Ex( void *data )
 {
     return (struct NineDevice9Ex *)data;
diff --git a/src/gallium/state_trackers/nine/device9video.h b/src/gallium/state_trackers/nine/device9video.h
index ca041e55fbc..fc2faeb624a 100644
--- a/src/gallium/state_trackers/nine/device9video.h
+++ b/src/gallium/state_trackers/nine/device9video.h
@@ -29,7 +29,7 @@ struct NineDevice9Video
 {
     struct NineUnknown base;
 };
-static INLINE struct NineDevice9Video *
+static inline struct NineDevice9Video *
 NineDevice9Video( void *data )
 {
     return (struct NineDevice9Video *)data;
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.h b/src/gallium/state_trackers/nine/indexbuffer9.h
index 0982a93fbb1..f10578f47ba 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.h
+++ b/src/gallium/state_trackers/nine/indexbuffer9.h
@@ -45,7 +45,7 @@ struct NineIndexBuffer9
 
     D3DINDEXBUFFER_DESC desc;
 };
-static INLINE struct NineIndexBuffer9 *
+static inline struct NineIndexBuffer9 *
 NineIndexBuffer9( void *data )
 {
     return (struct NineIndexBuffer9 *)data;
diff --git a/src/gallium/state_trackers/nine/iunknown.h b/src/gallium/state_trackers/nine/iunknown.h
index 4c83ddd8e4e..628d984553e 100644
--- a/src/gallium/state_trackers/nine/iunknown.h
+++ b/src/gallium/state_trackers/nine/iunknown.h
@@ -52,7 +52,7 @@ struct NineUnknown
 
     void (*dtor)(void *data); /* top-level dtor */
 };
-static INLINE struct NineUnknown *
+static inline struct NineUnknown *
 NineUnknown( void *data )
 {
     return (struct NineUnknown *)data;
@@ -94,14 +94,14 @@ NineUnknown_GetDevice( struct NineUnknown *This,
 
 /*** Nine private methods ***/
 
-static INLINE void
+static inline void
 NineUnknown_Destroy( struct NineUnknown *This )
 {
     assert(!(This->refs | This->bind));
     This->dtor(This);
 }
 
-static INLINE UINT
+static inline UINT
 NineUnknown_Bind( struct NineUnknown *This )
 {
     UINT b = ++This->bind;
@@ -113,7 +113,7 @@ NineUnknown_Bind( struct NineUnknown *This )
     return b;
 }
 
-static INLINE UINT
+static inline UINT
 NineUnknown_Unbind( struct NineUnknown *This )
 {
     UINT b = --This->bind;
@@ -129,7 +129,7 @@ NineUnknown_Unbind( struct NineUnknown *This )
     return b;
 }
 
-static INLINE void
+static inline void
 NineUnknown_ConvertRefToBind( struct NineUnknown *This )
 {
     NineUnknown_Bind(This);
@@ -137,7 +137,7 @@ NineUnknown_ConvertRefToBind( struct NineUnknown *This )
 }
 
 /* Detach from container. */
-static INLINE void
+static inline void
 NineUnknown_Detach( struct NineUnknown *This )
 {
     assert(This->container && !This->forward);
diff --git a/src/gallium/state_trackers/nine/nine_dump.h b/src/gallium/state_trackers/nine/nine_dump.h
index d0d4a9eb3aa..a0ffe7bf6ab 100644
--- a/src/gallium/state_trackers/nine/nine_dump.h
+++ b/src/gallium/state_trackers/nine/nine_dump.h
@@ -31,19 +31,19 @@ nine_dump_D3DTSS_value(unsigned, D3DTEXTURESTAGESTATETYPE, DWORD);
 
 #else /* !DEBUG */
 
-static INLINE void
+static inline void
 nine_dump_D3DADAPTER_IDENTIFIER9(unsigned ch, const D3DADAPTER_IDENTIFIER9 *id)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DCAPS9(unsigned ch, const D3DCAPS9 *caps)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DLIGHT9(unsigned ch, const D3DLIGHT9 *light)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DMATERIAL9(unsigned ch, const D3DMATERIAL9 *mat)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DTSS_value(unsigned ch, D3DTEXTURESTAGESTATETYPE tss, DWORD value)
 { }
 
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index c2213e6bf11..8a53f0d9038 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -295,7 +295,7 @@ struct vs_build_ctx
     struct ureg_src mtlE;
 };
 
-static INLINE unsigned
+static inline unsigned
 get_texcoord_sn(struct pipe_screen *screen)
 {
     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
@@ -303,7 +303,7 @@ get_texcoord_sn(struct pipe_screen *screen)
     return TGSI_SEMANTIC_GENERIC;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
 {
     const unsigned i = vs->num_inputs++;
@@ -313,7 +313,7 @@ build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
 }
 
 /* NOTE: dst may alias src */
-static INLINE void
+static inline void
 ureg_normalize3(struct ureg_program *ureg,
                 struct ureg_dst dst, struct ureg_src src,
                 struct ureg_dst tmp)
@@ -1033,7 +1033,7 @@ static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
     }
 }
 
-static INLINE boolean
+static inline boolean
 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
 {
     return !dst.WriteMask ||
@@ -1973,7 +1973,7 @@ nine_D3DMATRIX_print(const D3DMATRIX *M)
 }
 */
 
-static INLINE float
+static inline float
 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
 {
     return A->m[r][0] * B->m[0][c] +
@@ -1982,7 +1982,7 @@ nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
            A->m[r][3] * B->m[3][c];
 }
 
-static INLINE float
+static inline float
 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
 {
     return v->x * M->m[0][c] +
@@ -1991,7 +1991,7 @@ nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
            1.0f * M->m[3][c];
 }
 
-static INLINE float
+static inline float
 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
 {
     return v->x * M->m[0][c] +
diff --git a/src/gallium/state_trackers/nine/nine_helpers.c b/src/gallium/state_trackers/nine/nine_helpers.c
index ed179f9aedc..98c2ae30eba 100644
--- a/src/gallium/state_trackers/nine/nine_helpers.c
+++ b/src/gallium/state_trackers/nine/nine_helpers.c
@@ -49,7 +49,7 @@ nine_range_pool_more(struct nine_range_pool *pool)
     return pool->free;
 }
 
-static INLINE struct nine_range *
+static inline struct nine_range *
 nine_range_pool_get(struct nine_range_pool *pool, int16_t bgn, int16_t end)
 {
     struct nine_range *r = pool->free;
@@ -62,7 +62,7 @@ nine_range_pool_get(struct nine_range_pool *pool, int16_t bgn, int16_t end)
     return r;
 }
 
-static INLINE void
+static inline void
 nine_ranges_coalesce(struct nine_range *r, struct nine_range_pool *pool)
 {
     struct nine_range *n;
diff --git a/src/gallium/state_trackers/nine/nine_helpers.h b/src/gallium/state_trackers/nine/nine_helpers.h
index 6751a822ec2..b382c5b72b3 100644
--- a/src/gallium/state_trackers/nine/nine_helpers.h
+++ b/src/gallium/state_trackers/nine/nine_helpers.h
@@ -123,7 +123,7 @@ static inline void _nine_bind(void **dst, void *obj)
     } \
     return D3D_OK
 
-static INLINE float asfloat(DWORD value)
+static inline float asfloat(DWORD value)
 {
     union {
         float f;
@@ -149,14 +149,14 @@ struct nine_range_pool
     unsigned num_slabs_max;
 };
 
-static INLINE void
+static inline void
 nine_range_pool_put(struct nine_range_pool *pool, struct nine_range *r)
 {
     r->next = pool->free;
     pool->free = r;
 }
 
-static INLINE void
+static inline void
 nine_range_pool_put_chain(struct nine_range_pool *pool,
                           struct nine_range *head,
                           struct nine_range *tail)
diff --git a/src/gallium/state_trackers/nine/nine_pipe.c b/src/gallium/state_trackers/nine/nine_pipe.c
index 0da0b20263d..4cf37b9f59c 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.c
+++ b/src/gallium/state_trackers/nine/nine_pipe.c
@@ -118,7 +118,7 @@ nine_convert_rasterizer_state(struct cso_context *ctx, const DWORD *rs)
     cso_set_rasterizer(ctx, &rast);
 }
 
-static INLINE void
+static inline void
 nine_convert_blend_state_fixup(struct pipe_blend_state *blend, const DWORD *rs)
 {
     if (unlikely(rs[D3DRS_SRCBLEND] == D3DBLEND_BOTHSRCALPHA ||
diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h
index 91da5630122..43a7737cdf9 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.h
+++ b/src/gallium/state_trackers/nine/nine_pipe.h
@@ -43,7 +43,7 @@ void nine_convert_sampler_state(struct cso_context *, int idx, const DWORD *);
 
 void nine_pipe_context_clear(struct NineDevice9 *);
 
-static INLINE unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
+static inline unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
 {
     unsigned usage;
 
@@ -70,7 +70,7 @@ static INLINE unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
     return usage;
 }
 
-static INLINE void
+static inline void
 rect_to_pipe_box(struct pipe_box *dst, const RECT *src)
 {
     dst->x = src->left;
@@ -81,7 +81,7 @@ rect_to_pipe_box(struct pipe_box *dst, const RECT *src)
     dst->depth = 1;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_clamp(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box(dst, src);
@@ -95,7 +95,7 @@ rect_to_pipe_box_clamp(struct pipe_box *dst, const RECT *src)
     return FALSE;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_flip(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box(dst, src);
@@ -107,7 +107,7 @@ rect_to_pipe_box_flip(struct pipe_box *dst, const RECT *src)
     return TRUE;
 }
 
-static INLINE void
+static inline void
 rect_to_pipe_box_xy_only(struct pipe_box *dst, const RECT *src)
 {
     user_warn(src->left > src->right || src->top > src->bottom);
@@ -118,7 +118,7 @@ rect_to_pipe_box_xy_only(struct pipe_box *dst, const RECT *src)
     dst->height = src->bottom - src->top;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_xy_only_clamp(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box_xy_only(dst, src);
@@ -132,7 +132,7 @@ rect_to_pipe_box_xy_only_clamp(struct pipe_box *dst, const RECT *src)
     return FALSE;
 }
 
-static INLINE void
+static inline void
 rect_to_g3d_u_rect(struct u_rect *dst, const RECT *src)
 {
     user_warn(src->left > src->right || src->top > src->bottom);
@@ -143,7 +143,7 @@ rect_to_g3d_u_rect(struct u_rect *dst, const RECT *src)
     dst->y1 = src->bottom;
 }
 
-static INLINE void
+static inline void
 d3dbox_to_pipe_box(struct pipe_box *dst, const D3DBOX *src)
 {
     user_warn(src->Left > src->Right);
@@ -158,13 +158,13 @@ d3dbox_to_pipe_box(struct pipe_box *dst, const D3DBOX *src)
     dst->depth = src->Back - src->Front;
 }
 
-static INLINE D3DFORMAT
+static inline D3DFORMAT
 pipe_to_d3d9_format(enum pipe_format format)
 {
     return nine_pipe_to_d3d9_format_map[format];
 }
 
-static INLINE boolean
+static inline boolean
 depth_stencil_format( D3DFORMAT fmt )
 {
     static D3DFORMAT allowed[] = {
@@ -190,7 +190,7 @@ depth_stencil_format( D3DFORMAT fmt )
     return FALSE;
 }
 
-static INLINE unsigned
+static inline unsigned
 d3d9_get_pipe_depth_format_bindings(D3DFORMAT format)
 {
     switch (format) {
@@ -215,7 +215,7 @@ d3d9_get_pipe_depth_format_bindings(D3DFORMAT format)
     }
 }
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 d3d9_to_pipe_format_internal(D3DFORMAT format)
 {
     if (format <= D3DFMT_A2B10G10R10_XR_BIAS)
@@ -257,7 +257,7 @@ d3d9_to_pipe_format_internal(D3DFORMAT format)
     screen->is_format_supported(screen, pipe_format, target, \
                                 sample_count, bindings)
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 d3d9_to_pipe_format_checked(struct pipe_screen *screen,
                             D3DFORMAT format,
                             enum pipe_texture_target target,
@@ -298,7 +298,7 @@ d3d9_to_pipe_format_checked(struct pipe_screen *screen,
     return PIPE_FORMAT_NONE;
 }
 
-static INLINE const char *
+static inline const char *
 d3dformat_to_string(D3DFORMAT fmt)
 {
     switch (fmt) {
@@ -381,7 +381,7 @@ d3dformat_to_string(D3DFORMAT fmt)
     return "Unknown";
 }
 
-static INLINE unsigned
+static inline unsigned
 nine_fvf_stride( DWORD fvf )
 {
     unsigned texcount, i, size = 0;
@@ -428,7 +428,7 @@ nine_fvf_stride( DWORD fvf )
     return size;
 }
 
-static INLINE void
+static inline void
 d3dcolor_to_rgba(float *rgba, D3DCOLOR color)
 {
     rgba[0] = (float)((color >> 16) & 0xFF) / 0xFF;
@@ -437,13 +437,13 @@ d3dcolor_to_rgba(float *rgba, D3DCOLOR color)
     rgba[3] = (float)((color >> 24) & 0xFF) / 0xFF;
 }
 
-static INLINE void
+static inline void
 d3dcolor_to_pipe_color_union(union pipe_color_union *rgba, D3DCOLOR color)
 {
     d3dcolor_to_rgba(&rgba->f[0], color);
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dprimitivetype_to_pipe_prim(D3DPRIMITIVETYPE prim)
 {
     switch (prim) {
@@ -459,7 +459,7 @@ d3dprimitivetype_to_pipe_prim(D3DPRIMITIVETYPE prim)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_count_to_vertex_count(D3DPRIMITIVETYPE prim, UINT count)
 {
     switch (prim) {
@@ -475,7 +475,7 @@ prim_count_to_vertex_count(D3DPRIMITIVETYPE prim, UINT count)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dcmpfunc_to_pipe_func(D3DCMPFUNC func)
 {
     switch (func) {
@@ -494,7 +494,7 @@ d3dcmpfunc_to_pipe_func(D3DCMPFUNC func)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dstencilop_to_pipe_stencil_op(D3DSTENCILOP op)
 {
     switch (op) {
@@ -511,7 +511,7 @@ d3dstencilop_to_pipe_stencil_op(D3DSTENCILOP op)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dcull_to_pipe_face(D3DCULL cull)
 {
     switch (cull) {
@@ -524,7 +524,7 @@ d3dcull_to_pipe_face(D3DCULL cull)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dfillmode_to_pipe_polygon_mode(D3DFILLMODE mode)
 {
     switch (mode) {
@@ -538,7 +538,7 @@ d3dfillmode_to_pipe_polygon_mode(D3DFILLMODE mode)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dblendop_to_pipe_blend(D3DBLENDOP op)
 {
     switch (op) {
@@ -557,7 +557,7 @@ d3dblendop_to_pipe_blend(D3DBLENDOP op)
  * Drivers may check RGB and ALPHA factors for equality so we should not
  * simply substitute the ALPHA variants.
  */
-static INLINE unsigned
+static inline unsigned
 d3dblend_alpha_to_pipe_blendfactor(D3DBLEND b)
 {
     switch (b) {
@@ -584,7 +584,7 @@ d3dblend_alpha_to_pipe_blendfactor(D3DBLEND b)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dblend_color_to_pipe_blendfactor(D3DBLEND b)
 {
     switch (b) {
@@ -611,7 +611,7 @@ d3dblend_color_to_pipe_blendfactor(D3DBLEND b)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtextureaddress_to_pipe_tex_wrap(D3DTEXTUREADDRESS addr)
 {
     switch (addr) {
@@ -626,7 +626,7 @@ d3dtextureaddress_to_pipe_tex_wrap(D3DTEXTUREADDRESS addr)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtexturefiltertype_to_pipe_tex_filter(D3DTEXTUREFILTERTYPE filter)
 {
     switch (filter) {
@@ -644,7 +644,7 @@ d3dtexturefiltertype_to_pipe_tex_filter(D3DTEXTUREFILTERTYPE filter)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtexturefiltertype_to_pipe_tex_mipfilter(D3DTEXTUREFILTERTYPE filter)
 {
     switch (filter) {
@@ -662,7 +662,7 @@ d3dtexturefiltertype_to_pipe_tex_mipfilter(D3DTEXTUREFILTERTYPE filter)
     }
 }
 
-static INLINE unsigned nine_format_get_stride(enum pipe_format format,
+static inline unsigned nine_format_get_stride(enum pipe_format format,
                                               unsigned width)
 {
     unsigned stride = util_format_get_stride(format, width);
@@ -670,7 +670,7 @@ static INLINE unsigned nine_format_get_stride(enum pipe_format format,
     return align(stride, 4);
 }
 
-static INLINE unsigned nine_format_get_level_alloc_size(enum pipe_format format,
+static inline unsigned nine_format_get_level_alloc_size(enum pipe_format format,
                                                         unsigned width,
                                                         unsigned height,
                                                         unsigned level)
@@ -684,7 +684,7 @@ static INLINE unsigned nine_format_get_level_alloc_size(enum pipe_format format,
     return size;
 }
 
-static INLINE unsigned nine_format_get_size_and_offsets(enum pipe_format format,
+static inline unsigned nine_format_get_size_and_offsets(enum pipe_format format,
                                                         unsigned *offsets,
                                                         unsigned width,
                                                         unsigned height,
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 22a58825f78..754f5af6b8e 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -43,7 +43,7 @@ struct shader_translator;
 
 typedef HRESULT (*translate_instruction_func)(struct shader_translator *);
 
-static INLINE const char *d3dsio_to_string(unsigned opcode);
+static inline const char *d3dsio_to_string(unsigned opcode);
 
 
 #define NINED3D_SM1_VS 0xfffe
@@ -239,7 +239,7 @@ struct sm1_dst_param
     BYTE type;
 };
 
-static INLINE void
+static inline void
 assert_replicate_swizzle(const struct ureg_src *reg)
 {
     assert(reg->SwizzleY == reg->SwizzleX &&
@@ -608,7 +608,7 @@ tx_set_lconstb(struct shader_translator *tx, INT index, BOOL b)
        ureg_imm1f(tx->ureg, b ? 1.0f : 0.0f);
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 tx_scratch(struct shader_translator *tx)
 {
     if (tx->num_scratch >= Elements(tx->regs.t)) {
@@ -620,13 +620,13 @@ tx_scratch(struct shader_translator *tx)
     return tx->regs.t[tx->num_scratch++];
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 tx_scratch_scalar(struct shader_translator *tx)
 {
     return ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 tx_src_scalar(struct ureg_dst dst)
 {
     struct ureg_src src = ureg_src(dst);
@@ -636,7 +636,7 @@ tx_src_scalar(struct ureg_dst dst)
     return src;
 }
 
-static INLINE void
+static inline void
 tx_temp_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx >= 0);
@@ -654,7 +654,7 @@ tx_temp_alloc(struct shader_translator *tx, INT idx)
         tx->regs.r[idx] = ureg_DECL_temporary(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_addr_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx == 0);
@@ -664,7 +664,7 @@ tx_addr_alloc(struct shader_translator *tx, INT idx)
         tx->regs.a0 = ureg_DECL_temporary(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_pred_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx == 0);
@@ -672,7 +672,7 @@ tx_pred_alloc(struct shader_translator *tx, INT idx)
         tx->regs.p = ureg_DECL_predicate(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_texcoord_alloc(struct shader_translator *tx, INT idx)
 {
     assert(IS_PS);
@@ -682,7 +682,7 @@ tx_texcoord_alloc(struct shader_translator *tx, INT idx)
                                              TGSI_INTERPOLATE_PERSPECTIVE);
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_bgnloop(struct shader_translator *tx)
 {
     tx->loop_depth++;
@@ -692,7 +692,7 @@ tx_bgnloop(struct shader_translator *tx)
     return &tx->loop_labels[tx->loop_depth - 1];
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_endloop(struct shader_translator *tx)
 {
     assert(tx->loop_depth);
@@ -741,7 +741,7 @@ tx_get_loopal(struct shader_translator *tx)
     return ureg_src_undef();
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_cond(struct shader_translator *tx)
 {
    assert(tx->cond_depth <= NINE_MAX_COND_DEPTH);
@@ -749,14 +749,14 @@ tx_cond(struct shader_translator *tx)
    return &tx->cond_labels[tx->cond_depth - 1];
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_elsecond(struct shader_translator *tx)
 {
    assert(tx->cond_depth);
    return &tx->cond_labels[tx->cond_depth - 1];
 }
 
-static INLINE void
+static inline void
 tx_endcond(struct shader_translator *tx)
 {
    assert(tx->cond_depth);
@@ -765,7 +765,7 @@ tx_endcond(struct shader_translator *tx)
                     ureg_get_instruction_number(tx->ureg));
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 nine_ureg_dst_register(unsigned file, int index)
 {
     return ureg_dst(ureg_src_register(file, index));
@@ -1240,7 +1240,7 @@ NineTranslateInstruction_Mkxn(struct shader_translator *tx, const unsigned k, co
 #define VNOTSUPPORTED   0, 0
 #define V(maj, min)     (((maj) << 8) | (min))
 
-static INLINE const char *
+static inline const char *
 d3dsio_to_string( unsigned opcode )
 {
     static const char *names[] = {
@@ -1657,7 +1657,7 @@ DECL_SPECIAL(IF)
     return D3D_OK;
 }
 
-static INLINE unsigned
+static inline unsigned
 sm1_insn_flags_to_tgsi_setop(BYTE flags)
 {
     switch (flags) {
@@ -1724,7 +1724,7 @@ static const char *sm1_declusage_names[] =
     [D3DDECLUSAGE_SAMPLE] = "SAMPLE"
 };
 
-static INLINE unsigned
+static inline unsigned
 sm1_to_nine_declusage(struct sm1_semantic *dcl)
 {
     return nine_d3d9_to_nine_declusage(dcl->usage, dcl->usage_idx);
@@ -1833,7 +1833,7 @@ sm1_declusage_to_tgsi(struct tgsi_declaration_semantic *sem,
 #define NINED3DSTT_2D     (D3DSTT_2D >> D3DSP_TEXTURETYPE_SHIFT)
 #define NINED3DSTT_VOLUME (D3DSTT_VOLUME >> D3DSP_TEXTURETYPE_SHIFT)
 #define NINED3DSTT_CUBE   (D3DSTT_CUBE >> D3DSP_TEXTURETYPE_SHIFT)
-static INLINE unsigned
+static inline unsigned
 d3dstt_to_tgsi_tex(BYTE sampler_type)
 {
     switch (sampler_type) {
@@ -1846,7 +1846,7 @@ d3dstt_to_tgsi_tex(BYTE sampler_type)
         return TGSI_TEXTURE_UNKNOWN;
     }
 }
-static INLINE unsigned
+static inline unsigned
 d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
 {
     switch (sampler_type) {
@@ -1859,7 +1859,7 @@ d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
         return TGSI_TEXTURE_UNKNOWN;
     }
 }
-static INLINE unsigned
+static inline unsigned
 ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage)
 {
     switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) {
@@ -1884,7 +1884,7 @@ sm1_sampler_type_name(BYTE sampler_type)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 nine_tgsi_to_interp_mode(struct tgsi_declaration_semantic *sem)
 {
     switch (sem->Name) {
@@ -2685,7 +2685,7 @@ create_op_info_map(struct shader_translator *tx)
     }
 }
 
-static INLINE HRESULT
+static inline HRESULT
 NineTranslateInstruction_Generic(struct shader_translator *tx)
 {
     struct ureg_dst dst[1];
@@ -2703,19 +2703,19 @@ NineTranslateInstruction_Generic(struct shader_translator *tx)
     return D3D_OK;
 }
 
-static INLINE DWORD
+static inline DWORD
 TOKEN_PEEK(struct shader_translator *tx)
 {
     return *(tx->parse);
 }
 
-static INLINE DWORD
+static inline DWORD
 TOKEN_NEXT(struct shader_translator *tx)
 {
     return *(tx->parse)++;
 }
 
-static INLINE void
+static inline void
 TOKEN_JUMP(struct shader_translator *tx)
 {
     if (tx->parse_next && tx->parse != tx->parse_next) {
@@ -2724,7 +2724,7 @@ TOKEN_JUMP(struct shader_translator *tx)
     }
 }
 
-static INLINE boolean
+static inline boolean
 sm1_parse_eof(struct shader_translator *tx)
 {
     return TOKEN_PEEK(tx) == NINED3DSP_END;
@@ -3063,7 +3063,7 @@ tx_dtor(struct shader_translator *tx)
     FREE(tx);
 }
 
-static INLINE unsigned
+static inline unsigned
 tgsi_processor_from_type(unsigned shader_type)
 {
     switch (shader_type) {
diff --git a/src/gallium/state_trackers/nine/nine_shader.h b/src/gallium/state_trackers/nine/nine_shader.h
index 56c5d99b4d2..ec256c153a9 100644
--- a/src/gallium/state_trackers/nine/nine_shader.h
+++ b/src/gallium/state_trackers/nine/nine_shader.h
@@ -70,19 +70,19 @@ struct nine_shader_info
     struct nine_lconstf lconstf; /* out, NOTE: members to be free'd by user */
 };
 
-static INLINE void
+static inline void
 nine_info_mark_const_f_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_float_slots < (idx + 1))
         info->const_float_slots = idx + 1;
 }
-static INLINE void
+static inline void
 nine_info_mark_const_i_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_int_slots < (idx + 1))
         info->const_int_slots = idx + 1;
 }
-static INLINE void
+static inline void
 nine_info_mark_const_b_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_bool_slots < (idx + 1))
@@ -100,7 +100,7 @@ struct nine_shader_variant
     uint32_t key;
 };
 
-static INLINE void *
+static inline void *
 nine_shader_variant_get(struct nine_shader_variant *list, uint32_t key)
 {
     while (list->key != key && list->next)
@@ -110,7 +110,7 @@ nine_shader_variant_get(struct nine_shader_variant *list, uint32_t key)
     return NULL;
 }
 
-static INLINE boolean
+static inline boolean
 nine_shader_variant_add(struct nine_shader_variant *list,
                         uint32_t key, void *cso)
 {
@@ -127,7 +127,7 @@ nine_shader_variant_add(struct nine_shader_variant *list,
     return TRUE;
 }
 
-static INLINE void
+static inline void
 nine_shader_variants_free(struct nine_shader_variant *list)
 {
     while (list->next) {
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 435118bc93f..6c835858d18 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -176,7 +176,7 @@ update_viewport(struct NineDevice9 *device)
     pipe->set_viewport_states(pipe, 0, 1, &pvport);
 }
 
-static INLINE void
+static inline void
 update_scissor(struct NineDevice9 *device)
 {
     struct pipe_context *pipe = device->pipe;
@@ -184,19 +184,19 @@ update_scissor(struct NineDevice9 *device)
     pipe->set_scissor_states(pipe, 0, 1, &device->state.scissor);
 }
 
-static INLINE void
+static inline void
 update_blend(struct NineDevice9 *device)
 {
     nine_convert_blend_state(device->cso, device->state.rs);
 }
 
-static INLINE void
+static inline void
 update_dsa(struct NineDevice9 *device)
 {
     nine_convert_dsa_state(device->cso, device->state.rs);
 }
 
-static INLINE void
+static inline void
 update_rasterizer(struct NineDevice9 *device)
 {
     nine_convert_rasterizer_state(device->cso, device->state.rs);
@@ -294,7 +294,7 @@ update_vertex_elements(struct NineDevice9 *device)
     state->changed.stream_freq = 0;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_shader_variant_keys(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -332,7 +332,7 @@ update_shader_variant_keys(struct NineDevice9 *device)
     return mask;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_vs(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -359,7 +359,7 @@ update_vs(struct NineDevice9 *device)
     return changed_group;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_ps(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -656,7 +656,7 @@ update_vertex_buffers(struct NineDevice9 *device)
     state->changed.vtxbuf = 0;
 }
 
-static INLINE void
+static inline void
 update_index_buffer(struct NineDevice9 *device)
 {
     struct pipe_context *pipe = device->pipe;
@@ -677,7 +677,7 @@ validate_textures(struct NineDevice9 *device)
     }
 }
 
-static INLINE boolean
+static inline boolean
 update_sampler_derived(struct nine_state *state, unsigned s)
 {
     boolean changed = FALSE;
diff --git a/src/gallium/state_trackers/nine/nineexoverlayextension.h b/src/gallium/state_trackers/nine/nineexoverlayextension.h
index a16d690dc8c..1616ed0532c 100644
--- a/src/gallium/state_trackers/nine/nineexoverlayextension.h
+++ b/src/gallium/state_trackers/nine/nineexoverlayextension.h
@@ -29,7 +29,7 @@ struct Nine9ExOverlayExtension
 {
     struct NineUnknown base;
 };
-static INLINE struct Nine9ExOverlayExtension *
+static inline struct Nine9ExOverlayExtension *
 Nine9ExOverlayExtension( void *data )
 {
     return (struct Nine9ExOverlayExtension *)data;
diff --git a/src/gallium/state_trackers/nine/pixelshader9.h b/src/gallium/state_trackers/nine/pixelshader9.h
index 5e2219c946a..6dad1d1ee76 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.h
+++ b/src/gallium/state_trackers/nine/pixelshader9.h
@@ -47,7 +47,7 @@ struct NinePixelShader9
 
     uint64_t ff_key[6];
 };
-static INLINE struct NinePixelShader9 *
+static inline struct NinePixelShader9 *
 NinePixelShader9( void *data )
 {
     return (struct NinePixelShader9 *)data;
diff --git a/src/gallium/state_trackers/nine/query9.c b/src/gallium/state_trackers/nine/query9.c
index 04f4aadabba..3afa9007f61 100644
--- a/src/gallium/state_trackers/nine/query9.c
+++ b/src/gallium/state_trackers/nine/query9.c
@@ -57,7 +57,7 @@ d3dquerytype_to_pipe_query(struct pipe_screen *screen, D3DQUERYTYPE type)
 
 #define GET_DATA_SIZE_CASE2(a, b) case D3DQUERYTYPE_##a: return sizeof(D3DDEVINFO_##b)
 #define GET_DATA_SIZE_CASET(a, b) case D3DQUERYTYPE_##a: return sizeof(b)
-static INLINE DWORD
+static inline DWORD
 nine_query_result_size(D3DQUERYTYPE type)
 {
     switch (type) {
diff --git a/src/gallium/state_trackers/nine/query9.h b/src/gallium/state_trackers/nine/query9.h
index ad1ca50f26d..9cc1e317055 100644
--- a/src/gallium/state_trackers/nine/query9.h
+++ b/src/gallium/state_trackers/nine/query9.h
@@ -41,7 +41,7 @@ struct NineQuery9
     enum nine_query_state state;
     boolean instant; /* true if D3DISSUE_BEGIN is not needed / invalid */
 };
-static INLINE struct NineQuery9 *
+static inline struct NineQuery9 *
 NineQuery9( void *data )
 {
     return (struct NineQuery9 *)data;
diff --git a/src/gallium/state_trackers/nine/resource9.h b/src/gallium/state_trackers/nine/resource9.h
index da1dd6320e0..906f90806ce 100644
--- a/src/gallium/state_trackers/nine/resource9.h
+++ b/src/gallium/state_trackers/nine/resource9.h
@@ -46,7 +46,7 @@ struct NineResource9
     /* for [GS]etPrivateData/FreePrivateData */
     struct util_hash_table *pdata;
 };
-static INLINE struct NineResource9 *
+static inline struct NineResource9 *
 NineResource9( void *data )
 {
     return (struct NineResource9 *)data;
diff --git a/src/gallium/state_trackers/nine/stateblock9.h b/src/gallium/state_trackers/nine/stateblock9.h
index bcaf634d933..a580c6a2302 100644
--- a/src/gallium/state_trackers/nine/stateblock9.h
+++ b/src/gallium/state_trackers/nine/stateblock9.h
@@ -43,7 +43,7 @@ struct NineStateBlock9
 
     enum nine_stateblock_type type;
 };
-static INLINE struct NineStateBlock9 *
+static inline struct NineStateBlock9 *
 NineStateBlock9( void *data )
 {
     return (struct NineStateBlock9 *)data;
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index e46afd91157..7533cb3a454 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -261,7 +261,7 @@ NineSurface9_GetDesc( struct NineSurface9 *This,
 }
 
 /* Add the dirty rects to the source texture */
-INLINE void
+inline void
 NineSurface9_AddDirtyRect( struct NineSurface9 *This,
                            const struct pipe_box *box )
 {
@@ -295,7 +295,7 @@ NineSurface9_AddDirtyRect( struct NineSurface9 *This,
     }
 }
 
-static INLINE uint8_t *
+static inline uint8_t *
 NineSurface9_GetSystemMemPointer(struct NineSurface9 *This, int x, int y)
 {
     unsigned x_offset = util_format_get_stride(This->base.info.format, x);
diff --git a/src/gallium/state_trackers/nine/surface9.h b/src/gallium/state_trackers/nine/surface9.h
index 2e409558609..73092ab8cf5 100644
--- a/src/gallium/state_trackers/nine/surface9.h
+++ b/src/gallium/state_trackers/nine/surface9.h
@@ -50,7 +50,7 @@ struct NineSurface9
     uint8_t *data; /* system memory backing */
     unsigned stride; /* for system memory backing */
 };
-static INLINE struct NineSurface9 *
+static inline struct NineSurface9 *
 NineSurface9( void *data )
 {
     return (struct NineSurface9 *)data;
@@ -89,7 +89,7 @@ NineSurface9_MarkContainerDirty( struct NineSurface9 *This );
 struct pipe_surface *
 NineSurface9_CreatePipeSurface( struct NineSurface9 *This, const int sRGB );
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 NineSurface9_GetSurface( struct NineSurface9 *This, int sRGB )
 {
     if (This->surface[sRGB])
@@ -97,13 +97,13 @@ NineSurface9_GetSurface( struct NineSurface9 *This, int sRGB )
     return NineSurface9_CreatePipeSurface(This, sRGB);
 }
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 NineSurface9_GetResource( struct NineSurface9 *This )
 {
     return This->base.resource;
 }
 
-static INLINE void
+static inline void
 NineSurface9_SetResource( struct NineSurface9 *This,
                           struct pipe_resource *resource, unsigned level )
 {
@@ -131,7 +131,7 @@ NineSurface9_CopySurface( struct NineSurface9 *This,
                           const POINT *pDestPoint,
                           const RECT *pSourceRect );
 
-static INLINE boolean
+static inline boolean
 NineSurface9_IsOffscreenPlain (struct NineSurface9 *This )
 {
     return This->base.usage == 0 && !This->texture;
@@ -141,7 +141,7 @@ NineSurface9_IsOffscreenPlain (struct NineSurface9 *This )
 void
 NineSurface9_Dump( struct NineSurface9 *This );
 #else
-static INLINE void
+static inline void
 NineSurface9_Dump( struct NineSurface9 *This ) { }
 #endif
 
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index 26164f0b118..a62e6ad99d8 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -631,7 +631,7 @@ static void pend_present(struct NineSwapChain9 *This,
     return;
 }
 
-static INLINE HRESULT
+static inline HRESULT
 present( struct NineSwapChain9 *This,
          const RECT *pSourceRect,
          const RECT *pDestRect,
diff --git a/src/gallium/state_trackers/nine/swapchain9.h b/src/gallium/state_trackers/nine/swapchain9.h
index 2afd6ab2954..5e48dde5004 100644
--- a/src/gallium/state_trackers/nine/swapchain9.h
+++ b/src/gallium/state_trackers/nine/swapchain9.h
@@ -76,7 +76,7 @@ struct NineSwapChain9
     BOOL enable_threadpool;
 };
 
-static INLINE struct NineSwapChain9 *
+static inline struct NineSwapChain9 *
 NineSwapChain9( void *data )
 {
     return (struct NineSwapChain9 *)data;
diff --git a/src/gallium/state_trackers/nine/swapchain9ex.h b/src/gallium/state_trackers/nine/swapchain9ex.h
index bf407836099..075f8835222 100644
--- a/src/gallium/state_trackers/nine/swapchain9ex.h
+++ b/src/gallium/state_trackers/nine/swapchain9ex.h
@@ -29,7 +29,7 @@ struct NineSwapChain9Ex
 {
     struct NineSwapChain9 base;
 };
-static INLINE struct NineSwapChain9Ex *
+static inline struct NineSwapChain9Ex *
 NineSwapChain9Ex( void *data )
 {
     return (struct NineSwapChain9Ex *)data;
diff --git a/src/gallium/state_trackers/nine/texture9.h b/src/gallium/state_trackers/nine/texture9.h
index 65db874b2a3..6f80be9ccde 100644
--- a/src/gallium/state_trackers/nine/texture9.h
+++ b/src/gallium/state_trackers/nine/texture9.h
@@ -33,7 +33,7 @@ struct NineTexture9
     struct pipe_box dirty_rect; /* covers all mip levels */
     uint8_t *managed_buffer;
 };
-static INLINE struct NineTexture9 *
+static inline struct NineTexture9 *
 NineTexture9( void *data )
 {
     return (struct NineTexture9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexbuffer9.h b/src/gallium/state_trackers/nine/vertexbuffer9.h
index 0d88b839cad..6174de4df08 100644
--- a/src/gallium/state_trackers/nine/vertexbuffer9.h
+++ b/src/gallium/state_trackers/nine/vertexbuffer9.h
@@ -40,7 +40,7 @@ struct NineVertexBuffer9
 
     D3DVERTEXBUFFER_DESC desc;
 };
-static INLINE struct NineVertexBuffer9 *
+static inline struct NineVertexBuffer9 *
 NineVertexBuffer9( void *data )
 {
     return (struct NineVertexBuffer9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.c b/src/gallium/state_trackers/nine/vertexdeclaration9.c
index 9e4cb55bc67..2047b91abc4 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.c
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.c
@@ -34,7 +34,7 @@
 
 #define DBG_CHANNEL DBG_VERTEXDECLARATION
 
-static INLINE enum pipe_format decltype_format(BYTE type)
+static inline enum pipe_format decltype_format(BYTE type)
 {
     switch (type) {
     case D3DDECLTYPE_FLOAT1:    return PIPE_FORMAT_R32_FLOAT;
@@ -60,7 +60,7 @@ static INLINE enum pipe_format decltype_format(BYTE type)
     return PIPE_FORMAT_NONE;
 }
 
-static INLINE unsigned decltype_size(BYTE type)
+static inline unsigned decltype_size(BYTE type)
 {
     switch (type) {
     case D3DDECLTYPE_FLOAT1: return 1 * sizeof(float);
@@ -90,7 +90,7 @@ static INLINE unsigned decltype_size(BYTE type)
  * simple lookup table won't work in that case. Let's just wait
  * with making this more generic until we need it.
  */
-static INLINE boolean
+static inline boolean
 nine_d3ddeclusage_check(unsigned usage, unsigned usage_idx)
 {
     switch (usage) {
@@ -162,7 +162,7 @@ static const char *nine_declusage_names[] =
     [NINE_DECLUSAGE_FOG]             = "FOG",
     [NINE_DECLUSAGE_NONE]            = "(NONE)",
 };
-static INLINE const char *
+static inline const char *
 nine_declusage_name(unsigned ndcl)
 {
     return nine_declusage_names[ndcl % NINE_DECLUSAGE_COUNT];
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.h b/src/gallium/state_trackers/nine/vertexdeclaration9.h
index a4d4a0445d5..655bcfbf165 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.h
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.h
@@ -47,7 +47,7 @@ struct NineVertexDeclaration9
     D3DVERTEXELEMENT9 *decls;
     DWORD fvf;
 };
-static INLINE struct NineVertexDeclaration9 *
+static inline struct NineVertexDeclaration9 *
 NineVertexDeclaration9( void *data )
 {
     return (struct NineVertexDeclaration9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexshader9.h b/src/gallium/state_trackers/nine/vertexshader9.h
index 3495c9f9c55..66c602c7b3c 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.h
+++ b/src/gallium/state_trackers/nine/vertexshader9.h
@@ -56,7 +56,7 @@ struct NineVertexShader9
 
     uint64_t ff_key[2];
 };
-static INLINE struct NineVertexShader9 *
+static inline struct NineVertexShader9 *
 NineVertexShader9( void *data )
 {
     return (struct NineVertexShader9 *)data;
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index b34ee07dce9..4dfc5599a8e 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -152,7 +152,7 @@ NineVolume9_GetContainer( struct NineVolume9 *This,
     return NineUnknown_QueryInterface(NineUnknown(This)->container, riid, ppContainer);
 }
 
-static INLINE void
+static inline void
 NineVolume9_MarkContainerDirty( struct NineVolume9 *This )
 {
     struct NineBaseTexture9 *tex;
@@ -182,13 +182,13 @@ NineVolume9_GetDesc( struct NineVolume9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 NineVolume9_IsDirty(struct NineVolume9 *This)
 {
     return This->dirty_box[0].width != 0;
 }
 
-INLINE void
+inline void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box )
 {
@@ -226,7 +226,7 @@ NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
     }
 }
 
-static INLINE uint8_t *
+static inline uint8_t *
 NineVolume9_GetSystemMemPointer(struct NineVolume9 *This, int x, int y, int z)
 {
     unsigned x_offset = util_format_get_stride(This->info.format, x);
diff --git a/src/gallium/state_trackers/nine/volume9.h b/src/gallium/state_trackers/nine/volume9.h
index 802836659c2..fae24310a50 100644
--- a/src/gallium/state_trackers/nine/volume9.h
+++ b/src/gallium/state_trackers/nine/volume9.h
@@ -57,7 +57,7 @@ struct NineVolume9
     /* for [GS]etPrivateData/FreePrivateData */
     struct util_hash_table *pdata;
 };
-static INLINE struct NineVolume9 *
+static inline struct NineVolume9 *
 NineVolume9( void *data )
 {
     return (struct NineVolume9 *)data;
@@ -73,7 +73,7 @@ NineVolume9_new( struct NineDevice9 *pDevice,
 
 /*** Nine private ***/
 
-static INLINE void
+static inline void
 NineVolume9_SetResource( struct NineVolume9 *This,
                          struct pipe_resource *resource, unsigned level )
 {
@@ -85,7 +85,7 @@ void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box );
 
-static INLINE void
+static inline void
 NineVolume9_ClearDirtyRegion( struct NineVolume9 *This )
 {
     memset(&This->dirty_box, 0, sizeof(This->dirty_box));
diff --git a/src/gallium/state_trackers/nine/volumetexture9.h b/src/gallium/state_trackers/nine/volumetexture9.h
index 313fa1a91fb..b8f250ad72e 100644
--- a/src/gallium/state_trackers/nine/volumetexture9.h
+++ b/src/gallium/state_trackers/nine/volumetexture9.h
@@ -32,7 +32,7 @@ struct NineVolumeTexture9
     struct NineVolume9 **volumes;
     struct pipe_box dirty_box;
 };
-static INLINE struct NineVolumeTexture9 *
+static inline struct NineVolumeTexture9 *
 NineVolumeTexture9( void *data )
 {
     return (struct NineVolumeTexture9 *)data;
diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c
index 96b83585429..0285cb0dac2 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -168,7 +168,7 @@ get_st_manager(void)
 }
 
 
-static INLINE boolean
+static inline boolean
 little_endian(void)
 {
    const unsigned ui = 1;
@@ -292,7 +292,7 @@ osmesa_init_st_visual(struct st_visual *vis,
 /**
  * Return the osmesa_buffer that corresponds to an st_framebuffer_iface.
  */
-static INLINE struct osmesa_buffer *
+static inline struct osmesa_buffer *
 stfbi_to_osbuffer(struct st_framebuffer_iface *stfbi)
 {
    return (struct osmesa_buffer *) stfbi->st_manager_private;
diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index 4a930b5bef8..e35a4b94036 100644
--- a/src/gallium/state_trackers/wgl/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -80,7 +80,7 @@ struct stw_device
 extern struct stw_device *stw_dev;
 
 
-static INLINE struct stw_context *
+static inline struct stw_context *
 stw_lookup_context_locked( DHGLRC dhglrc )
 {
    if (dhglrc == 0 || stw_dev == NULL)
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index 2b81b820495..7b34fcbb5ed 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -45,7 +45,7 @@
  * Search the framebuffer with the matching HWND while holding the
  * stw_dev::fb_mutex global lock.
  */
-static INLINE struct stw_framebuffer *
+static inline struct stw_framebuffer *
 stw_framebuffer_from_hwnd_locked(
    HWND hwnd )
 {
@@ -376,7 +376,7 @@ stw_framebuffer_cleanup(void)
 /**
  * Given an hdc, return the corresponding stw_framebuffer.
  */
-static INLINE struct stw_framebuffer *
+static inline struct stw_framebuffer *
 stw_framebuffer_from_hdc_locked(
    HDC hdc )
 {
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index 0a9116cbb73..b41171a9195 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -46,7 +46,7 @@ struct stw_st_framebuffer {
    unsigned texture_mask;
 };
 
-static INLINE struct stw_st_framebuffer *
+static inline struct stw_st_framebuffer *
 stw_st_framebuffer(struct st_framebuffer_iface *stfb)
 {
    return (struct stw_st_framebuffer *) stfb;
diff --git a/src/gallium/state_trackers/wgl/stw_tls.c b/src/gallium/state_trackers/wgl/stw_tls.c
index ca27a53433c..041066f5007 100644
--- a/src/gallium/state_trackers/wgl/stw_tls.c
+++ b/src/gallium/state_trackers/wgl/stw_tls.c
@@ -50,7 +50,7 @@ static CRITICAL_SECTION g_mutex = {
 static struct stw_tls_data *g_pendingTlsData = NULL;
 
 
-static INLINE struct stw_tls_data *
+static inline struct stw_tls_data *
 stw_tls_data_create(DWORD dwThreadId);
 
 static struct stw_tls_data *
@@ -111,7 +111,7 @@ stw_tls_init(void)
 /**
  * Install windows hook for a given thread (not necessarily the current one).
  */
-static INLINE struct stw_tls_data *
+static inline struct stw_tls_data *
 stw_tls_data_create(DWORD dwThreadId)
 {
    struct stw_tls_data *data;
diff --git a/src/gallium/state_trackers/xa/xa_composite.c b/src/gallium/state_trackers/xa/xa_composite.c
index c283a0d1892..7cfd1e136d1 100644
--- a/src/gallium/state_trackers/xa/xa_composite.c
+++ b/src/gallium/state_trackers/xa/xa_composite.c
@@ -167,7 +167,7 @@ blend_for_op(struct xa_composite_blend *blend,
 }
 
 
-static INLINE int
+static inline int
 xa_repeat_to_gallium(int mode)
 {
     switch(mode) {
@@ -185,7 +185,7 @@ xa_repeat_to_gallium(int mode)
     return PIPE_TEX_WRAP_REPEAT;
 }
 
-static INLINE boolean
+static inline boolean
 xa_filter_to_gallium(int xrender_filter, int *out_filter)
 {
 
diff --git a/src/gallium/state_trackers/xa/xa_priv.h b/src/gallium/state_trackers/xa/xa_priv.h
index f71c06c6c19..13a0e86f66d 100644
--- a/src/gallium/state_trackers/xa/xa_priv.h
+++ b/src/gallium/state_trackers/xa/xa_priv.h
@@ -123,7 +123,7 @@ struct xa_context {
     const struct xa_composite *comp;
 };
 
-static INLINE void
+static inline void
 xa_scissor_reset(struct xa_context *ctx)
 {
     ctx->scissor.maxx = 0;
@@ -133,7 +133,7 @@ xa_scissor_reset(struct xa_context *ctx)
     ctx->scissor_valid = FALSE;
 }
 
-static INLINE void
+static inline void
 xa_scissor_update(struct xa_context *ctx, unsigned minx, unsigned miny,
 		unsigned maxx, unsigned maxy)
 {
@@ -189,13 +189,13 @@ struct xa_shaders;
  * Inline utilities
  */
 
-static INLINE int
+static inline int
 xa_min(int a, int b)
 {
     return ((a <= b) ? a : b);
 }
 
-static INLINE void
+static inline void
 xa_pixel_to_float4(uint32_t pixel, float *color)
 {
     uint32_t	    r, g, b, a;
@@ -210,7 +210,7 @@ xa_pixel_to_float4(uint32_t pixel, float *color)
     color[3] = ((float)a) / 255.;
 }
 
-static INLINE void
+static inline void
 xa_pixel_to_float4_a8(uint32_t pixel, float *color)
 {
     uint32_t a;
diff --git a/src/gallium/state_trackers/xa/xa_renderer.c b/src/gallium/state_trackers/xa/xa_renderer.c
index 7b28afc907f..fda07e5b68e 100644
--- a/src/gallium/state_trackers/xa/xa_renderer.c
+++ b/src/gallium/state_trackers/xa/xa_renderer.c
@@ -45,14 +45,14 @@ void
 renderer_set_constants(struct xa_context *r,
 		       int shader_type, const float *params, int param_bytes);
 
-static INLINE boolean
+static inline boolean
 is_affine(float *matrix)
 {
     return floatIsZero(matrix[2]) && floatIsZero(matrix[5])
 	&& floatsEqual(matrix[8], 1);
 }
 
-static INLINE void
+static inline void
 map_point(float *mat, float x, float y, float *out_x, float *out_y)
 {
     if (!mat) {
@@ -71,7 +71,7 @@ map_point(float *mat, float x, float y, float *out_x, float *out_y)
     }
 }
 
-static INLINE void
+static inline void
 renderer_draw(struct xa_context *r)
 {
     int num_verts = r->buffer_size / (r->attrs_per_vertex * NUM_COMPONENTS);
@@ -97,7 +97,7 @@ renderer_draw(struct xa_context *r)
     xa_scissor_reset(r);
 }
 
-static INLINE void
+static inline void
 renderer_draw_conditional(struct xa_context *r, int next_batch)
 {
     if (r->buffer_size + next_batch >= XA_VB_SIZE ||
@@ -135,7 +135,7 @@ renderer_init_state(struct xa_context *r)
     }
 }
 
-static INLINE void
+static inline void
 add_vertex_color(struct xa_context *r, float x, float y, float color[4])
 {
     float *vertex = r->buffer + r->buffer_size;
@@ -153,7 +153,7 @@ add_vertex_color(struct xa_context *r, float x, float y, float color[4])
     r->buffer_size += 8;
 }
 
-static INLINE void
+static inline void
 add_vertex_1tex(struct xa_context *r, float x, float y, float s, float t)
 {
     float *vertex = r->buffer + r->buffer_size;
@@ -171,7 +171,7 @@ add_vertex_1tex(struct xa_context *r, float x, float y, float s, float t)
     r->buffer_size += 8;
 }
 
-static INLINE void
+static inline void
 add_vertex_2tex(struct xa_context *r,
 		float x, float y, float s0, float t0, float s1, float t1)
 {
diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
index c7454c9d6ac..5d8b8079c4b 100644
--- a/src/gallium/state_trackers/xa/xa_tgsi.c
+++ b/src/gallium/state_trackers/xa/xa_tgsi.c
@@ -106,7 +106,7 @@ struct xa_shaders {
     struct cso_hash *fs_hash;
 };
 
-static INLINE void
+static inline void
 src_in_mask(struct ureg_program *ureg,
 	    struct ureg_dst dst,
 	    struct ureg_src src,
@@ -368,7 +368,7 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
     return ureg_create_shader_and_destroy(ureg, pipe);
 }
 
-static INLINE void
+static inline void
 xrender_tex(struct ureg_program *ureg,
 	    struct ureg_dst dst,
 	    struct ureg_src coords,
@@ -617,7 +617,7 @@ xa_shaders_destroy(struct xa_shaders *sc)
     FREE(sc);
 }
 
-static INLINE void *
+static inline void *
 shader_from_cache(struct pipe_context *pipe,
 		  unsigned type, struct cso_hash *hash, unsigned key)
 {
diff --git a/src/gallium/state_trackers/xvmc/xvmc_private.h b/src/gallium/state_trackers/xvmc/xvmc_private.h
index 84c7b6cba0b..a1d026f704e 100644
--- a/src/gallium/state_trackers/xvmc/xvmc_private.h
+++ b/src/gallium/state_trackers/xvmc/xvmc_private.h
@@ -106,7 +106,7 @@ typedef struct
 #define XVMC_WARN  2
 #define XVMC_TRACE 3
 
-static INLINE void XVMC_MSG(int level, const char *fmt, ...)
+static inline void XVMC_MSG(int level, const char *fmt, ...)
 {
    static int debug_level = -1;
 
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index cc660606c1f..680f5164e60 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -101,7 +101,7 @@ drm_destroy( struct d3dadapter9_context *ctx )
 
 /* read a DWORD in the form 0xnnnnnnnn, which is how sysfs pci id stuff is
  * formatted. */
-static INLINE DWORD
+static inline DWORD
 read_file_dword( const char *name )
 {
     char buf[32];
@@ -123,7 +123,7 @@ read_file_dword( const char *name )
  * dword at an offset in the raw PCI header. The reason this isn't used for all
  * data is that the kernel will make corrections but not expose them in the raw
  * header bytes. */
-static INLINE DWORD
+static inline DWORD
 read_config_dword( int fd,
                    unsigned offset )
 {
@@ -135,7 +135,7 @@ read_config_dword( int fd,
     return r;
 }
 
-static INLINE void
+static inline void
 get_bus_info( int fd,
               DWORD *vendorid,
               DWORD *deviceid,
@@ -160,7 +160,7 @@ get_bus_info( int fd,
     }
 }
 
-static INLINE void
+static inline void
 read_descriptor( struct d3dadapter9_context *ctx,
                  int fd )
 {
diff --git a/src/gallium/tests/graw/graw_util.h b/src/gallium/tests/graw/graw_util.h
index afcc584863e..e7cd0aa3ac3 100644
--- a/src/gallium/tests/graw/graw_util.h
+++ b/src/gallium/tests/graw/graw_util.h
@@ -26,7 +26,7 @@ struct graw_info
 
 
 
-static INLINE boolean
+static inline boolean
 graw_util_create_window(struct graw_info *info,
                         int width, int height,
                         int num_cbufs, bool zstencil_buf)
@@ -144,7 +144,7 @@ graw_util_create_window(struct graw_info *info,
 }
 
 
-static INLINE void
+static inline void
 graw_util_default_state(struct graw_info *info, boolean depth_test)
 {
    {
@@ -181,7 +181,7 @@ graw_util_default_state(struct graw_info *info, boolean depth_test)
 }
 
 
-static INLINE void
+static inline void
 graw_util_viewport(struct graw_info *info,
                    float x, float y,
                    float width, float height,
@@ -205,7 +205,7 @@ graw_util_viewport(struct graw_info *info,
 }
 
 
-static INLINE void
+static inline void
 graw_util_flush_front(const struct graw_info *info)
 {
    info->screen->flush_frontbuffer(info->screen, info->color_buf[0],
@@ -213,7 +213,7 @@ graw_util_flush_front(const struct graw_info *info)
 }
 
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 graw_util_create_tex2d(const struct graw_info *info,
                        int width, int height, enum pipe_format format,
                        const void *data)
@@ -278,7 +278,7 @@ graw_util_create_tex2d(const struct graw_info *info,
 }
 
 
-static INLINE void *
+static inline void *
 graw_util_create_simple_sampler(const struct graw_info *info,
                                 unsigned wrap_mode,
                                 unsigned img_filter)
@@ -304,7 +304,7 @@ graw_util_create_simple_sampler(const struct graw_info *info,
 }
 
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 graw_util_create_simple_sampler_view(const struct graw_info *info,
                                      struct pipe_resource *texture)
 {
diff --git a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
index 9fedb121565..93ce6f224fe 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
+++ b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
@@ -26,7 +26,7 @@ struct i915_drm_batchbuffer
    drm_intel_bo *bo;
 };
 
-static INLINE struct i915_drm_batchbuffer *
+static inline struct i915_drm_batchbuffer *
 i915_drm_batchbuffer(struct i915_winsys_batchbuffer *batch)
 {
    return (struct i915_drm_batchbuffer *)batch;
diff --git a/src/gallium/winsys/i915/drm/i915_drm_winsys.h b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
index 7f0d718bdb7..56b9e150497 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_winsys.h
+++ b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
@@ -28,7 +28,7 @@ struct i915_drm_winsys
    drm_intel_bufmgr *gem_manager;
 };
 
-static INLINE struct i915_drm_winsys *
+static inline struct i915_drm_winsys *
 i915_drm_winsys(struct i915_winsys *iws)
 {
    return (struct i915_drm_winsys *)iws;
@@ -58,13 +58,13 @@ struct i915_drm_buffer {
    unsigned flink;
 };
 
-static INLINE struct i915_drm_buffer *
+static inline struct i915_drm_buffer *
 i915_drm_buffer(struct i915_winsys_buffer *buffer)
 {
    return (struct i915_drm_buffer *)buffer;
 }
 
-static INLINE drm_intel_bo *
+static inline drm_intel_bo *
 intel_bo(struct i915_winsys_buffer *buffer)
 {
    return i915_drm_buffer(buffer)->bo;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 314d0ef8df8..e4009be816c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -44,7 +44,7 @@
 
 static const struct pb_vtbl radeon_bo_vtbl;
 
-static INLINE struct radeon_bo *radeon_bo(struct pb_buffer *bo)
+static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo)
 {
     assert(bo->vtbl == &radeon_bo_vtbl);
     return (struct radeon_bo *)bo;
@@ -78,7 +78,7 @@ struct radeon_bomgr {
     struct list_head va_holes;
 };
 
-static INLINE struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
+static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
 {
     return (struct radeon_bomgr *)mgr;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index b83ce168b4e..0255313af7a 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -71,7 +71,7 @@ struct radeon_bo {
 struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws);
 void radeon_bomgr_init_functions(struct radeon_drm_winsys *ws);
 
-static INLINE
+static inline
 void radeon_bo_reference(struct radeon_bo **dst, struct radeon_bo *src)
 {
     pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index ecf89578c68..d618f5904fd 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -195,7 +195,7 @@ radeon_drm_cs_create(struct radeon_winsys *rws,
 
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
-static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc,
+static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
                                 enum radeon_bo_domain rd,
                                 enum radeon_bo_domain wd,
                                 unsigned priority,
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index fcc29fe9480..658057d75b3 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -79,13 +79,13 @@ struct radeon_drm_cs {
 
 int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo);
 
-static INLINE struct radeon_drm_cs *
+static inline struct radeon_drm_cs *
 radeon_drm_cs(struct radeon_winsys_cs *base)
 {
     return (struct radeon_drm_cs*)base;
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
                               struct radeon_bo *bo)
 {
@@ -94,7 +94,7 @@ radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
            (num_refs && radeon_get_reloc(cs->csc, bo) != -1);
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
                                         struct radeon_bo *bo)
 {
@@ -110,7 +110,7 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
     return cs->csc->relocs[index].write_domain != 0;
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
 {
     return bo->num_cs_references != 0;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 99c8b8a8a1d..308b5bd976d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -97,7 +97,7 @@ struct radeon_drm_winsys {
     struct radeon_drm_cs *cs_stack[RING_LAST];
 };
 
-static INLINE struct radeon_drm_winsys *
+static inline struct radeon_drm_winsys *
 radeon_drm_winsys(struct radeon_winsys *base)
 {
     return (struct radeon_drm_winsys*)base;
diff --git a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
index fceb0897058..5ef95f3d6a9 100644
--- a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
+++ b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
@@ -127,7 +127,7 @@ struct fenced_buffer
 };
 
 
-static INLINE struct fenced_manager *
+static inline struct fenced_manager *
 fenced_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -135,7 +135,7 @@ fenced_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct fenced_buffer *
+static inline struct fenced_buffer *
 fenced_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -204,7 +204,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 }
 
 
-static INLINE void
+static inline void
 fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
                              struct fenced_buffer *fenced_buf)
 {
@@ -228,7 +228,7 @@ fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
  *
  * Reference count should be incremented before calling this function.
  */
-static INLINE void
+static inline void
 fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
@@ -252,7 +252,7 @@ fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
  *
  * Returns TRUE if the buffer was detroyed.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -289,7 +289,7 @@ fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
  * This function will release and re-acquire the mutex, so any copy of mutable
  * state must be discarded after calling it.
  */
-static INLINE enum pipe_error
+static inline enum pipe_error
 fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -430,7 +430,7 @@ fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
  * This function is a shorthand around pb_manager::create_buffer for
  * fenced_buffer_create_gpu_storage_locked()'s benefit.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
                                             struct fenced_buffer *fenced_buf,
                                             const struct pb_desc *desc)
diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.c b/src/gallium/winsys/svga/drm/vmw_buffer.c
index c516054b7fc..7eab3d050e4 100644
--- a/src/gallium/winsys/svga/drm/vmw_buffer.c
+++ b/src/gallium/winsys/svga/drm/vmw_buffer.c
@@ -69,7 +69,7 @@ struct vmw_gmr_buffer
 extern const struct pb_vtbl vmw_gmr_buffer_vtbl;
 
 
-static INLINE struct vmw_gmr_buffer *
+static inline struct vmw_gmr_buffer *
 vmw_gmr_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -86,7 +86,7 @@ struct vmw_gmr_bufmgr
 };
 
 
-static INLINE struct vmw_gmr_bufmgr *
+static inline struct vmw_gmr_bufmgr *
 vmw_gmr_bufmgr(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.h b/src/gallium/winsys/svga/drm/vmw_buffer.h
index e0bb8085a48..b9cbb25541f 100644
--- a/src/gallium/winsys/svga/drm/vmw_buffer.h
+++ b/src/gallium/winsys/svga/drm/vmw_buffer.h
@@ -59,7 +59,7 @@ struct debug_flush_buf *
 vmw_debug_flush_buf(struct svga_winsys_buffer *buffer);
 
 #else
-static INLINE struct pb_buffer *
+static inline struct pb_buffer *
 vmw_pb_buffer(struct svga_winsys_buffer *buffer)
 {
    assert(buffer);
@@ -67,7 +67,7 @@ vmw_pb_buffer(struct svga_winsys_buffer *buffer)
 }
 
 
-static INLINE struct svga_winsys_buffer *
+static inline struct svga_winsys_buffer *
 vmw_svga_winsys_buffer_wrap(struct pb_buffer *buffer)
 {
    return (struct svga_winsys_buffer *)buffer;
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 4e1c41db886..31bedde7c41 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -152,7 +152,7 @@ struct vmw_svga_winsys_context
 };
 
 
-static INLINE struct vmw_svga_winsys_context *
+static inline struct vmw_svga_winsys_context *
 vmw_svga_winsys_context(struct svga_winsys_context *swc)
 {
    assert(swc);
@@ -160,7 +160,7 @@ vmw_svga_winsys_context(struct svga_winsys_context *swc)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 vmw_translate_to_pb_flags(unsigned flags)
 {
    unsigned f = 0;
diff --git a/src/gallium/winsys/svga/drm/vmw_fence.c b/src/gallium/winsys/svga/drm/vmw_fence.c
index 1b24239a7ce..17822ce27fd 100644
--- a/src/gallium/winsys/svga/drm/vmw_fence.c
+++ b/src/gallium/winsys/svga/drm/vmw_fence.c
@@ -67,7 +67,7 @@ struct vmw_fence
  * @ops: Pointer to a struct pb_fence_ops.
  *
  */
-static INLINE boolean
+static inline boolean
 vmw_fence_seq_is_signaled(uint32_t seq, uint32_t last, uint32_t cur)
 {
    return (cur - last <= cur - seq);
@@ -81,7 +81,7 @@ vmw_fence_seq_is_signaled(uint32_t seq, uint32_t last, uint32_t cur)
  * @ops: Pointer to a struct pb_fence_ops.
  *
  */
-static INLINE struct vmw_fence_ops *
+static inline struct vmw_fence_ops *
 vmw_fence_ops(struct pb_fence_ops *ops)
 {
    assert(ops);
@@ -162,7 +162,7 @@ out_unlock:
  *
  * @fence: The opaque pipe fence handle.
  */
-static INLINE struct vmw_fence *
+static inline struct vmw_fence *
 vmw_fence(struct pipe_fence_handle *fence)
 {
    return (struct vmw_fence *) fence;
diff --git a/src/gallium/winsys/svga/drm/vmw_screen.h b/src/gallium/winsys/svga/drm/vmw_screen.h
index fd76e614a5e..ce98db9b397 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen.h
+++ b/src/gallium/winsys/svga/drm/vmw_screen.h
@@ -102,7 +102,7 @@ struct vmw_winsys_screen
 };
 
 
-static INLINE struct vmw_winsys_screen *
+static inline struct vmw_winsys_screen *
 vmw_winsys_screen(struct svga_winsys_screen *base)
 {
    return (struct vmw_winsys_screen *)base;
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 9f335900e68..e70e0fec4a3 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -126,7 +126,7 @@ out_no_vws:
    return NULL;
 }
 
-static INLINE boolean
+static inline boolean
 vmw_dri1_intersect_src_bbox(struct drm_clip_rect *dst,
 			    int dst_x,
 			    int dst_y,
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
index 14c3b2068c6..e2f0da58bf9 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -650,7 +650,7 @@ vmw_ioctl_fence_unref(struct vmw_winsys_screen *vws,
       vmw_error("%s Failed\n", __FUNCTION__);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 vmw_drm_fence_flags(uint32_t flags)
 {
     uint32_t dflags = 0;
diff --git a/src/gallium/winsys/svga/drm/vmw_shader.h b/src/gallium/winsys/svga/drm/vmw_shader.h
index 1fd8c3311f9..28f99717391 100644
--- a/src/gallium/winsys/svga/drm/vmw_shader.h
+++ b/src/gallium/winsys/svga/drm/vmw_shader.h
@@ -47,14 +47,14 @@ struct vmw_svga_winsys_shader
    uint32_t shid;
 };
 
-static INLINE struct svga_winsys_gb_shader *
+static inline struct svga_winsys_gb_shader *
 svga_winsys_shader(struct vmw_svga_winsys_shader *shader)
 {
    assert(!shader || shader->shid != SVGA3D_INVALID_ID);
    return (struct svga_winsys_gb_shader *)shader;
 }
 
-static INLINE struct vmw_svga_winsys_shader *
+static inline struct vmw_svga_winsys_shader *
 vmw_svga_winsys_shader(struct svga_winsys_gb_shader *shader)
 {
    return (struct vmw_svga_winsys_shader *)shader;
diff --git a/src/gallium/winsys/svga/drm/vmw_surface.h b/src/gallium/winsys/svga/drm/vmw_surface.h
index e44d0554fbc..1291f380aa2 100644
--- a/src/gallium/winsys/svga/drm/vmw_surface.h
+++ b/src/gallium/winsys/svga/drm/vmw_surface.h
@@ -68,7 +68,7 @@ struct vmw_svga_winsys_surface
 };
 
 
-static INLINE struct svga_winsys_surface *
+static inline struct svga_winsys_surface *
 svga_winsys_surface(struct vmw_svga_winsys_surface *surf)
 {
    assert(!surf || surf->sid != SVGA3D_INVALID_ID);
@@ -76,7 +76,7 @@ svga_winsys_surface(struct vmw_svga_winsys_surface *surf)
 }
 
 
-static INLINE struct vmw_svga_winsys_surface *
+static inline struct vmw_svga_winsys_surface *
 vmw_svga_winsys_surface(struct svga_winsys_surface *surf)
 {
    return (struct vmw_svga_winsys_surface *)surf;
diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.c b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
index 6fed22bbd7c..8451d832806 100644
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -55,13 +55,13 @@ struct dri_sw_winsys
    struct drisw_loader_funcs *lf;
 };
 
-static INLINE struct dri_sw_displaytarget *
+static inline struct dri_sw_displaytarget *
 dri_sw_displaytarget( struct sw_displaytarget *dt )
 {
    return (struct dri_sw_displaytarget *)dt;
 }
 
-static INLINE struct dri_sw_winsys *
+static inline struct dri_sw_winsys *
 dri_sw_winsys( struct sw_winsys *ws )
 {
    return (struct dri_sw_winsys *)ws;
diff --git a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
index aae3ec55a25..dc725f4b90c 100644
--- a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
+++ b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
@@ -62,7 +62,7 @@ struct gdi_sw_displaytarget
 
 
 /** Cast wrapper */
-static INLINE struct gdi_sw_displaytarget *
+static inline struct gdi_sw_displaytarget *
 gdi_sw_displaytarget( struct sw_displaytarget *buf )
 {
    return (struct gdi_sw_displaytarget *)buf;
diff --git a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
index a71d2a76791..89dd5471b09 100644
--- a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
+++ b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
@@ -67,7 +67,7 @@ struct haiku_displaytarget
 
 
 // Cast
-static INLINE struct haiku_displaytarget*
+static inline struct haiku_displaytarget*
 hgl_sw_displaytarget(struct sw_displaytarget* target)
 {
 	return (struct haiku_displaytarget *)target;
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 740b9201140..900c49f83e6 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -83,13 +83,13 @@ struct kms_sw_winsys
    struct list_head bo_list;
 };
 
-static INLINE struct kms_sw_displaytarget *
+static inline struct kms_sw_displaytarget *
 kms_sw_displaytarget( struct sw_displaytarget *dt )
 {
    return (struct kms_sw_displaytarget *)dt;
 }
 
-static INLINE struct kms_sw_winsys *
+static inline struct kms_sw_winsys *
 kms_sw_winsys( struct sw_winsys *ws )
 {
    return (struct kms_sw_winsys *)ws;
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index a6bf4985e1e..9b90eaa018b 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -66,13 +66,13 @@ struct wrapper_sw_displaytarget
    void *ptr;
 };
 
-static INLINE struct wrapper_sw_winsys *
+static inline struct wrapper_sw_winsys *
 wrapper_sw_winsys(struct sw_winsys *ws)
 {
    return (struct wrapper_sw_winsys *)ws;
 }
 
-static INLINE struct wrapper_sw_displaytarget *
+static inline struct wrapper_sw_displaytarget *
 wrapper_sw_displaytarget(struct sw_displaytarget *dt)
 {
    return (struct wrapper_sw_displaytarget *)dt;
diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
index 88310718049..515ecd9f7b7 100644
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -92,7 +92,7 @@ struct xlib_sw_winsys
 
 
 /** Cast wrapper */
-static INLINE struct xlib_displaytarget *
+static inline struct xlib_displaytarget *
 xlib_displaytarget(struct sw_displaytarget *dt)
 {
    return (struct xlib_displaytarget *) dt;
diff --git a/src/mesa/state_tracker/st_cb_perfmon.h b/src/mesa/state_tracker/st_cb_perfmon.h
index 13d3627de5d..0b195de47fe 100644
--- a/src/mesa/state_tracker/st_cb_perfmon.h
+++ b/src/mesa/state_tracker/st_cb_perfmon.h
@@ -46,7 +46,7 @@ struct st_perf_counter_object
 /**
  * Cast wrapper
  */
-static INLINE struct st_perf_monitor_object *
+static inline struct st_perf_monitor_object *
 st_perf_monitor_object(struct gl_perf_monitor_object *q)
 {
    return (struct st_perf_monitor_object *)q;

From 4c7196b684fe384599c1a02bf20aec7b6447968d Mon Sep 17 00:00:00 2001
From: Jonathan Gray <jsg@jsg.id.au>
Date: Thu, 16 Jul 2015 01:17:37 +1000
Subject: [PATCH 0406/1208] mesa: include stdarg.h for va_list

Include stdarg.h for va_list.  Unbreaks the build on OpenBSD:

In file included from mesa/program/dummy_errors.c:24:
../src/mesa/main/errors.h:85: error: expected declaration specifiers or '...' before 'va_list'

Signed-off-by: Jonathan Gray <jsg@jsg.id.au>
Acked-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/errors.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index 24f234f7f10..81e47a8b8c1 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -37,6 +37,7 @@
 
 
 #include <stdio.h>
+#include <stdarg.h>
 #include "compiler.h"
 #include "glheader.h"
 #include "mtypes.h"

From 98a6c5ea1129f18ed5f097fad5bbebc86eb0e862 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0407/1208] mesa: allow GL_TEXTURE_CUBE_MAP_ARRAY case for
 glCompressedTexSubImage3D()

Since s3tc works for cube maps and 2D arrays, it should also work for
cube arrays.  NVIDIA's driver supports this too.  Seems like the spec
should say this.

This is a minor follow-on fix for the commit "mesa: fix up some texture
error checks".

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/teximage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 842b50d8b40..6aa1bb4e9e5 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4588,7 +4588,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
        * are valid here, which they are not, but of course not mentioned by
        * core spec.
        */
-      if (target != GL_TEXTURE_2D_ARRAY) {
+      if (target != GL_TEXTURE_2D_ARRAY && target != GL_TEXTURE_CUBE_MAP_ARRAY) {
          bool invalidformat;
          switch (format) {
             /* These came from _mesa_is_compressed_format in glformats.c. */

From d7bd9fa1a363c288324d73fbde86f2257dfc0a15 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0408/1208] mesa: assorted whitespace, formatting fixes in
 teximage.c

Trivial.
---
 src/mesa/main/teximage.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 6aa1bb4e9e5..3b309abbe14 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2743,7 +2743,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
        * types for SNORM formats. Also, conversion to SNORM formats is not
        * allowed by Table 3.2 on Page 110.
        */
-      if(_mesa_is_enum_format_snorm(internalFormat)) {
+      if (_mesa_is_enum_format_snorm(internalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
                      _mesa_enum_to_string(internalFormat));
@@ -3503,7 +3503,6 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
       _mesa_dirty_texobj(ctx, texObj);
    }
    _mesa_unlock_texture(ctx, texObj);
-
 }
 
 
@@ -3850,8 +3849,7 @@ copytexsubimage_by_slice(struct gl_context *ctx,
 }
 
 static GLboolean
-formats_differ_in_component_sizes (mesa_format f1,
-                                   mesa_format f2)
+formats_differ_in_component_sizes(mesa_format f1, mesa_format f2)
 {
    GLint f1_r_bits = _mesa_get_format_bits(f1, GL_RED_BITS);
    GLint f1_g_bits = _mesa_get_format_bits(f1, GL_GREEN_BITS);
@@ -3924,8 +3922,8 @@ copyteximage(struct gl_context *ctx, GLuint dims,
        */
          if (rb->InternalFormat == GL_RGB10_A2) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer and"
-                           " writing to unsized internal format)", dims);
+                           "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer"
+                           " and writing to unsized internal format)", dims);
                return;
          }
       }
@@ -4833,8 +4831,7 @@ _mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format, true,
                                           "glCompressedTextureSubImage1D")) {
       return;
    }
@@ -4911,8 +4908,7 @@ _mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format, true,
                                           "glCompressedTextureSubImage2D")) {
       return;
    }
@@ -4989,8 +4985,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format, true,
                                           "glCompressedTextureSubImage3D")) {
       return;
    }
@@ -5439,7 +5434,6 @@ _mesa_TexBufferRange(GLenum target, GLenum internalFormat, GLuint buffer,
          return;
 
    } else {
-
       /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer
        * Textures (PDF page 254):
        *    "If buffer is zero, then any buffer object attached to the buffer
@@ -5507,7 +5501,6 @@ _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer,
          return;
 
    } else {
-
       /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer
        * Textures (PDF page 254):
        *    "If buffer is zero, then any buffer object attached to the buffer
@@ -5553,12 +5546,10 @@ check_multisample_target(GLuint dims, GLenum target, bool dsa)
       return dims == 2;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
       return dims == 2 && !dsa;
-
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return dims == 3;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return dims == 3 && !dsa;
-
    default:
       return GL_FALSE;
    }
@@ -5670,13 +5661,12 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    else {
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-               "%s(invalid width or height)", func);
+                     "%s(invalid width or height)", func);
          return;
       }
 
       if (!sizeOK) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY,
-               "%s(texture too large)", func);
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(texture too large)", func);
          return;
       }
 
@@ -5694,7 +5684,7 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
 
       if (width > 0 && height > 0 && depth > 0) {
          if (!ctx->Driver.AllocTextureStorage(ctx, texObj, 1,
-                  width, height, depth)) {
+                                              width, height, depth)) {
             /* tidy up the texture image state. strictly speaking,
              * we're allowed to just leave this in whatever state we
              * like, but being tidy is good.

From 2a2c9469425bc794c98dcf57237457ba41d10ce2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0409/1208] meta: handle subimages in
 _mesa_meta_setup_texture_coords()

v2: fix depth, total_depth mix-up in meta.h, per Laura Ekstrand.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/drivers/common/meta.c                | 88 ++++++++++++-------
 src/mesa/drivers/common/meta.h                |  6 +-
 .../drivers/common/meta_generate_mipmap.c     |  4 +-
 3 files changed, 65 insertions(+), 33 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 9a75019d059..54c3d5ac66e 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -2449,30 +2449,53 @@ _mesa_meta_Bitmap(struct gl_context *ctx,
 
 /**
  * Compute the texture coordinates for the four vertices of a quad for
- * drawing a 2D texture image or slice of a cube/3D texture.
+ * drawing a 2D texture image or slice of a cube/3D texture.  The offset
+ * and width, height specify a sub-region of the 2D image.
+ *
  * \param faceTarget  GL_TEXTURE_1D/2D/3D or cube face name
  * \param slice  slice of a 1D/2D array texture or 3D texture
- * \param width  width of the texture image
- * \param height  height of the texture image
+ * \param xoffset  X position of sub texture
+ * \param yoffset  Y position of sub texture
+ * \param width  width of the sub texture image
+ * \param height  height of the sub texture image
+ * \param total_width  total width of the texture image
+ * \param total_height  total height of the texture image
+ * \param total_depth  total depth of the texture image
  * \param coords0/1/2/3  returns the computed texcoords
  */
 void
 _mesa_meta_setup_texture_coords(GLenum faceTarget,
                                 GLint slice,
+                                GLint xoffset,
+                                GLint yoffset,
                                 GLint width,
                                 GLint height,
-                                GLint depth,
+                                GLint total_width,
+                                GLint total_height,
+                                GLint total_depth,
                                 GLfloat coords0[4],
                                 GLfloat coords1[4],
                                 GLfloat coords2[4],
                                 GLfloat coords3[4])
 {
-   static const GLfloat st[4][2] = {
-      {0.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f}
-   };
+   float st[4][2];
    GLuint i;
+   const float s0 = (float) xoffset / (float) total_width;
+   const float s1 = (float) (xoffset + width) / (float) total_width;
+   const float t0 = (float) yoffset / (float) total_height;
+   const float t1 = (float) (yoffset + height) / (float) total_height;
    GLfloat r;
 
+   /* setup the reference texcoords */
+   st[0][0] = s0;
+   st[0][1] = t0;
+   st[1][0] = s1;
+   st[1][1] = t0;
+   st[2][0] = s1;
+   st[2][1] = t1;
+   st[3][0] = s0;
+   st[3][1] = t1;
+
    if (faceTarget == GL_TEXTURE_CUBE_MAP_ARRAY)
       faceTarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + slice % 6;
 
@@ -2489,52 +2512,52 @@ _mesa_meta_setup_texture_coords(GLenum faceTarget,
    case GL_TEXTURE_3D:
    case GL_TEXTURE_2D_ARRAY:
       if (faceTarget == GL_TEXTURE_3D) {
-         assert(slice < depth);
-         assert(depth >= 1);
-         r = (slice + 0.5f) / depth;
+         assert(slice < total_depth);
+         assert(total_depth >= 1);
+         r = (slice + 0.5f) / total_depth;
       }
       else if (faceTarget == GL_TEXTURE_2D_ARRAY)
          r = (float) slice;
       else
          r = 0.0F;
-      coords0[0] = 0.0F; /* s */
-      coords0[1] = 0.0F; /* t */
+      coords0[0] = st[0][0]; /* s */
+      coords0[1] = st[0][1]; /* t */
       coords0[2] = r; /* r */
-      coords1[0] = 1.0F;
-      coords1[1] = 0.0F;
+      coords1[0] = st[1][0];
+      coords1[1] = st[1][1];
       coords1[2] = r;
-      coords2[0] = 1.0F;
-      coords2[1] = 1.0F;
+      coords2[0] = st[2][0];
+      coords2[1] = st[2][1];
       coords2[2] = r;
-      coords3[0] = 0.0F;
-      coords3[1] = 1.0F;
+      coords3[0] = st[3][0];
+      coords3[1] = st[3][1];
       coords3[2] = r;
       break;
    case GL_TEXTURE_RECTANGLE_ARB:
-      coords0[0] = 0.0F; /* s */
-      coords0[1] = 0.0F; /* t */
+      coords0[0] = (float) xoffset; /* s */
+      coords0[1] = (float) yoffset; /* t */
       coords0[2] = 0.0F; /* r */
-      coords1[0] = (float) width;
-      coords1[1] = 0.0F;
+      coords1[0] = (float) (xoffset + width);
+      coords1[1] = (float) yoffset;
       coords1[2] = 0.0F;
-      coords2[0] = (float) width;
-      coords2[1] = (float) height;
+      coords2[0] = (float) (xoffset + width);
+      coords2[1] = (float) (yoffset + height);
       coords2[2] = 0.0F;
-      coords3[0] = 0.0F;
-      coords3[1] = (float) height;
+      coords3[0] = (float) xoffset;
+      coords3[1] = (float) (yoffset + height);
       coords3[2] = 0.0F;
       break;
    case GL_TEXTURE_1D_ARRAY:
-      coords0[0] = 0.0F; /* s */
+      coords0[0] = st[0][0]; /* s */
       coords0[1] = (float) slice; /* t */
       coords0[2] = 0.0F; /* r */
-      coords1[0] = 1.0f;
+      coords1[0] = st[1][0];
       coords1[1] = (float) slice;
       coords1[2] = 0.0F;
-      coords2[0] = 1.0F;
+      coords2[0] = st[2][0];
       coords2[1] = (float) slice;
       coords2[2] = 0.0F;
-      coords3[0] = 0.0F;
+      coords3[0] = st[3][0];
       coords3[1] = (float) slice;
       coords3[2] = 0.0F;
       break;
@@ -3069,7 +3092,10 @@ decompress_texture_image(struct gl_context *ctx,
    /* Silence valgrind warnings about reading uninitialized stack. */
    memset(verts, 0, sizeof(verts));
 
-   _mesa_meta_setup_texture_coords(faceTarget, slice, width, height, depth,
+   _mesa_meta_setup_texture_coords(faceTarget, slice,
+                                   0, 0, width, height,
+                                   texImage->Width, texImage->Height,
+                                   texImage->Depth,
                                    verts[0].tex,
                                    verts[1].tex,
                                    verts[2].tex,
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index e7d894df1d7..f5b74c477f7 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -594,9 +594,13 @@ _mesa_meta_alloc_texture(struct temp_texture *tex,
 void
 _mesa_meta_setup_texture_coords(GLenum faceTarget,
                                 GLint slice,
+                                GLint xoffset,
+                                GLint yoffset,
                                 GLint width,
                                 GLint height,
-                                GLint depth,
+                                GLint total_width,
+                                GLint total_height,
+                                GLint total_depth,
                                 GLfloat coords0[4],
                                 GLfloat coords1[4],
                                 GLfloat coords2[4],
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index f764ebad718..0655f052219 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -317,7 +317,9 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
          /* Setup texture coordinates */
          _mesa_meta_setup_texture_coords(faceTarget,
                                          layer,
-                                         0, 0, 1, /* width, height never used here */
+                                         0, 0, /* xoffset, yoffset */
+                                         srcWidth, srcHeight, /* img size */
+                                         srcWidth, srcHeight, srcDepth,
                                          verts[0].tex,
                                          verts[1].tex,
                                          verts[2].tex,

From 096371879098c315bc054b6fe1ef6f4b8f18554f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0410/1208] meta: add offset, width, height parameters to
 decompress_texture_image()

In preparation for decompressing texture sub images.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/drivers/common/meta.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 54c3d5ac66e..34a8e4b9d9a 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -2966,15 +2966,14 @@ static bool
 decompress_texture_image(struct gl_context *ctx,
                          struct gl_texture_image *texImage,
                          GLuint slice,
+                         GLint xoffset, GLint yoffset,
+                         GLsizei width, GLsizei height,
                          GLenum destFormat, GLenum destType,
                          GLvoid *dest)
 {
    struct decompress_state *decompress = &ctx->Meta->Decompress;
    struct decompress_fbo_state *decompress_fbo;
    struct gl_texture_object *texObj = texImage->TexObject;
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Height;
    const GLenum target = texObj->Target;
    GLenum rbFormat;
    GLenum faceTarget;
@@ -3093,7 +3092,7 @@ decompress_texture_image(struct gl_context *ctx,
    memset(verts, 0, sizeof(verts));
 
    _mesa_meta_setup_texture_coords(faceTarget, slice,
-                                   0, 0, width, height,
+                                   xoffset, yoffset, width, height,
                                    texImage->Width, texImage->Height,
                                    texImage->Depth,
                                    verts[0].tex,
@@ -3224,7 +3223,8 @@ _mesa_meta_GetTexImage(struct gl_context *ctx,
          else {
             dst = pixels;
          }
-         result = decompress_texture_image(ctx, texImage, slice,
+         result = decompress_texture_image(ctx, texImage, slice, 0, 0,
+                                           texImage->Width, texImage->Height,
                                            format, type, dst);
          if (!result)
             break;

From e693fc299f1f78502b9201f1e1e8f333566c9fb6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0411/1208] mesa: replace Driver.GetTexImage with
 GetTexSubImage()

The new driver hook has x/y/zoffset and width/height/depth parameters
for the new glGetTextureSubImage() function.

The meta code and gallium state tracker are updated to handle the
new parameters.

Callers to Driver.GetTexSubImage() pass in offsets=0 and sizes equal
to the whole texture size.

v2: update i965 driver code, s/GLint/GLsizei/ in GetTexSubImage hook

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/drivers/common/driverfuncs.c       |  2 +-
 src/mesa/drivers/common/meta.c              | 22 +++++++-------
 src/mesa/drivers/common/meta.h              |  8 +++--
 src/mesa/drivers/dri/i965/intel_tex_image.c | 25 +++++++++-------
 src/mesa/main/dd.h                          | 10 ++++---
 src/mesa/main/debug.c                       |  4 ++-
 src/mesa/main/mipmap.c                      |  9 ++++--
 src/mesa/main/texgetimage.c                 | 15 ++++++----
 src/mesa/main/texgetimage.h                 |  9 +++---
 src/mesa/state_tracker/st_cb_texture.c      | 33 +++++++++++++--------
 10 files changed, 83 insertions(+), 54 deletions(-)

diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 71c1a763912..ce99620c5b2 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -94,7 +94,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->QuerySamplesForFormat = _mesa_query_samples_for_format;
    driver->TexImage = _mesa_store_teximage;
    driver->TexSubImage = _mesa_store_texsubimage;
-   driver->GetTexImage = _mesa_meta_GetTexImage;
+   driver->GetTexSubImage = _mesa_meta_GetTexSubImage;
    driver->ClearTexSubImage = _mesa_meta_ClearTexSubImage;
    driver->CopyTexSubImage = _mesa_meta_CopyTexSubImage;
    driver->GenerateMipmap = _mesa_meta_GenerateMipmap;
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 34a8e4b9d9a..12045ebbf8e 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -3196,15 +3196,17 @@ decompress_texture_image(struct gl_context *ctx,
  * from core Mesa.
  */
 void
-_mesa_meta_GetTexImage(struct gl_context *ctx,
-                       GLenum format, GLenum type, GLvoid *pixels,
-                       struct gl_texture_image *texImage)
+_mesa_meta_GetTexSubImage(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage)
 {
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
       GLuint slice;
       bool result = true;
 
-      for (slice = 0; slice < texImage->Depth; slice++) {
+      for (slice = 0; slice < depth; slice++) {
          void *dst;
          if (texImage->TexObject->Target == GL_TEXTURE_2D_ARRAY
              || texImage->TexObject->Target == GL_TEXTURE_CUBE_MAP_ARRAY) {
@@ -3216,15 +3218,14 @@ _mesa_meta_GetTexImage(struct gl_context *ctx,
             struct gl_pixelstore_attrib packing = ctx->Pack;
             packing.SkipPixels = 0;
             packing.SkipRows = 0;
-            dst = _mesa_image_address3d(&packing, pixels, texImage->Width,
-                                        texImage->Height, format, type,
-                                        slice, 0, 0);
+            dst = _mesa_image_address3d(&packing, pixels, width, height,
+                                        format, type, slice, 0, 0);
          }
          else {
             dst = pixels;
          }
-         result = decompress_texture_image(ctx, texImage, slice, 0, 0,
-                                           texImage->Width, texImage->Height,
+         result = decompress_texture_image(ctx, texImage, slice,
+                                           xoffset, yoffset, width, height,
                                            format, type, dst);
          if (!result)
             break;
@@ -3234,7 +3235,8 @@ _mesa_meta_GetTexImage(struct gl_context *ctx,
          return;
    }
 
-   _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage);
+   _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
+                           width, height, depth, format, type, pixels, texImage);
 }
 
 
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index f5b74c477f7..fe439153aa0 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -560,9 +560,11 @@ _mesa_meta_ClearTexSubImage(struct gl_context *ctx,
                             const GLvoid *clearValue);
 
 extern void
-_mesa_meta_GetTexImage(struct gl_context *ctx,
-                       GLenum format, GLenum type, GLvoid *pixels,
-                       struct gl_texture_image *texImage);
+_mesa_meta_GetTexSubImage(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage);
 
 extern void
 _mesa_meta_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 536cc406846..3611280012e 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -471,18 +471,21 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
 }
 
 static void
-intel_get_tex_image(struct gl_context *ctx,
-                    GLenum format, GLenum type, GLvoid *pixels,
-                    struct gl_texture_image *texImage) {
+intel_get_tex_sub_image(struct gl_context *ctx,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage)
+{
    struct brw_context *brw = brw_context(ctx);
    bool ok;
 
    DBG("%s\n", __func__);
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, 0, 0, 0,
-                                        texImage->Width, texImage->Height,
-                                        texImage->Depth, format, type,
+      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage,
+                                        xoffset, yoffset, zoffset,
+                                        width, height, depth, format, type,
                                         pixels, &ctx->Pack)) {
          /* Flush to guarantee coherency between the render cache and other
           * caches the PBO could potentially be bound to after this point.
@@ -496,14 +499,16 @@ intel_get_tex_image(struct gl_context *ctx,
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
 
-   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, 0, 0,
-                                          texImage->Width, texImage->Height,
+   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
+                                          width, height,
                                           format, type, pixels, &ctx->Pack);
 
    if(ok)
       return;
 
-   _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage);
+   _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset,
+                             width, height, depth,
+                             format, type, pixels, texImage);
 
    DBG("%s - DONE\n", __func__);
 }
@@ -514,5 +519,5 @@ intelInitTextureImageFuncs(struct dd_function_table *functions)
    functions->TexImage = intelTexImage;
    functions->EGLImageTargetTexture2D = intel_image_target_texture_2d;
    functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image;
-   functions->GetTexImage = intel_get_tex_image;
+   functions->GetTexSubImage = intel_get_tex_sub_image;
 }
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index d783e34222f..12e54de8b08 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -232,11 +232,13 @@ struct dd_function_table {
 
 
    /**
-    * Called by glGetTexImage().
+    * Called by glGetTexImage(), glGetTextureSubImage().
     */
-   void (*GetTexImage)( struct gl_context *ctx,
-                        GLenum format, GLenum type, GLvoid *pixels,
-                        struct gl_texture_image *texImage );
+   void (*GetTexSubImage)(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage);
 
    /**
     * Called by glClearTex[Sub]Image
diff --git a/src/mesa/main/debug.c b/src/mesa/main/debug.c
index 3090a007707..5ca7d5ce500 100644
--- a/src/mesa/main/debug.c
+++ b/src/mesa/main/debug.c
@@ -272,7 +272,9 @@ write_texture_image(struct gl_texture_object *texObj,
       store = ctx->Pack; /* save */
       ctx->Pack = ctx->DefaultPacking;
 
-      ctx->Driver.GetTexImage(ctx, GL_RGBA, GL_UNSIGNED_BYTE, buffer, img);
+      ctx->Driver.GetTexSubImage(ctx,
+                                 0, 0, 0, img->Width, img->Height, img->Depth,
+                                 GL_RGBA, GL_UNSIGNED_BYTE, buffer, img);
 
       /* make filename */
       _mesa_snprintf(s, sizeof(s), "/tmp/tex%u.l%u.f%u.ppm", texObj->Name, level, face);
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 7732d09b2ec..1e22f930092 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -2077,9 +2077,12 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
 
       /* Get the uncompressed image */
       assert(srcImage->Level == texObj->BaseLevel);
-      ctx->Driver.GetTexImage(ctx,
-                              temp_base_format, temp_datatype,
-                              temp_src, srcImage);
+      ctx->Driver.GetTexSubImage(ctx,
+                                 0, 0, 0,
+                                 srcImage->Width, srcImage->Height,
+                                 srcImage->Depth,
+                                 temp_base_format, temp_datatype,
+                                 temp_src, srcImage);
       /* restore packing mode */
       ctx->Pack = save;
    }
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index de6db44ac26..a067aa57975 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -684,15 +684,17 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
 
 
 /**
- * This is the software fallback for Driver.GetTexImage().
+ * This is the software fallback for Driver.GetTexSubImage().
  * All error checking will have been done before this routine is called.
  * We'll call ctx->Driver.MapTextureImage() to access the data, then
  * unmap with ctx->Driver.UnmapTextureImage().
  */
 void
-_mesa_GetTexImage_sw(struct gl_context *ctx,
-                     GLenum format, GLenum type, GLvoid *pixels,
-                     struct gl_texture_image *texImage)
+_mesa_GetTexSubImage_sw(struct gl_context *ctx,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage)
 {
    const GLuint dimensions =
       _mesa_get_texture_dimensions(texImage->TexObject->Target);
@@ -1022,7 +1024,10 @@ _mesa_get_texture_image(struct gl_context *ctx,
 
    _mesa_lock_texture(ctx, texObj);
    {
-      ctx->Driver.GetTexImage(ctx, format, type, pixels, texImage);
+      ctx->Driver.GetTexSubImage(ctx, 0, 0, 0,
+                                 texImage->Width, texImage->Height,
+                                 texImage->Depth,
+                                 format, type, pixels, texImage);
    }
    _mesa_unlock_texture(ctx, texObj);
 }
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 1fa2f59dcdc..611b1bd053f 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -37,10 +37,11 @@ extern GLenum
 _mesa_base_pack_format(GLenum format);
 
 extern void
-_mesa_GetTexImage_sw(struct gl_context *ctx,
-                     GLenum format, GLenum type, GLvoid *pixels,
-                     struct gl_texture_image *texImage);
-
+_mesa_GetTexSubImage_sw(struct gl_context *ctx,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage);
 
 extern void
 _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 7ea3846fff1..8719ab82f65 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -896,7 +896,7 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
 
 
 /**
- * Called via ctx->Driver.GetTexImage()
+ * Called via ctx->Driver.GetTexSubImage()
  *
  * This uses a blit to copy the texture to a texture format which matches
  * the format and type combo and then a fast read-back is done using memcpy.
@@ -910,16 +910,15 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
  *       we do here should be free in such cases.
  */
 static void
-st_GetTexImage(struct gl_context * ctx,
-               GLenum format, GLenum type, GLvoid * pixels,
-               struct gl_texture_image *texImage)
+st_GetTexSubImage(struct gl_context * ctx,
+                  GLint xoffset, GLint yoffset, GLint zoffset,
+                  GLsizei width, GLsizei height, GLint depth,
+                  GLenum format, GLenum type, GLvoid * pixels,
+                  struct gl_texture_image *texImage)
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
    struct pipe_screen *screen = pipe->screen;
-   GLuint width = texImage->Width;
-   GLuint height = texImage->Height;
-   GLuint depth = texImage->Depth;
    struct st_texture_image *stImage = st_texture_image(texImage);
    struct st_texture_object *stObj = st_texture_object(texImage->TexObject);
    struct pipe_resource *src = stObj->pt;
@@ -1054,7 +1053,7 @@ st_GetTexImage(struct gl_context * ctx,
       }
    }
 
-   /* create the destination texture */
+   /* create the destination texture of size (width X height X depth) */
    memset(&dst_templ, 0, sizeof(dst_templ));
    dst_templ.target = pipe_target;
    dst_templ.format = dst_format;
@@ -1076,6 +1075,10 @@ st_GetTexImage(struct gl_context * ctx,
       height = 1;
    }
 
+   assert(texImage->Face == 0 ||
+          texImage->TexObject->MinLayer == 0 ||
+          zoffset == 0);
+
    memset(&blit, 0, sizeof(blit));
    blit.src.resource = src;
    blit.src.level = texImage->Level + texImage->TexObject->MinLevel;
@@ -1083,9 +1086,11 @@ st_GetTexImage(struct gl_context * ctx,
    blit.dst.resource = dst;
    blit.dst.level = 0;
    blit.dst.format = dst->format;
-   blit.src.box.x = blit.dst.box.x = 0;
-   blit.src.box.y = blit.dst.box.y = 0;
-   blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer;
+   blit.src.box.x = xoffset;
+   blit.dst.box.x = 0;
+   blit.src.box.y = yoffset;
+   blit.dst.box.y = 0;
+   blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer + zoffset;
    blit.dst.box.z = 0;
    blit.src.box.width = blit.dst.box.width = width;
    blit.src.box.height = blit.dst.box.height = height;
@@ -1206,7 +1211,9 @@ end:
 
 fallback:
    if (!done) {
-      _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage);
+      _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
+                              width, height, depth,
+                              format, type, pixels, texImage);
    }
 }
 
@@ -1876,7 +1883,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage = st_CopyTexSubImage;
    functions->GenerateMipmap = st_generate_mipmap;
 
-   functions->GetTexImage = st_GetTexImage;
+   functions->GetTexSubImage = st_GetTexSubImage;
 
    /* compressed texture functions */
    functions->CompressedTexImage = st_CompressedTexImage;

From 1ad305b612f389fb04c6d51847427d5ec72fae03 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0412/1208] mesa: plumb offset/size parameters through
 GetTexSubImage code

Needed for GL_ARB_get_texture_sub_image.  But at this point, the
offsets are always zero and the sizes match the whole texture image.

v2: Fixes, suggestions from Laura Ekstrand:
* Fix calls to ctx->Driver.UnmapTextureImage() to pass the correct
  slice value.
* Added comments and assertions to check zoffset+depth<=tex->Depth before
  the 'img' loops.
* Added a new zoffset==0 assert in get_tex_memcpy().

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 137 +++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 57 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index a067aa57975..8ade264556b 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -75,12 +75,11 @@ type_needs_clamping(GLenum type)
  */
 static void
 get_tex_depth(struct gl_context *ctx, GLuint dimensions,
+              GLint xoffset, GLint yoffset, GLint zoffset,
+              GLsizei width, GLsizei height, GLint depth,
               GLenum format, GLenum type, GLvoid *pixels,
               struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   GLint height = texImage->Height;
-   GLint depth = texImage->Depth;
    GLint img, row;
    GLfloat *depthRow = malloc(width * sizeof(GLfloat));
 
@@ -94,14 +93,15 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
       height = 1;
    }
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint srcRowStride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &srcRowStride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &srcRowStride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -113,7 +113,7 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
             _mesa_pack_depth_span(ctx, width, dest, type, depthRow, &ctx->Pack);
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -130,26 +130,26 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
+                      GLint xoffset, GLint yoffset, GLint zoffset,
+                      GLsizei width, GLsizei height, GLint depth,
                       GLenum format, GLenum type, GLvoid *pixels,
                       struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
    assert(format == GL_DEPTH_STENCIL);
    assert(type == GL_UNSIGNED_INT_24_8 ||
           type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &rowstride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &rowstride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -166,7 +166,7 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
             }
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -180,12 +180,11 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
+                GLint xoffset, GLint yoffset, GLint zoffset,
+                GLsizei width, GLsizei height, GLint depth,
                 GLenum format, GLenum type, GLvoid *pixels,
                 struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
    assert(format == GL_STENCIL_INDEX);
@@ -195,8 +194,9 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT,
                                   &srcMap, &rowstride);
 
       if (srcMap) {
@@ -211,7 +211,7 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
                                            dest);
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -226,22 +226,22 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
+              GLint xoffset, GLint yoffset, GLint zoffset,
+              GLsizei width, GLsizei height, GLint depth,
               GLenum format, GLenum type, GLvoid *pixels,
               struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &rowstride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &rowstride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -264,7 +264,7 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
             }
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -279,6 +279,8 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
                         GLenum format, GLenum type, GLvoid *pixels,
                         struct gl_texture_image *texImage,
                         GLbitfield transferOps)
@@ -287,9 +289,6 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
    const mesa_format texFormat =
       _mesa_get_srgb_format_linear(texImage->TexFormat);
    const GLenum baseFormat = _mesa_get_format_base_format(texFormat);
-   const GLuint width = texImage->Width;
-   const GLuint height = texImage->Height;
-   const GLuint depth = texImage->Depth;
    GLfloat *tempImage, *tempSlice;
    GLuint slice;
    int srcStride, dstStride;
@@ -312,15 +311,15 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
 
       tempSlice = tempImage + slice * 4 * width * height;
 
-      ctx->Driver.MapTextureImage(ctx, texImage, slice,
-                                  0, 0, width, height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT,
                                   &srcMap, &srcRowStride);
       if (srcMap) {
          _mesa_decompress_image(texFormat, width, height,
                                 srcMap, srcRowStride, tempSlice);
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, slice);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -409,6 +408,8 @@ _mesa_base_pack_format(GLenum format)
  */
 static void
 get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLint depth,
                           GLenum format, GLenum type, GLvoid *pixels,
                           struct gl_texture_image *texImage,
                           GLbitfield transferOps)
@@ -416,9 +417,6 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
    /* don't want to apply sRGB -> RGB conversion here so override the format */
    const mesa_format texFormat =
       _mesa_get_srgb_format_linear(texImage->TexFormat);
-   const GLuint width = texImage->Width;
-   GLuint height = texImage->Height;
-   GLuint depth = texImage->Depth;
    GLuint img;
    GLboolean dst_is_integer;
    uint32_t dst_format;
@@ -430,6 +428,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
    if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY) {
       depth = height;
       height = 1;
+      zoffset = yoffset;
+      yoffset = 0;
    }
 
    /* Depending on the base format involved we may need to apply a rebase
@@ -480,8 +480,9 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       uint32_t src_format;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT,
                                   &srcMap, &rowstride);
       if (!srcMap) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -568,7 +569,7 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       }
 
       /* Unmap the src texture buffer */
-      ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+      ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
    }
 
 done:
@@ -583,6 +584,8 @@ done:
  */
 static void
 get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
+             GLint xoffset, GLint yoffset, GLint zoffset,
+             GLsizei width, GLsizei height, GLint depth,
              GLenum format, GLenum type, GLvoid *pixels,
              struct gl_texture_image *texImage)
 {
@@ -604,11 +607,17 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
-      get_tex_rgba_compressed(ctx, dimensions, format, type,
+      get_tex_rgba_compressed(ctx, dimensions,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth,
+                              format, type,
                               pixels, texImage, transferOps);
    }
    else {
-      get_tex_rgba_uncompressed(ctx, dimensions, format, type,
+      get_tex_rgba_uncompressed(ctx, dimensions,
+                                xoffset, yoffset, zoffset,
+                                width, height, depth,
+                                format, type,
                                 pixels, texImage, transferOps);
    }
 }
@@ -619,8 +628,10 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
  * \return GL_TRUE if done, GL_FALSE otherwise
  */
 static GLboolean
-get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
-               GLvoid *pixels,
+get_tex_memcpy(struct gl_context *ctx,
+               GLint xoffset, GLint yoffset, GLint zoffset,
+               GLsizei width, GLsizei height, GLint depth,
+               GLenum format, GLenum type, GLvoid *pixels,
                struct gl_texture_image *texImage)
 {
    const GLenum target = texImage->TexObject->Target;
@@ -642,20 +653,25 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
                                                      ctx->Pack.SwapBytes);
    }
 
+   if (depth > 1) {
+      /* only a single slice is supported at this time */
+      memCopy = FALSE;
+   }
+
    if (memCopy) {
       const GLuint bpp = _mesa_get_format_bytes(texImage->TexFormat);
-      const GLint bytesPerRow = texImage->Width * bpp;
+      const GLint bytesPerRow = width * bpp;
       GLubyte *dst =
-         _mesa_image_address2d(&ctx->Pack, pixels, texImage->Width,
-                               texImage->Height, format, type, 0, 0);
+         _mesa_image_address2d(&ctx->Pack, pixels, width, height,
+                               format, type, 0, 0);
       const GLint dstRowStride =
-         _mesa_image_row_stride(&ctx->Pack, texImage->Width, format, type);
+         _mesa_image_row_stride(&ctx->Pack, width, format, type);
       GLubyte *src;
       GLint srcRowStride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, 0,
-                                  0, 0, texImage->Width, texImage->Height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT, &src, &srcRowStride);
 
       if (src) {
@@ -664,7 +680,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
          }
          else {
             GLuint row;
-            for (row = 0; row < texImage->Height; row++) {
+            for (row = 0; row < height; row++) {
                memcpy(dst, src, bytesPerRow);
                dst += dstRowStride;
                src += srcRowStride;
@@ -672,7 +688,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
          }
 
          /* unmap src texture buffer */
-         ctx->Driver.UnmapTextureImage(ctx, texImage, 0);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -722,23 +738,30 @@ _mesa_GetTexSubImage_sw(struct gl_context *ctx,
       pixels = ADD_POINTERS(buf, pixels);
    }
 
-   if (get_tex_memcpy(ctx, format, type, pixels, texImage)) {
+   if (get_tex_memcpy(ctx, xoffset, yoffset, zoffset, width, height, depth,
+                      format, type, pixels, texImage)) {
       /* all done */
    }
    else if (format == GL_DEPTH_COMPONENT) {
-      get_tex_depth(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_depth(ctx, dimensions, xoffset, yoffset, zoffset,
+                    width, height, depth, format, type, pixels, texImage);
    }
    else if (format == GL_DEPTH_STENCIL_EXT) {
-      get_tex_depth_stencil(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_depth_stencil(ctx, dimensions, xoffset, yoffset, zoffset,
+                            width, height, depth, format, type, pixels,
+                            texImage);
    }
    else if (format == GL_STENCIL_INDEX) {
-      get_tex_stencil(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_stencil(ctx, dimensions, xoffset, yoffset, zoffset,
+                      width, height, depth, format, type, pixels, texImage);
    }
    else if (format == GL_YCBCR_MESA) {
-      get_tex_ycbcr(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_ycbcr(ctx, dimensions, xoffset, yoffset, zoffset,
+                    width, height, depth, format, type, pixels, texImage);
    }
    else {
-      get_tex_rgba(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_rgba(ctx, dimensions, xoffset, yoffset, zoffset,
+                   width, height, depth, format, type, pixels, texImage);
    }
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {

From 5bfc360e40680b3fe2b6f74ac487fa76502559e3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0413/1208] mesa: make _mesa_get_[compressed_]texture_image()
 static

These functions are only called from teximage.c

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 24 ++++++++++++------------
 src/mesa/main/texgetimage.h |  7 -------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 8ade264556b..02c1fdf6c60 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1004,12 +1004,12 @@ getteximage_error_check(struct gl_context *ctx,
  * \param dsa True when the caller is an ARB_direct_state_access function,
  *            false otherwise
  */
-void
-_mesa_get_texture_image(struct gl_context *ctx,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage, GLenum target,
-                        GLint level, GLenum format, GLenum type,
-                        GLsizei bufSize, GLvoid *pixels, bool dsa)
+static void
+get_texture_image(struct gl_context *ctx,
+                  struct gl_texture_object *texObj,
+                  struct gl_texture_image *texImage, GLenum target,
+                  GLint level, GLenum format, GLenum type,
+                  GLsizei bufSize, GLvoid *pixels, bool dsa)
 {
    assert(texObj);
    assert(texImage);
@@ -1103,8 +1103,8 @@ _mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format,
    if (!texImage)
       return;
 
-   _mesa_get_texture_image(ctx, texObj, texImage, target, level, format, type,
-                           bufSize, pixels, false);
+   get_texture_image(ctx, texObj, texImage, target, level, format, type,
+                     bufSize, pixels, false);
 }
 
 
@@ -1179,8 +1179,8 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
          texImage = texObj->Image[i][level];
          assert(texImage);
 
-         _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                                 format, type, bufSize, pixels, true);
+         get_texture_image(ctx, texObj, texImage, texObj->Target, level,
+                           format, type, bufSize, pixels, true);
 
          image_stride = _mesa_image_image_stride(&ctx->Pack, texImage->Width,
                                                  texImage->Height, format,
@@ -1194,8 +1194,8 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
       if (!texImage)
          return;
 
-      _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                              format, type, bufSize, pixels, true);
+      get_texture_image(ctx, texObj, texImage, texObj->Target, level,
+                        format, type, bufSize, pixels, true);
    }
 }
 
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 611b1bd053f..5124851f461 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -48,13 +48,6 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
                                struct gl_texture_image *texImage,
                                GLvoid *data);
 
-extern void
-_mesa_get_texture_image(struct gl_context *ctx,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage, GLenum target,
-                        GLint level, GLenum format, GLenum type,
-                        GLsizei bufSize, GLvoid *pixels, bool dsa);
-
 extern void
 _mesa_get_compressed_texture_image( struct gl_context *ctx,
                                     struct gl_texture_object *texObj,

From 2a95fd153158e20e6b44548d4f247a5763713fb3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0414/1208] mesa: replace Driver.GetCompressedTexImage() w/
 GetCompressedTexSubImage()

For now, pass offsets of zero and width/height/depth equal to the
whole image.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/drivers/common/driverfuncs.c  |  2 +-
 src/mesa/main/dd.h                     |  9 ++++++---
 src/mesa/main/texgetimage.c            | 28 +++++++++++++++-----------
 src/mesa/main/texgetimage.h            |  9 ++++++---
 src/mesa/state_tracker/st_cb_texture.c |  2 +-
 5 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index ce99620c5b2..6fe42b1775c 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -101,7 +101,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->TestProxyTexImage = _mesa_test_proxy_teximage;
    driver->CompressedTexImage = _mesa_store_compressed_teximage;
    driver->CompressedTexSubImage = _mesa_store_compressed_texsubimage;
-   driver->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw;
+   driver->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw;
    driver->BindTexture = NULL;
    driver->NewTextureObject = _mesa_new_texture_object;
    driver->DeleteTexture = _mesa_delete_texture_object;
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 12e54de8b08..ae01770cce4 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -335,9 +335,12 @@ struct dd_function_table {
    /**
     * Called by glGetCompressedTexImage.
     */
-   void (*GetCompressedTexImage)(struct gl_context *ctx,
-                                 struct gl_texture_image *texImage,
-                                 GLvoid *data);
+   void (*GetCompressedTexSubImage)(struct gl_context *ctx,
+                                    struct gl_texture_image *texImage,
+                                    GLint xoffset, GLint yoffset,
+                                    GLint zoffset, GLsizei width,
+                                    GLsizei height, GLsizei depth,
+                                    GLvoid *data);
    /*@}*/
 
    /**
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 02c1fdf6c60..98e4aa78c4e 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -772,13 +772,16 @@ _mesa_GetTexSubImage_sw(struct gl_context *ctx,
 
 
 /**
- * This is the software fallback for Driver.GetCompressedTexImage().
+ * This is the software fallback for Driver.GetCompressedTexSubImage().
  * All error checking will have been done before this routine is called.
  */
 void
-_mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
-                               struct gl_texture_image *texImage,
-                               GLvoid *img)
+_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx,
+                                  struct gl_texture_image *texImage,
+                                  GLint xoffset, GLint yoffset,
+                                  GLint zoffset, GLsizei width,
+                                  GLint height, GLint depth,
+                                  GLvoid *img)
 {
    const GLuint dimensions =
       _mesa_get_texture_dimensions(texImage->TexObject->Target);
@@ -787,10 +790,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
    GLubyte *dest;
 
    _mesa_compute_compressed_pixelstore(dimensions, texImage->TexFormat,
-                                       texImage->Width, texImage->Height,
-                                       texImage->Depth,
-                                       &ctx->Pack,
-                                       &store);
+                                       width, height, depth,
+                                       &ctx->Pack, &store);
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
       /* pack texture image into a PBO */
@@ -816,8 +817,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
       GLubyte *src;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, slice,
-                                  0, 0, texImage->Width, texImage->Height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT, &src, &srcRowStride);
 
       if (src) {
@@ -828,7 +829,7 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
             src += srcRowStride;
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, slice);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice);
 
          /* Advance to next slice */
          dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice - store.CopyRowsPerSlice);
@@ -1323,7 +1324,10 @@ _mesa_get_compressed_texture_image(struct gl_context *ctx,
 
    _mesa_lock_texture(ctx, texObj);
    {
-      ctx->Driver.GetCompressedTexImage(ctx, texImage, pixels);
+      ctx->Driver.GetCompressedTexSubImage(ctx, texImage,
+                                           0, 0, 0,
+                                           texImage->Width, texImage->Height,
+                                           texImage->Depth, pixels);
    }
    _mesa_unlock_texture(ctx, texObj);
 }
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 5124851f461..040495a773b 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -44,9 +44,12 @@ _mesa_GetTexSubImage_sw(struct gl_context *ctx,
                         struct gl_texture_image *texImage);
 
 extern void
-_mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
-                               struct gl_texture_image *texImage,
-                               GLvoid *data);
+_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx,
+                                  struct gl_texture_image *texImage,
+                                  GLint xoffset, GLint yoffset,
+                                  GLint zoffset, GLsizei width,
+                                  GLint height, GLint depth,
+                                  GLvoid *data);
 
 extern void
 _mesa_get_compressed_texture_image( struct gl_context *ctx,
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 8719ab82f65..715d69c0c68 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1887,7 +1887,7 @@ st_init_texture_functions(struct dd_function_table *functions)
 
    /* compressed texture functions */
    functions->CompressedTexImage = st_CompressedTexImage;
-   functions->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw;
+   functions->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw;
 
    functions->NewTextureObject = st_NewTextureObject;
    functions->NewTextureImage = st_NewTextureImage;

From 613f1e00b8f8493a0cac7dbeec6647ce3a5a0355 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0415/1208] mesa: 80-column wrapping in texgetimage.c

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 98e4aa78c4e..3a396eaa641 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -449,7 +449,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       rebaseSwizzle[1] = MESA_FORMAT_SWIZZLE_ZERO;
       rebaseSwizzle[2] = MESA_FORMAT_SWIZZLE_ZERO;
       rebaseSwizzle[3] = MESA_FORMAT_SWIZZLE_W;
-    } else if (texImage->_BaseFormat != _mesa_get_format_base_format(texFormat)) {
+    } else if (texImage->_BaseFormat !=
+               _mesa_get_format_base_format(texFormat)) {
       needsRebase =
          _mesa_compute_rgba2base2rgba_component_mapping(texImage->_BaseFormat,
                                                         rebaseSwizzle);
@@ -531,8 +532,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
          /* If we had to rebase, we have already handled that */
          needsRebase = false;
 
-         /* If we were lucky and our RGBA conversion matches the dst format, then
-          * we are done.
+         /* If we were lucky and our RGBA conversion matches the dst format,
+          * then we are done.
           */
          if (!need_convert)
             goto do_swap;
@@ -832,7 +833,8 @@ _mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx,
          ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice);
 
          /* Advance to next slice */
-         dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice - store.CopyRowsPerSlice);
+         dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice -
+                                           store.CopyRowsPerSlice);
 
       } else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetCompresssedTexImage");
@@ -953,7 +955,8 @@ getteximage_error_check(struct gl_context *ctx,
                   "glGetTex%sImage(format mismatch)", suffix);
       return GL_TRUE;
    }
-   else if (!_mesa_is_stencil_format(format) && _mesa_is_enum_format_integer(format) !=
+   else if (!_mesa_is_stencil_format(format) &&
+            _mesa_is_enum_format_integer(format) !=
             _mesa_is_format_integer(texImage->TexFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glGetTex%sImage(format mismatch)", suffix);

From f20cfc5a409b69d5ae3d10a870d90e0b4e493ddf Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0416/1208] mesa: overhaul the glGetTexImage code

1. Reorganize the error checking code.
2. Lay groundwork for getting sub images by passing image offset and
   dimensions to the error checking code.
3. Implement _mesa_GetnTexImageARB(), _mesa_GetTexImage() and
   _mesa_GetTextureImage() all in terms of get_texture_image().

v2: pass offset/width/height/depth arguments to the error checking
function, avoid using magic width/height/depth values.
v3: remove unused bufSize param to get_texture_image()

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 657 ++++++++++++++++++++++++------------
 1 file changed, 445 insertions(+), 212 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 3a396eaa641..e58e0500491 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -891,28 +891,298 @@ legal_getteximage_target(struct gl_context *ctx, GLenum target, bool dsa)
 
 
 /**
- * Do error checking for a glGetTex(ture)Image() call.
- * \return GL_TRUE if any error, GL_FALSE if no errors.
+ * Wrapper for _mesa_select_tex_image() which can handle target being
+ * GL_TEXTURE_CUBE_MAP_ARB in which case we use zoffset to select a cube face.
+ * This can happen for glGetTextureImage (DSA function).
  */
-static GLboolean
-getteximage_error_check(struct gl_context *ctx,
-                        struct gl_texture_image *texImage,
-                        GLenum target, GLint level,
-                        GLenum format, GLenum type, GLsizei clientMemSize,
-                        GLvoid *pixels, bool dsa)
+static struct gl_texture_image *
+select_tex_image(const struct gl_texture_object *texObj, GLenum target,
+                 GLint level, GLint zoffset)
 {
-   const GLint maxLevels = _mesa_max_texture_levels(ctx, target);
-   const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2;
-   GLenum baseFormat;
-   const char *suffix = dsa ? "ture" : "";
-
-   assert(texImage);
-   assert(maxLevels != 0);
-   if (level < 0 || level >= maxLevels) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glGetTex%sImage(level out of range)", suffix);
-      return GL_TRUE;
+   assert(level >= 0);
+   assert(level < MAX_TEXTURE_LEVELS);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      assert(zoffset >= 0);
+      assert(zoffset < 6);
+      target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset;
    }
+   return _mesa_select_tex_image(texObj, target, level);
+}
+
+
+/**
+ * Error-check the offset and size arguments to
+ * glGet[Compressed]TextureSubImage().  Also checks if the specified
+ * texture image is missing.
+ * \return true if error, false if no error.
+ */
+static bool
+dimensions_error_check(struct gl_context *ctx,
+                       struct gl_texture_object *texObj,
+                       GLenum target, GLint level,
+                       GLint xoffset, GLint yoffset, GLint zoffset,
+                       GLsizei width, GLsizei height, GLsizei depth,
+                       const char *caller)
+{
+   const struct gl_texture_image *texImage;
+   int i;
+
+   if (xoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset);
+      return true;
+   }
+
+   if (yoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(yoffset = %d)", caller, yoffset);
+      return true;
+   }
+
+   if (zoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(zoffset = %d)", caller, zoffset);
+      return true;
+   }
+
+   if (width < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(width = %d)", caller, width);
+      return true;
+   }
+
+   if (height < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(height = %d)", caller, height);
+      return true;
+   }
+
+   if (depth < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(depth = %d)", caller, depth);
+      return true;
+   }
+
+   /* do special per-target checks */
+   switch (target) {
+   case GL_TEXTURE_1D:
+      if (yoffset != 0) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(1D, yoffset = %d)", caller, yoffset);
+         return true;
+      }
+      if (height != 1) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(1D, height = %d)", caller, height);
+         return true;
+      }
+      /* fall-through */
+   case GL_TEXTURE_1D_ARRAY:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+      if (zoffset != 0) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset = %d)", caller, zoffset);
+         return true;
+      }
+      if (depth != 1) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(depth = %d)", caller, depth);
+         return true;
+      }
+      break;
+   case GL_TEXTURE_CUBE_MAP:
+      /* Non-array cube maps are special because we have a gl_texture_image
+       * per face.
+       */
+      if (zoffset + depth > 6) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset + depth = %d)", caller, zoffset + depth);
+         return true;
+      }
+      /* check that the range of faces exist */
+      for (i = 0; i < depth; i++) {
+         GLenum face = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset + i;
+         if (!_mesa_select_tex_image(texObj, face, level)) {
+            /* non-existant face */
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(missing cube face)", caller);
+            return true;
+         }
+      }
+      break;
+   default:
+      ; /* nothing */
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   if (!texImage) {
+      /* missing texture image */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(zoffset = %d)", caller, zoffset);
+      return true;
+   }
+
+   if (xoffset + width > texImage->Width) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(xoffset %d + width %d > %u)",
+                  caller, xoffset, width, texImage->Width);
+      return true;
+   }
+
+   if (yoffset + height > texImage->Height) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(yoffset %d + height %d > %u)",
+                  caller, yoffset, height, texImage->Height);
+      return true;
+   }
+
+   if (target != GL_TEXTURE_CUBE_MAP) {
+      /* Cube map error checking was done above */
+      if (zoffset + depth > texImage->Depth) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset %d + depth %d > %u)",
+                     caller, zoffset, depth, texImage->Depth);
+         return true;
+      }
+   }
+
+   /* Extra checks for compressed textures */
+   {
+      GLuint bw, bh;
+      _mesa_get_format_block_size(texImage->TexFormat, &bw, &bh);
+      if (bw > 1 || bh > 1) {
+         /* offset must be multiple of block size */
+         if (xoffset % bw != 0) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(xoffset = %d)", caller, xoffset);
+            return true;
+         }
+         if (target != GL_TEXTURE_1D && target != GL_TEXTURE_1D_ARRAY) {
+            if (yoffset % bh != 0) {
+               _mesa_error(ctx, GL_INVALID_VALUE,
+                           "%s(yoffset = %d)", caller, yoffset);
+               return true;
+            }
+         }
+
+         /* The size must be a multiple of bw x bh, or we must be using a
+          * offset+size that exactly hits the edge of the image.
+          */
+         if ((width % bw != 0) &&
+             (xoffset + width != (GLint) texImage->Width)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(width = %d)", caller, width);
+            return true;
+         }
+
+         if ((height % bh != 0) &&
+             (yoffset + height != (GLint) texImage->Height)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(height = %d)", caller, height);
+            return true;
+         }
+      }
+   }
+
+   if (width == 0 || height == 0 || depth == 0) {
+      /* Not an error, but nothing to do.  Return 'true' so that the
+       * caller simply returns.
+       */
+      return true;
+   }
+
+   return false;
+}
+
+
+/**
+ * Do PBO-related error checking for getting uncompressed images.
+ * \return true if there was an error (or the GetTexImage is to be a no-op)
+ */
+static bool
+pbo_error_check(struct gl_context *ctx, GLenum target,
+                GLsizei width, GLsizei height, GLsizei depth,
+                GLenum format, GLenum type, GLsizei clientMemSize,
+                GLvoid *pixels,
+                const char *caller)
+{
+   const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2;
+
+   if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, width, height, depth,
+                                  format, type, clientMemSize, pixels)) {
+      if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(out of bounds PBO access)", caller);
+      } else {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(out of bounds access: bufSize (%d) is too small)",
+                     caller, clientMemSize);
+      }
+      return true;
+   }
+
+   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
+      /* PBO should not be mapped */
+      if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(PBO is mapped)", caller);
+         return true;
+      }
+   }
+
+   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
+      /* not an error, do nothing */
+      return true;
+   }
+
+   return false;
+}
+
+
+/**
+ * Do error checking for all (non-compressed) get-texture-image functions.
+ * \return true if any error, false if no errors.
+ */
+static bool
+getteximage_error_check(struct gl_context *ctx,
+                        struct gl_texture_object *texObj,
+                        GLenum target, GLint level,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLsizei depth,
+                        GLenum format, GLenum type, GLsizei bufSize,
+                        GLvoid *pixels, const char *caller)
+{
+   struct gl_texture_image *texImage;
+   GLenum baseFormat, err;
+   GLint maxLevels;
+
+   assert(texObj);
+
+   if (texObj->Target == 0) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller);
+      return true;
+   }
+
+   maxLevels = _mesa_max_texture_levels(ctx, target);
+   if (level < 0 || level >= maxLevels) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(level = %d)", caller, level);
+      return true;
+   }
+
+   if (dimensions_error_check(ctx, texObj, target, level,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth, caller)) {
+      return true;
+   }
+
+   err = _mesa_error_check_format_and_type(ctx, format, type);
+   if (err != GL_NO_ERROR) {
+      _mesa_error(ctx, err, "%s(format/type)", caller);
+      return true;
+   }
+
+   if (pbo_error_check(ctx, target, width, height, depth,
+                       format, type, bufSize, pixels, caller)) {
+      return true;
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);
 
    /*
     * Format and type checking has been moved up to GetnTexImage and
@@ -927,282 +1197,245 @@ getteximage_error_check(struct gl_context *ctx,
    if (_mesa_is_color_format(format)
        && !_mesa_is_color_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_depth_format(format)
             && !_mesa_is_depth_format(baseFormat)
             && !_mesa_is_depthstencil_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_stencil_format(format)
             && !ctx->Extensions.ARB_texture_stencil8) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetTex%sImage(format=GL_STENCIL_INDEX)", suffix);
-      return GL_TRUE;
+                  "%s(format=GL_STENCIL_INDEX)", caller);
+      return true;
    }
    else if (_mesa_is_ycbcr_format(format)
             && !_mesa_is_ycbcr_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_depthstencil_format(format)
             && !_mesa_is_depthstencil_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (!_mesa_is_stencil_format(format) &&
             _mesa_is_enum_format_integer(format) !=
             _mesa_is_format_integer(texImage->TexFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
 
-   if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, texImage->Width,
-                                  texImage->Height, texImage->Depth,
-                                  format, type, clientMemSize, pixels)) {
-      if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTex%sImage(out of bounds PBO access)", suffix);
-      } else {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(out of bounds access:"
-                     " bufSize (%d) is too small)",
-                     dsa ? "glGetTextureImage" : "glGetnTexImageARB",
-                     clientMemSize);
-      }
-      return GL_TRUE;
-   }
-
-   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      /* PBO should not be mapped */
-      if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTex%sImage(PBO is mapped)", suffix);
-         return GL_TRUE;
-      }
-   }
-
-   return GL_FALSE;
+   return false;
 }
 
 
 /**
- * This is the implementation for glGetnTexImageARB, glGetTextureImage,
- * and glGetTexImage.
- *
- * Requires caller to pass in texImage object because _mesa_GetTextureImage
- * must handle the GL_TEXTURE_CUBE_MAP target.
- *
- * \param target texture target.
+ * Return the width, height and depth of a texture image.
+ * This function must be resilient to bad parameter values since
+ * this is called before full error checking.
+ */
+static void
+get_texture_image_dims(const struct gl_texture_object *texObj,
+                       GLenum target, GLint level,
+                       GLsizei *width, GLsizei *height, GLsizei *depth)
+{
+   const struct gl_texture_image *texImage = NULL;
+
+   if (level >= 0 && level < MAX_TEXTURE_LEVELS) {
+      texImage = _mesa_select_tex_image(texObj, target, level);
+   }
+
+   if (texImage) {
+      *width = texImage->Width;
+      *height = texImage->Height;
+      if (target == GL_TEXTURE_CUBE_MAP) {
+         *depth = 6;
+      }
+      else {
+         *depth = texImage->Depth;
+      }
+   }
+   else {
+      *width = *height = *depth = 0;
+   }
+}
+
+
+/**
+ * Common code for all (uncompressed) get-texture-image functions.
+ * \param texObj  the texture object (should not be null)
+ * \param target  user-provided target, or 0 for DSA
  * \param level image level.
  * \param format pixel data format for returned image.
  * \param type pixel data type for returned image.
  * \param bufSize size of the pixels data buffer.
  * \param pixels returned pixel data.
- * \param dsa True when the caller is an ARB_direct_state_access function,
- *            false otherwise
+ * \param caller  name of calling function
  */
 static void
 get_texture_image(struct gl_context *ctx,
                   struct gl_texture_object *texObj,
-                  struct gl_texture_image *texImage, GLenum target,
-                  GLint level, GLenum format, GLenum type,
-                  GLsizei bufSize, GLvoid *pixels, bool dsa)
+                  GLenum target, GLint level,
+                  GLint xoffset, GLint yoffset, GLint zoffset,
+                  GLsizei width, GLsizei height, GLint depth,
+                  GLenum format, GLenum type,
+                  GLvoid *pixels, const char *caller)
 {
-   assert(texObj);
-   assert(texImage);
+   struct gl_texture_image *texImage;
+   unsigned firstFace, numFaces, i;
+   GLint imageStride;
 
    FLUSH_VERTICES(ctx, 0);
 
-   /*
-    * Legal target checking has been moved up to GetnTexImage and
-    * GetTextureImage so that it can be caught before receiving a NULL
-    * texImage object and exiting.
-    */
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);  /* should have been error checked already */
 
-   if (getteximage_error_check(ctx, texImage, target, level, format,
-                               type, bufSize, pixels, dsa)) {
+   if (_mesa_is_zero_size_texture(texImage)) {
+      /* no image data to return */
       return;
    }
 
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
-      /* not an error, do nothing */
-      return;
-   }
-
-   if (_mesa_is_zero_size_texture(texImage))
-      return;
-
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) {
-      _mesa_debug(ctx, "glGetTex%sImage(tex %u) format = %s, w=%d, h=%d,"
+      _mesa_debug(ctx, "%s(tex %u) format = %s, w=%d, h=%d,"
                   " dstFmt=0x%x, dstType=0x%x\n",
-                  dsa ? "ture": "",
-                  texObj->Name,
+                  caller, texObj->Name,
                   _mesa_get_format_name(texImage->TexFormat),
                   texImage->Width, texImage->Height,
                   format, type);
    }
 
-   _mesa_lock_texture(ctx, texObj);
-   {
-      ctx->Driver.GetTexSubImage(ctx, 0, 0, 0,
-                                 texImage->Width, texImage->Height,
-                                 texImage->Depth,
-                                 format, type, pixels, texImage);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      /* Compute stride between cube faces */
+      imageStride = _mesa_image_image_stride(&ctx->Pack, width, height,
+                                             format, type);
+      firstFace = zoffset;
+      numFaces = depth;
+      zoffset = 0;
+      depth = 1;
    }
+   else {
+      imageStride = 0;
+      firstFace = _mesa_tex_target_to_face(target);
+      numFaces = 1;
+   }
+
+   _mesa_lock_texture(ctx, texObj);
+
+   for (i = 0; i < numFaces; i++) {
+      texImage = texObj->Image[firstFace + i][level];
+      assert(texImage);
+
+      ctx->Driver.GetTexSubImage(ctx, xoffset, yoffset, zoffset,
+                                 width, height, depth,
+                                 format, type, pixels, texImage);
+
+      /* next cube face */
+      pixels = (GLubyte *) pixels + imageStride;
+   }
+
    _mesa_unlock_texture(ctx, texObj);
 }
 
-/**
- * Get texture image.  Called by glGetTexImage.
- *
- * \param target texture target.
- * \param level image level.
- * \param format pixel data format for returned image.
- * \param type pixel data type for returned image.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
+
 void GLAPIENTRY
-_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format,
-                      GLenum type, GLsizei bufSize, GLvoid *pixels)
+_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format, GLenum type,
+                      GLsizei bufSize, GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   GLenum err;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetnTexImageARB";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
 
-   /*
-    * This has been moved here because a format/type mismatch can cause a NULL
-    * texImage object, which in turn causes the mismatch error to be
-    * ignored.
-    */
-   err = _mesa_error_check_format_and_type(ctx, format, type);
-   if (err != GL_NO_ERROR) {
-      _mesa_error(ctx, err, "glGetnTexImage(format/type)");
-      return;
-   }
-
-   /*
-    * Legal target checking has been moved here to prevent exiting with a NULL
-    * texImage object.
-    */
    if (!legal_getteximage_target(ctx, target, false)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetnTexImage(target=0x%x)",
-                  target);
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
       return;
    }
 
    texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
+   assert(texObj);
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   if (!texImage)
-      return;
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
 
-   get_texture_image(ctx, texObj, texImage, target, level, format, type,
-                     bufSize, pixels, false);
+   if (getteximage_error_check(ctx, texObj, target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
 }
 
 
 void GLAPIENTRY
-_mesa_GetTexImage( GLenum target, GLint level, GLenum format,
-                   GLenum type, GLvoid *pixels )
+_mesa_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type,
+                  GLvoid *pixels )
 {
-   _mesa_GetnTexImageARB(target, level, format, type, INT_MAX, pixels);
-}
-
-/**
- * Get texture image.
- *
- * \param texture texture name.
- * \param level image level.
- * \param format pixel data format for returned image.
- * \param type pixel data type for returned image.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
-void GLAPIENTRY
-_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
-                      GLenum type, GLsizei bufSize, GLvoid *pixels)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   int i;
-   GLint image_stride;
-   GLenum err;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetTextImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
 
-   /*
-    * This has been moved here because a format/type mismatch can cause a NULL
-    * texImage object, which in turn causes the mismatch error to be
-    * ignored.
-    */
-   err = _mesa_error_check_format_and_type(ctx, format, type);
-   if (err != GL_NO_ERROR) {
-      _mesa_error(ctx, err, "glGetTextureImage(format/type)");
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
       return;
    }
 
-   texObj = _mesa_lookup_texture_err(ctx, texture, "glGetTextureImage");
-   if (!texObj)
-      return;
+   texObj = _mesa_get_current_tex_object(ctx, target);
+   assert(texObj);
 
-   /*
-    * Legal target checking has been moved here to prevent exiting with a NULL
-    * texImage object.
-    */
-   if (!legal_getteximage_target(ctx, texObj->Target, true)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetTextureImage(target=%s)",
-                  _mesa_enum_to_string(texObj->Target));
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
+
+   if (getteximage_error_check(ctx, texObj, target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, INT_MAX, pixels, caller)) {
       return;
    }
 
-   /* Must handle special case GL_TEXTURE_CUBE_MAP. */
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-
-      /* Make sure the texture object is a proper cube.
-       * (See texturesubimage in teximage.c for details on why this check is
-       * performed.)
-       */
-      if (!_mesa_cube_level_complete(texObj, level)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTextureImage(cube map incomplete)");
-         return;
-      }
-
-      /* Copy each face. */
-      for (i = 0; i < 6; ++i) {
-         texImage = texObj->Image[i][level];
-         assert(texImage);
-
-         get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                           format, type, bufSize, pixels, true);
-
-         image_stride = _mesa_image_image_stride(&ctx->Pack, texImage->Width,
-                                                 texImage->Height, format,
-                                                 type);
-         pixels = (GLubyte *) pixels + image_stride;
-         bufSize -= image_stride;
-      }
-   }
-   else {
-      texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-      if (!texImage)
-         return;
-
-      get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                        format, type, bufSize, pixels, true);
-   }
+   get_texture_image(ctx, texObj, target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
 }
 
+
+void GLAPIENTRY
+_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type,
+                      GLsizei bufSize, GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   GLsizei width, height, depth;
+   static const char *caller = "glGetTextureImage";
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
+
+   if (!texObj) {
+      return;
+   }
+
+   get_texture_image_dims(texObj, texObj->Target, level,
+                          &width, &height, &depth);
+
+   if (getteximage_error_check(ctx, texObj, texObj->Target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, texObj->Target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
+}
+
+
 /**
  * Do error checking for a glGetCompressedTexImage() call.
  * \return GL_TRUE if any error, GL_FALSE if no errors.

From a92f0277d854d6ac5dd524134f113632c990b1b0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0417/1208] mesa: overhaul the glGetCompressedTexImage code

Same idea as the previous patch.
v2: a few clean-ups spotted by Ilia

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 350 +++++++++++++++++++++---------------
 1 file changed, 203 insertions(+), 147 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index e58e0500491..2831cd2683c 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1437,224 +1437,280 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type,
 
 
 /**
- * Do error checking for a glGetCompressedTexImage() call.
- * \return GL_TRUE if any error, GL_FALSE if no errors.
+ * Compute the number of bytes which will be written when retrieving
+ * a sub-region of a compressed texture.
  */
-static GLboolean
-getcompressedteximage_error_check(struct gl_context *ctx,
-                                  struct gl_texture_image *texImage,
-                                  GLenum target,
-                                  GLint level, GLsizei clientMemSize,
-                                  GLvoid *img, bool dsa)
+static GLsizei
+packed_compressed_size(GLuint dimensions, mesa_format format,
+                       GLsizei width, GLsizei height, GLsizei depth,
+                       const struct gl_pixelstore_attrib *packing)
 {
-   const GLint maxLevels = _mesa_max_texture_levels(ctx, target);
-   GLuint compressedSize, dimensions;
-   const char *suffix = dsa ? "ture" : "";
+   struct compressed_pixelstore st;
+   GLsizei totalBytes;
 
-   assert(texImage);
+   _mesa_compute_compressed_pixelstore(dimensions, format,
+                                       width, height, depth,
+                                       packing, &st);
+   totalBytes =
+      (st.CopySlices - 1) * st.TotalRowsPerSlice * st.TotalBytesPerRow +
+      st.SkipBytes +
+      (st.CopyRowsPerSlice - 1) * st.TotalBytesPerRow +
+      st.CopyBytesPerRow;
 
-   if (!legal_getteximage_target(ctx, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetCompressedTex%sImage(target=%s)", suffix,
-                  _mesa_enum_to_string(target));
-      return GL_TRUE;
+   return totalBytes;
+}
+
+
+/**
+ * Do error checking for getting compressed texture images.
+ * \return true if any error, false if no errors.
+ */
+static bool
+getcompressedteximage_error_check(struct gl_context *ctx,
+                                  struct gl_texture_object *texObj,
+                                  GLenum target, GLint level,
+                                  GLint xoffset, GLint yoffset, GLint zoffset,
+                                  GLsizei width, GLsizei height, GLsizei depth,
+                                  GLsizei bufSize, GLvoid *pixels,
+                                  const char *caller)
+{
+   struct gl_texture_image *texImage;
+   GLint maxLevels;
+   GLsizei totalBytes;
+   GLuint dimensions;
+
+   assert(texObj);
+
+   if (texObj->Target == 0) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller);
+      return true;
    }
 
-   assert(maxLevels != 0);
+   maxLevels = _mesa_max_texture_levels(ctx, target);
    if (level < 0 || level >= maxLevels) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glGetCompressedTex%sImage(bad level = %d)", suffix, level);
-      return GL_TRUE;
+                  "%s(bad level = %d)", caller, level);
+      return true;
    }
 
+   if (dimensions_error_check(ctx, texObj, target, level,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth, caller)) {
+      return true;
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);
+
    if (!_mesa_is_format_compressed(texImage->TexFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetCompressedTex%sImage(texture is not compressed)",
-                  suffix);
-      return GL_TRUE;
+                  "%s(texture is not compressed)", caller);
+      return true;
    }
 
-   compressedSize = _mesa_format_image_size(texImage->TexFormat,
-                                            texImage->Width,
-                                            texImage->Height,
-                                            texImage->Depth);
-
    /* Check for invalid pixel storage modes */
-   dimensions = _mesa_get_texture_dimensions(texImage->TexObject->Target);
+   dimensions = _mesa_get_texture_dimensions(texObj->Target);
    if (!_mesa_compressed_pixel_storage_error_check(ctx, dimensions,
-                                              &ctx->Pack, dsa ?
-                                              "glGetCompressedTextureImage":
-                                              "glGetCompressedTexImage")) {
-      return GL_TRUE;
+                                                   &ctx->Pack,
+                                                   caller)) {
+      return true;
    }
 
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      /* do bounds checking on writing to client memory */
-      if (clientMemSize < (GLsizei) compressedSize) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(out of bounds access: bufSize (%d) is too small)",
-                     dsa ? "glGetCompressedTextureImage" :
-                     "glGetnCompressedTexImageARB", clientMemSize);
-         return GL_TRUE;
-      }
-   } else {
+   /* Compute number of bytes that may be touched in the dest buffer */
+   totalBytes = packed_compressed_size(dimensions, texImage->TexFormat,
+                                       width, height, depth,
+                                       &ctx->Pack);
+
+   /* Do dest buffer bounds checking */
+   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
       /* do bounds checking on PBO write */
-      if ((const GLubyte *) img + compressedSize >
-          (const GLubyte *) ctx->Pack.BufferObj->Size) {
+      if ((GLubyte *) pixels + totalBytes >
+          (GLubyte *) ctx->Pack.BufferObj->Size) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTex%sImage(out of bounds PBO access)",
-                     suffix);
-         return GL_TRUE;
+                     "%s(out of bounds PBO access)", caller);
+         return true;
       }
 
       /* make sure PBO is not mapped */
       if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(PBO is mapped)", caller);
+         return true;
+      }
+   }
+   else {
+      /* do bounds checking on writing to client memory */
+      if (totalBytes > bufSize) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTex%sImage(PBO is mapped)", suffix);
-         return GL_TRUE;
+                     "%s(out of bounds access: bufSize (%d) is too small)",
+                     caller, bufSize);
+         return true;
       }
    }
 
-   return GL_FALSE;
+   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
+      /* not an error, but do nothing */
+      return true;
+   }
+
+   return false;
 }
 
-/** Implements glGetnCompressedTexImageARB, glGetCompressedTexImage, and
- * glGetCompressedTextureImage.
- *
- * texImage must be passed in because glGetCompressedTexImage must handle the
- * target GL_TEXTURE_CUBE_MAP.
+
+/**
+ * Common helper for all glGetCompressed-teximage functions.
  */
-void
-_mesa_get_compressed_texture_image(struct gl_context *ctx,
-                                   struct gl_texture_object *texObj,
-                                   struct gl_texture_image *texImage,
-                                   GLenum target, GLint level,
-                                   GLsizei bufSize, GLvoid *pixels,
-                                   bool dsa)
+static void
+get_compressed_texture_image(struct gl_context *ctx,
+                             struct gl_texture_object *texObj,
+                             GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset, GLint zoffset,
+                             GLsizei width, GLsizei height, GLint depth,
+                             GLvoid *pixels,
+                             const char *caller)
 {
-   assert(texObj);
-   assert(texImage);
+   struct gl_texture_image *texImage;
+   unsigned firstFace, numFaces, i, imageStride;
 
    FLUSH_VERTICES(ctx, 0);
 
-   if (getcompressedteximage_error_check(ctx, texImage, target, level,
-                                         bufSize, pixels, dsa)) {
-      return;
-   }
-
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
-      /* not an error, do nothing */
-      return;
-   }
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);  /* should have been error checked already */
 
    if (_mesa_is_zero_size_texture(texImage))
       return;
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) {
       _mesa_debug(ctx,
-                  "glGetCompressedTex%sImage(tex %u) format = %s, w=%d, h=%d\n",
-                  dsa ? "ture" : "", texObj->Name,
+                  "%s(tex %u) format = %s, w=%d, h=%d\n",
+                  caller, texObj->Name,
                   _mesa_get_format_name(texImage->TexFormat),
                   texImage->Width, texImage->Height);
    }
 
-   _mesa_lock_texture(ctx, texObj);
-   {
-      ctx->Driver.GetCompressedTexSubImage(ctx, texImage,
-                                           0, 0, 0,
-                                           texImage->Width, texImage->Height,
-                                           texImage->Depth, pixels);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      struct compressed_pixelstore store;
+
+      /* Compute image stride between cube faces */
+      _mesa_compute_compressed_pixelstore(2, texImage->TexFormat,
+                                          width, height, depth,
+                                          &ctx->Pack, &store);
+      imageStride = store.TotalBytesPerRow * store.TotalRowsPerSlice;
+
+      firstFace = zoffset;
+      numFaces = depth;
+      zoffset = 0;
+      depth = 1;
    }
+   else {
+      imageStride = 0;
+      firstFace = _mesa_tex_target_to_face(target);
+      numFaces = 1;
+   }
+
+   _mesa_lock_texture(ctx, texObj);
+
+   for (i = 0; i < numFaces; i++) {
+      texImage = texObj->Image[firstFace + i][level];
+      assert(texImage);
+
+      ctx->Driver.GetCompressedTexSubImage(ctx, texImage,
+                                           xoffset, yoffset, zoffset,
+                                           width, height, depth, pixels);
+
+      /* next cube face */
+      pixels = (GLubyte *) pixels + imageStride;
+   }
+
    _mesa_unlock_texture(ctx, texObj);
 }
 
+
 void GLAPIENTRY
 _mesa_GetnCompressedTexImageARB(GLenum target, GLint level, GLsizei bufSize,
-                                GLvoid *img)
+                                GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetnCompressedTexImageARB";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
+
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
+      return;
+   }
 
    texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
+   assert(texObj);
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   if (!texImage)
-      return;
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
 
-   _mesa_get_compressed_texture_image(ctx, texObj, texImage, target, level,
-                                      bufSize, img, false);
+   if (getcompressedteximage_error_check(ctx, texObj, target, level,
+                                         0, 0, 0, width, height, depth,
+                                         INT_MAX, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
 }
 
+
 void GLAPIENTRY
-_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *img)
+_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *pixels)
 {
-   _mesa_GetnCompressedTexImageARB(target, level, INT_MAX, img);
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTexImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
+
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
+      return;
+   }
+
+   texObj = _mesa_get_current_tex_object(ctx, target);
+   assert(texObj);
+
+   get_texture_image_dims(texObj, target, level,
+                          &width, &height, &depth);
+
+   if (getcompressedteximage_error_check(ctx, texObj, target, level,
+                                         0, 0, 0, width, height, depth,
+                                         INT_MAX, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
 }
 
-/**
- * Get compressed texture image.
- *
- * \param texture texture name.
- * \param level image level.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
+
 void GLAPIENTRY
 _mesa_GetCompressedTextureImage(GLuint texture, GLint level,
                                 GLsizei bufSize, GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   int i;
-   GLint image_stride;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTextureImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
 
-   texObj = _mesa_lookup_texture_err(ctx, texture,
-                                     "glGetCompressedTextureImage");
-   if (!texObj)
+   if (!texObj) {
       return;
-
-   /* Must handle special case GL_TEXTURE_CUBE_MAP. */
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-
-      /* Make sure the texture object is a proper cube.
-       * (See texturesubimage in teximage.c for details on why this check is
-       * performed.)
-       */
-      if (!_mesa_cube_level_complete(texObj, level)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTextureImage(cube map incomplete)");
-         return;
-      }
-
-      /* Copy each face. */
-      for (i = 0; i < 6; ++i) {
-         texImage = texObj->Image[i][level];
-         assert(texImage);
-
-         _mesa_get_compressed_texture_image(ctx, texObj, texImage,
-                                            texObj->Target, level,
-                                            bufSize, pixels, true);
-
-         /* Compressed images don't have a client format */
-         image_stride = _mesa_format_image_size(texImage->TexFormat,
-                                                texImage->Width,
-                                                texImage->Height, 1);
-
-         pixels = (GLubyte *) pixels + image_stride;
-         bufSize -= image_stride;
-      }
    }
-   else {
-      texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-      if (!texImage)
-         return;
 
-      _mesa_get_compressed_texture_image(ctx, texObj, texImage,
-                                         texObj->Target, level, bufSize,
-                                         pixels, true);
+   get_texture_image_dims(texObj, texObj->Target, level,
+                          &width, &height, &depth);
+
+   if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level,
+                                         0, 0, 0, width, height, depth,
+                                         bufSize, pixels, caller)) {
+      return;
    }
+
+   get_compressed_texture_image(ctx, texObj, texObj->Target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
 }

From 2494f91fb82eb545fd59ed4c9850ff378fc0c591 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0418/1208] mesa: add new
 _mesa_Get[Compressed]TextureSubImage() functions

Simple implementations in terms of get_[compressed_]texture_image().

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/texgetimage.c | 62 ++++++++++++++++++++++++++++++++++++-
 src/mesa/main/texgetimage.h | 15 +++++++++
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 2831cd2683c..8539afcf47b 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -893,7 +893,8 @@ legal_getteximage_target(struct gl_context *ctx, GLenum target, bool dsa)
 /**
  * Wrapper for _mesa_select_tex_image() which can handle target being
  * GL_TEXTURE_CUBE_MAP_ARB in which case we use zoffset to select a cube face.
- * This can happen for glGetTextureImage (DSA function).
+ * This can happen for glGetTextureImage and glGetTextureSubImage (DSA
+ * functions).
  */
 static struct gl_texture_image *
 select_tex_image(const struct gl_texture_object *texObj, GLenum target,
@@ -1436,6 +1437,35 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type,
 }
 
 
+void GLAPIENTRY
+_mesa_GetTextureSubImage(GLuint texture, GLint level,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLsizei depth,
+                         GLenum format, GLenum type, GLsizei bufSize,
+                         void *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetTextureSubImage";
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
+
+   if (!texObj) {
+      return;
+   }
+
+   if (getteximage_error_check(ctx, texObj, texObj->Target, level,
+                               xoffset, yoffset, zoffset, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, texObj->Target, level,
+                     xoffset, yoffset, zoffset, width, height, depth,
+                     format, type, pixels, caller);
+}
+
+
+
 /**
  * Compute the number of bytes which will be written when retrieving
  * a sub-region of a compressed texture.
@@ -1714,3 +1744,33 @@ _mesa_GetCompressedTextureImage(GLuint texture, GLint level,
                                 0, 0, 0, width, height, depth,
                                 pixels, caller);
 }
+
+
+void APIENTRY
+_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level,
+                                   GLint xoffset, GLint yoffset,
+                                   GLint zoffset, GLsizei width,
+                                   GLsizei height, GLsizei depth,
+                                   GLsizei bufSize, void *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTextureImage";
+   struct gl_texture_object *texObj;
+
+   texObj = _mesa_lookup_texture_err(ctx, texture, caller);
+   if (!texObj) {
+      return;
+   }
+
+   if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level,
+                                         xoffset, yoffset, zoffset,
+                                         width, height, depth,
+                                         bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, texObj->Target, level,
+                                xoffset, yoffset, zoffset,
+                                width, height, depth,
+                                pixels, caller);
+}
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 040495a773b..63c75eb931d 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -70,6 +70,14 @@ extern void GLAPIENTRY
 _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
                       GLenum type, GLsizei bufSize, GLvoid *pixels);
 
+extern void GLAPIENTRY
+_mesa_GetTextureSubImage(GLuint texture, GLint level,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLsizei depth,
+                         GLenum format, GLenum type, GLsizei bufSize,
+                         void *pixels);
+
+
 extern void GLAPIENTRY
 _mesa_GetCompressedTexImage(GLenum target, GLint lod, GLvoid *img);
 
@@ -81,4 +89,11 @@ extern void GLAPIENTRY
 _mesa_GetCompressedTextureImage(GLuint texture, GLint level, GLsizei bufSize,
                                 GLvoid *pixels);
 
+extern void APIENTRY
+_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level,
+                                   GLint xoffset, GLint yoffset,
+                                   GLint zoffset, GLsizei width,
+                                   GLsizei height, GLsizei depth,
+                                   GLsizei bufSize, void *pixels);
+
 #endif /* TEXGETIMAGE_H */

From 48f9f0bfdd6bfb0ab4844cf005c1534c86cd3836 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0419/1208] mesa: add API dispatch for
 GL_ARB_get_texture_sub_image

This adds the new glGetTextureSubImage() and
glGetCompressedTextureSubImage() functions.  Also update the
dispatch sanity test program.

v2: remove stray brace, move xi:include line in gl_API.xml, fix extension
number typo, s/program/texture/ in xml file.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../glapi/gen/ARB_get_texture_sub_image.xml   | 40 +++++++++++++++++++
 src/mapi/glapi/gen/Makefile.am                |  1 +
 src/mapi/glapi/gen/gl_API.xml                 |  4 +-
 src/mesa/main/tests/dispatch_sanity.cpp       |  5 +++
 4 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 src/mapi/glapi/gen/ARB_get_texture_sub_image.xml

diff --git a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml
new file mode 100644
index 00000000000..14e1c20b9d5
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- This is included by gl_and_es_API.xml.  Could be moved to gl_API.xml. -->
+
+<OpenGLAPI>
+
+<category name="GL_ARB_get_texture_sub_image" number="165">
+
+    <function name="GetTextureSubImage" offset="assign">
+        <param name="texture" type="GLuint"/>
+        <param name="level" type="GLint"/>
+        <param name="xoffset" type="GLint"/>
+        <param name="yoffset" type="GLint"/>
+        <param name="zoffset" type="GLint"/>
+        <param name="width" type="GLsizei"/>
+        <param name="height" type="GLsizei"/>
+        <param name="depth" type="GLsizei"/>
+        <param name="format" type="GLenum"/>
+        <param name="type" type="GLenum"/>
+        <param name="bufSize" type="GLsizei"/>
+        <param name="pixels" type="GLvoid *"/>
+    </function>
+
+    <function name="GetCompressedTextureSubImage" offset="assign">
+        <param name="texture" type="GLuint"/>
+        <param name="level" type="GLint"/>
+        <param name="xoffset" type="GLint"/>
+        <param name="yoffset" type="GLint"/>
+        <param name="zoffset" type="GLint"/>
+        <param name="width" type="GLsizei"/>
+        <param name="height" type="GLsizei"/>
+        <param name="depth" type="GLsizei"/>
+        <param name="bufSize" type="GLsizei"/>
+        <param name="pixels" type="GLvoid *"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 5b163b02e00..170898c60ea 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -135,6 +135,7 @@ API_XML = \
 	ARB_framebuffer_object.xml \
 	ARB_geometry_shader4.xml \
 	ARB_get_program_binary.xml \
+	ARB_get_texture_sub_image.xml \
 	ARB_gpu_shader_fp64.xml \
 	ARB_gpu_shader5.xml \
 	ARB_instanced_arrays.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 2f330756f22..eb8c72a5e15 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8253,7 +8253,9 @@
 
 <xi:include href="ARB_direct_state_access.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions 165 - 166 -->
+<xi:include href="ARB_get_texture_sub_image.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extension 166 -->
 
 <xi:include href="ARB_texture_barrier.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 800720b798e..901e6fc53b8 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -930,6 +930,11 @@ const struct function common_desktop_functions_possible[] = {
 
    /* GL_EXT_polygon_offset_clamp */
    { "glPolygonOffsetClampEXT", 11, -1 },
+
+   /* GL_ARB_get_texture_sub_image */
+   { "glGetTextureSubImage", 20, -1 },
+   { "glGetCompressedTextureSubImage", 20, -1 },
+
    { NULL, 0, -1 }
 };
 

From 89212f9d06cbd5dd7cebefe7b4e535692525e3e9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0420/1208] mesa: enable GL_ARB_get_texture_sub_image for all
 drivers

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index d753e5fea31..7deb823e1c4 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -121,6 +121,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_framebuffer_object",                  o(ARB_framebuffer_object),                  GL,             2005 },
    { "GL_ARB_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
    { "GL_ARB_get_program_binary",                  o(dummy_true),                              GL,             2010 },
+   { "GL_ARB_get_texture_sub_image",               o(dummy_true),                              GL,             2014 },
    { "GL_ARB_gpu_shader5",                         o(ARB_gpu_shader5),                         GLC,            2010 },
    { "GL_ARB_gpu_shader_fp64",                     o(ARB_gpu_shader_fp64),                     GLC,            2010 },
    { "GL_ARB_half_float_pixel",                    o(dummy_true),                              GL,             2003 },

From b94367ba8d14ea999d2b81cb24db21fa72d4bab8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0421/1208] docs: document that GL_ARB_get_texture_sub_image is
 completed

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 docs/GL3.txt              | 2 +-
 docs/relnotes/10.7.0.html | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 33a282edd5b..214425713b8 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -202,7 +202,7 @@ GL 4.5, GLSL 4.50:
   - Sampler object                                     DONE
   - Program Pipeline object                            DONE
   - Query object                                       DONE (will require changes when GL_ARB_query_buffer_object lands)
-  GL_ARB_get_texture_sub_image                         started (Brian Paul)
+  GL_ARB_get_texture_sub_image                         DONE (all drivers)
   GL_ARB_shader_texture_image_samples                  not started
   GL_ARB_texture_barrier                               DONE (nv50, nvc0, r600, radeonsi)
   GL_KHR_context_flush_control                         DONE (all - but needs GLX/EXT extension to be useful)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 42ea807e1ff..f39e272ffde 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -47,6 +47,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
+<li>GL_ARB_get_texture_sub_image for all drivers</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>

From 24799c422365f609403ccde0759c77b0179328d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:35:38 -0600
Subject: [PATCH 0422/1208] mesa: s/GLint/GLsizei/ for consistency

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/dd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index ae01770cce4..87eb63ea374 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -328,7 +328,7 @@ struct dd_function_table {
    void (*CompressedTexSubImage)(struct gl_context *ctx, GLuint dims,
                                  struct gl_texture_image *texImage,
                                  GLint xoffset, GLint yoffset, GLint zoffset,
-                                 GLsizei width, GLint height, GLint depth,
+                                 GLsizei width, GLsizei height, GLsizei depth,
                                  GLenum format,
                                  GLsizei imageSize, const GLvoid *data);
 

From 61ed88b1ddf8aea6f74518bcae5c13d9bf4ae822 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:44:07 -0600
Subject: [PATCH 0423/1208] mesa: move check for no-op glFrontFace call earlier

If the new mode matches the current mode, there can be no error.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/polygon.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c
index c1c31660e12..60af88f9857 100644
--- a/src/mesa/main/polygon.c
+++ b/src/mesa/main/polygon.c
@@ -93,14 +93,14 @@ _mesa_FrontFace( GLenum mode )
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode));
 
+   if (ctx->Polygon.FrontFace == mode)
+      return;
+
    if (mode!=GL_CW && mode!=GL_CCW) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glFrontFace" );
       return;
    }
 
-   if (ctx->Polygon.FrontFace == mode)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_POLYGON);
    ctx->Polygon.FrontFace = mode;
 

From d323f26830c1ce7e157cfeeb4f1e38b1a4d19d31 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:44:07 -0600
Subject: [PATCH 0424/1208] mesa: move check for no-op glAlphaFunc call earlier

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/blend.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 99ab3c354ab..4fc32962425 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -569,6 +569,9 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
       _mesa_debug(ctx, "glAlphaFunc(%s, %f)\n",
                   _mesa_enum_to_string(func), ref);
 
+   if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref)
+      return; /* no change */
+
    switch (func) {
    case GL_NEVER:
    case GL_LESS:
@@ -578,9 +581,6 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
    case GL_NOTEQUAL:
    case GL_GEQUAL:
    case GL_ALWAYS:
-      if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref)
-         return; /* no change */
-
       FLUSH_VERTICES(ctx, _NEW_COLOR);
       ctx->Color.AlphaFunc = func;
       ctx->Color.AlphaRefUnclamped = ref;

From 300926def04afc76d67ce964f10247b2e787c5dc Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:44:07 -0600
Subject: [PATCH 0425/1208] mesa: move check for no-op glShadeModel call
 earlier

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/light.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c
index dab21d1a634..89b1c4b142f 100644
--- a/src/mesa/main/light.c
+++ b/src/mesa/main/light.c
@@ -44,14 +44,14 @@ _mesa_ShadeModel( GLenum mode )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glShadeModel %s\n", _mesa_enum_to_string(mode));
 
+   if (ctx->Light.ShadeModel == mode)
+      return;
+
    if (mode != GL_FLAT && mode != GL_SMOOTH) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glShadeModel");
       return;
    }
 
-   if (ctx->Light.ShadeModel == mode)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_LIGHT);
    ctx->Light.ShadeModel = mode;
 

From 7fccebf9803973c7403318da20afe23e80b5b59f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:44:07 -0600
Subject: [PATCH 0426/1208] swrast: remove unneeded & operators in
 _swrast_choose_texture_sample_func()

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/mesa/swrast/s_texfilter.c | 56 +++++++++++++++++------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/mesa/swrast/s_texfilter.c b/src/mesa/swrast/s_texfilter.c
index abc1727cf29..cd6395fbcde 100644
--- a/src/mesa/swrast/s_texfilter.c
+++ b/src/mesa/swrast/s_texfilter.c
@@ -3713,7 +3713,7 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
                                     const struct gl_sampler_object *sampler)
 {
    if (!t || !_mesa_is_texture_complete(t, sampler)) {
-      return &null_sample_func;
+      return null_sample_func;
    }
    else {
       const GLboolean needLambda =
@@ -3722,32 +3722,32 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
       switch (t->Target) {
       case GL_TEXTURE_1D:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
-            return &sample_lambda_1d;
+            return sample_lambda_1d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_1d;
+            return sample_linear_1d;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_1d;
+            return sample_nearest_1d;
          }
       case GL_TEXTURE_2D:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
             /* Anisotropic filtering extension. Activated only if mipmaps are used */
             if (sampler->MaxAnisotropy > 1.0 &&
                 sampler->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
-               return &sample_lambda_2d_aniso;
+               return sample_lambda_2d_aniso;
             }
-            return &sample_lambda_2d;
+            return sample_lambda_2d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_2d;
+            return sample_linear_2d;
          }
          else {
             /* check for a few optimized cases */
@@ -3772,72 +3772,72 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
          }
       case GL_TEXTURE_3D:
          if (needLambda) {
-            return &sample_lambda_3d;
+            return sample_lambda_3d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_3d;
+            return sample_linear_3d;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_3d;
+            return sample_nearest_3d;
          }
       case GL_TEXTURE_CUBE_MAP:
          if (needLambda) {
-            return &sample_lambda_cube;
+            return sample_lambda_cube;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_cube;
+            return sample_linear_cube;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_cube;
+            return sample_nearest_cube;
          }
       case GL_TEXTURE_RECTANGLE_NV:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
-            return &sample_lambda_rect;
+            return sample_lambda_rect;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_rect;
+            return sample_linear_rect;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_rect;
+            return sample_nearest_rect;
          }
       case GL_TEXTURE_1D_ARRAY_EXT:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
 	 else if (needLambda) {
-            return &sample_lambda_1d_array;
+            return sample_lambda_1d_array;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_1d_array;
+            return sample_linear_1d_array;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_1d_array;
+            return sample_nearest_1d_array;
          }
       case GL_TEXTURE_2D_ARRAY_EXT:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
 	 else if (needLambda) {
-            return &sample_lambda_2d_array;
+            return sample_lambda_2d_array;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_2d_array;
+            return sample_linear_2d_array;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_2d_array;
+            return sample_nearest_2d_array;
          }
       default:
          _mesa_problem(ctx,
                        "invalid target in _swrast_choose_texture_sample_func");
-         return &null_sample_func;
+         return null_sample_func;
       }
    }
 }

From 09c440c718992d48ef118c1aad6929ad215ccd3b Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Fri, 3 Jul 2015 08:45:30 +1000
Subject: [PATCH 0427/1208] glsl: check for leading zeros in array index
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/linker.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 28ad2f38e0e..58dd9fbfdae 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -462,6 +462,10 @@ parse_program_resource_name(const GLchar *name,
    if (array_index < 0)
       return -1;
 
+   /* Check for leading zero */
+   if (name[i] == '0' && name[i+1] != ']')
+      return -1;
+
    *out_base_name_end = name + (i - 1);
    return array_index;
 }

From 13322a6590b9e64a9a9f8dd304898e9ab6bedd49 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 17 Jun 2015 23:03:52 +1000
Subject: [PATCH 0428/1208] mesa: fix active sampler conflict validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The type stored in gl_uniform_storage is the type of a single array
element not the array type so size was always 1.

V2: Dont validate sampler units pointing to 0

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/uniform_query.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index cab5083e81b..b5a94e9473b 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -1101,18 +1101,23 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
       for (unsigned i = 0; i < shProg[idx]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *const storage =
             &shProg[idx]->UniformStorage[i];
-         const glsl_type *const t = (storage->type->is_array())
-            ? storage->type->fields.array : storage->type;
 
-         if (!t->is_sampler())
+         if (!storage->type->is_sampler())
             continue;
 
          active_samplers++;
 
-         const unsigned count = MAX2(1, storage->type->array_size());
+         const unsigned count = MAX2(1, storage->array_elements);
          for (unsigned j = 0; j < count; j++) {
             const unsigned unit = storage->storage[j].i;
 
+            /* FIXME: Samplers are initialized to 0 and Mesa doesn't do a
+             * great job of eliminating unused uniforms currently so for now
+             * don't throw an error if two sampler types both point to 0.
+             */
+            if (unit == 0)
+               continue;
+
             /* The types of the samplers associated with a particular texture
              * unit must be an exact match.  Page 74 (page 89 of the PDF) of
              * the OpenGL 3.3 core spec says:
@@ -1122,13 +1127,14 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
              *     program object."
              */
             if (unit_types[unit] == NULL) {
-               unit_types[unit] = t;
-            } else if (unit_types[unit] != t) {
+               unit_types[unit] = storage->type;
+            } else if (unit_types[unit] != storage->type) {
                pipeline->InfoLog =
                   ralloc_asprintf(pipeline,
                                   "Texture unit %d is accessed both as %s "
                                   "and %s",
-                                  unit, unit_types[unit]->name, t->name);
+                                  unit, unit_types[unit]->name,
+                                  storage->type->name);
                return false;
             }
          }

From fcc1949cc4d97d8ed714020d5b86b31b70eca774 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 4 Jul 2015 08:34:32 +1000
Subject: [PATCH 0429/1208] mesa: fix misleading comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/shader_query.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index d562a90d2b2..c8d3687faa6 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -787,8 +787,6 @@ program_resource_location(struct gl_shader_program *shProg,
 
 /**
  * Function implements following location queries:
- *    glGetAttribLocation
- *    glGetFragDataLocation
  *    glGetUniformLocation
  */
 GLint

From 800efb0690e962750b9a072bcbab279fdaae24a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Wed, 22 Jul 2015 16:11:39 +0900
Subject: [PATCH 0430/1208] radeonsi: Flush when we're asked to return a fence
 but don't have one yet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 08cc08e64fe..dc8702e95a6 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -84,7 +84,8 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
-	if (cs->cdw == ctx->b.initial_gfx_cs_size) {
+	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
+	    (!fence || ctx->last_gfx_fence)) {
 		if (fence)
 			ws->fence_reference(fence, ctx->last_gfx_fence);
 		if (!(flags & RADEON_FLUSH_ASYNC))

From fe4290200942b1103cdc1a238876143b61b731f0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 22 Jul 2015 08:04:49 -0600
Subject: [PATCH 0431/1208] mesa: fix typo s/glGetTextImage/glGetTexImage/

Trivial.
---
 src/mesa/main/texgetimage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 8539afcf47b..59ec091257e 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1382,7 +1382,7 @@ _mesa_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type,
                   GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   static const char *caller = "glGetTextImage";
+   static const char *caller = "glGetTexImage";
    GLsizei width, height, depth;
    struct gl_texture_object *texObj;
 

From 16f6d432de07dcb537dafd0c9f3ef7614891ed6b Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 9 Jul 2015 21:19:15 +0100
Subject: [PATCH 0432/1208] configure.ac: do not set HAVE_DRI(23) when libdrm
 is missing

These conditionals are used to guard both dri modules and loader(s).

Currently if we try to build the gallium swrast dri module (without glx)
on a system that's missing libdrm the build will fail.

v2: Make sure we assign prior to checking the have_libdrm variable.

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/configure.ac b/configure.ac
index bb6d4f618ff..530fc64dafe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -913,6 +913,13 @@ fi
 AM_CONDITIONAL(HAVE_DRI_GLX, test "x$enable_glx" = xyes -a \
                                   "x$enable_dri" = xyes)
 
+# Check for libdrm
+PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
+                  [have_libdrm=yes], [have_libdrm=no])
+if test "x$have_libdrm" = xyes; then
+	DEFINES="$DEFINES -DHAVE_LIBDRM"
+fi
+
 # Select which platform-dependent DRI code gets built
 case "$host_os" in
 darwin*)
@@ -925,8 +932,8 @@ esac
 
 AM_CONDITIONAL(HAVE_DRICOMMON, test "x$enable_dri" = xyes )
 AM_CONDITIONAL(HAVE_DRISW, test "x$enable_dri" = xyes )
-AM_CONDITIONAL(HAVE_DRI2, test "x$enable_dri" = xyes -a "x$dri_platform" = xdrm )
-AM_CONDITIONAL(HAVE_DRI3, test "x$enable_dri3" = xyes -a "x$dri_platform" = xdrm )
+AM_CONDITIONAL(HAVE_DRI2, test "x$enable_dri" = xyes -a "x$dri_platform" = xdrm -a "x$have_libdrm" = xyes )
+AM_CONDITIONAL(HAVE_DRI3, test "x$enable_dri3" = xyes -a "x$dri_platform" = xdrm -a "x$have_libdrm" = xyes )
 AM_CONDITIONAL(HAVE_APPLEDRI, test "x$enable_dri" = xyes -a "x$dri_platform" = xapple )
 
 AC_ARG_ENABLE([shared-glapi],
@@ -1112,13 +1119,6 @@ if test "x$with_sha1" = "x"; then
 fi
 AM_CONDITIONAL([ENABLE_SHADER_CACHE], [test x$enable_shader_cache = xyes])
 
-# Check for libdrm
-PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
-                  [have_libdrm=yes], [have_libdrm=no])
-if test "x$have_libdrm" = xyes; then
-	DEFINES="$DEFINES -DHAVE_LIBDRM"
-fi
-
 case "$host_os" in
 linux*)
     need_pci_id=yes ;;

From 816e4c1b5e2887c45ffa69d41c8106e7b31977fb Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 22 Jul 2015 16:22:44 +0100
Subject: [PATCH 0433/1208] dri/swrast: automake: add LIBDRM_CFLAGS

With the follow up commit we'll remove the __NOT_HAVE_DRM_H macro. As
requested by Ian HAVE_LIBDRM will be used instead, which will lead to
swrast including drm.h when libdrm package is available, even though we
don't need/make use of the header.

As the define is added after the AM_CFLAGS we cannnot use -UHAVE_LIBDRM,
but instead let's just add LIBDRM_CFLAGS. The latter of which will
expand to NULL when the libdrm package is not around.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/mesa/drivers/dri/swrast/Makefile.am | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/swrast/Makefile.am b/src/mesa/drivers/dri/swrast/Makefile.am
index bfc3c10e334..9c86d504477 100644
--- a/src/mesa/drivers/dri/swrast/Makefile.am
+++ b/src/mesa/drivers/dri/swrast/Makefile.am
@@ -33,6 +33,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
 

From 0efd773f719dd2deddb4b6703edf022b294cd349 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 15:13:46 +0100
Subject: [PATCH 0434/1208] dri_interface: drop __NOT_HAVE_DRM_H magic

v2: use HAVE_LIBDRM macro.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 include/GL/internal/dri_interface.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index c827bb640f3..72660718bd8 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -40,14 +40,7 @@
 #ifndef DRI_INTERFACE_H
 #define DRI_INTERFACE_H
 
-/* For archs with no drm.h */
-#if defined(__APPLE__) || defined(__CYGWIN__) || defined(__GNU__)
-#ifndef __NOT_HAVE_DRM_H
-#define __NOT_HAVE_DRM_H
-#endif
-#endif
-
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 #include <drm.h>
 #else
 typedef unsigned int drm_context_t;

From 72c784347bf66b61385cb57bb666033e5234ba69 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 22 Jul 2015 16:34:15 +0100
Subject: [PATCH 0435/1208] st/dri: unwrap/remove __NOT_HAVE_DRM_H magic

With the dri_interface.h clean of the macro, we can remove the final
only st/dri specific use of the very same.

Seemingly it was incorrectly used, as the build-time presence of dri2 is
not libdrm specific. At run-time, the code is already limited to dri2
use-cases plus returning true, when the extension is not present (or too
old) will likely lead to a crash as one tries to use it shortly after
the dri_with_format() call.

As a side effect this gives us a nice cleanup the builds.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/state_trackers/dri/Android.mk   |  5 +----
 src/gallium/state_trackers/dri/Makefile.am  |  4 ----
 src/gallium/state_trackers/dri/SConscript   |  3 ---
 src/gallium/state_trackers/dri/dri_screen.h | 12 ------------
 4 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
index 188e4a1404d..4ee43385352 100644
--- a/src/gallium/state_trackers/dri/Android.mk
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -48,10 +48,7 @@ LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
 LOCAL_SRC_FILES += $(drisw_SOURCES)
 endif
 
-# swrast only?
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
 LOCAL_SRC_FILES += $(dri2_SOURCES)
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
diff --git a/src/gallium/state_trackers/dri/Makefile.am b/src/gallium/state_trackers/dri/Makefile.am
index d2c7a82d720..9f4deba0c1e 100644
--- a/src/gallium/state_trackers/dri/Makefile.am
+++ b/src/gallium/state_trackers/dri/Makefile.am
@@ -50,10 +50,6 @@ noinst_LTLIBRARIES = libdri.la
 libdri_la_SOURCES = $(common_SOURCES)
 
 if HAVE_DRISW
-if !HAVE_DRI2
-AM_CPPFLAGS += \
-	-D__NOT_HAVE_DRM_H
-endif
 libdri_la_SOURCES += $(drisw_SOURCES)
 endif
 
diff --git a/src/gallium/state_trackers/dri/SConscript b/src/gallium/state_trackers/dri/SConscript
index 89b5e611c2e..7a7060c4c2d 100644
--- a/src/gallium/state_trackers/dri/SConscript
+++ b/src/gallium/state_trackers/dri/SConscript
@@ -5,10 +5,7 @@ Import('*')
 
 env = env.Clone()
 
-# XXX: If HAVE_DRI2
 env.PkgUseModules(['DRM'])
-# else
-#env.Append(CPPDEFINES = [('__NOT_HAVE_DRM_H', '1')])
 
 env.Append(CPPPATH = [
     '#/src',
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index 6d46fea9684..4bcb0291d86 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -122,8 +122,6 @@ struct __DRIimageRec {
 
 };
 
-#ifndef __NOT_HAVE_DRM_H
-
 static inline boolean
 dri_with_format(__DRIscreen * sPriv)
 {
@@ -134,16 +132,6 @@ dri_with_format(__DRIscreen * sPriv)
        && (loader->getBuffersWithFormat != NULL);
 }
 
-#else
-
-static inline boolean
-dri_with_format(__DRIscreen * sPriv)
-{
-   return TRUE;
-}
-
-#endif
-
 void
 dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
                    const struct gl_config *mode);

From 1c328b8aa79b0644160082b7e9e02df18ab3ca48 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 15:42:15 +0100
Subject: [PATCH 0436/1208] loader: use HAVE_LIBDRM instead of !
 __NOT_HAVE_DRM_H

Double negatives in English language are normally avoided, plus the
former seems cleaner and more consistent.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/loader/Android.mk          | 6 ++----
 src/loader/Makefile.am         | 5 +----
 src/loader/SConscript          | 2 --
 src/loader/loader.c            | 8 ++++----
 src/loader/pci_id_driver_map.c | 2 +-
 5 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/loader/Android.mk b/src/loader/Android.mk
index 92d9fd20d3c..869056564ce 100644
--- a/src/loader/Android.mk
+++ b/src/loader/Android.mk
@@ -33,10 +33,8 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(LOADER_C_FILES)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
diff --git a/src/loader/Makefile.am b/src/loader/Makefile.am
index aef1bd61bea..5190f7f8a46 100644
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -48,10 +48,7 @@ libloader_la_CPPFLAGS += \
 
 endif
 
-if !HAVE_LIBDRM
-libloader_la_CPPFLAGS += \
-	-D__NOT_HAVE_DRM_H
-else
+if HAVE_LIBDRM
 libloader_la_CPPFLAGS += \
 	$(LIBDRM_CFLAGS)
 
diff --git a/src/loader/SConscript b/src/loader/SConscript
index 16d1053ff2d..d98f11e3cf6 100644
--- a/src/loader/SConscript
+++ b/src/loader/SConscript
@@ -8,8 +8,6 @@ env.Prepend(CPPPATH = [
     '#include'
 ])
 
-env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
-
 if env['udev']:
     env.PkgUseModules('UDEV')
     env.Append(CPPDEFINES = ['HAVE_LIBUDEV'])
diff --git a/src/loader/loader.c b/src/loader/loader.c
index 8da1858734a..8634f45e78d 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -85,7 +85,7 @@
 #endif
 #include "loader.h"
 
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 #include <xf86drm.h>
 #endif
 
@@ -505,7 +505,7 @@ sysfs_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id)
 }
 #endif
 
-#if !defined(__NOT_HAVE_DRM_H)
+#if defined(HAVE_LIBDRM)
 /* for i915 */
 #include <i915_drm.h>
 /* for radeon */
@@ -588,7 +588,7 @@ loader_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id)
    if (sysfs_get_pci_id_for_fd(fd, vendor_id, chip_id))
       return 1;
 #endif
-#if !defined(__NOT_HAVE_DRM_H)
+#if HAVE_LIBDRM
    if (drm_get_pci_id_for_fd(fd, vendor_id, chip_id))
       return 1;
 #endif
@@ -699,7 +699,7 @@ loader_get_driver_for_fd(int fd, unsigned driver_types)
 
    if (!loader_get_pci_id_for_fd(fd, &vendor_id, &chip_id)) {
 
-#ifndef __NOT_HAVE_DRM_H
+#if HAVE_LIBDRM
       /* fallback to drmGetVersion(): */
       drmVersionPtr version = drmGetVersion(fd);
 
diff --git a/src/loader/pci_id_driver_map.c b/src/loader/pci_id_driver_map.c
index cb6f705acbd..3c4657fd014 100644
--- a/src/loader/pci_id_driver_map.c
+++ b/src/loader/pci_id_driver_map.c
@@ -23,7 +23,7 @@
 
 int is_nouveau_vieux(int fd);
 
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 
 #include <xf86drm.h>
 #include <nouveau_drm.h>

From 9ab5b644ef64ee23f88a6687cc541ee4e745234a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 15:57:41 +0100
Subject: [PATCH 0437/1208] dri/common: use HAVE_LIBDRM over __NOT_HAVE_DRM_H

See previous commit message for details.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/mesa/drivers/dri/common/Android.mk  | 13 ++-----------
 src/mesa/drivers/dri/common/Makefile.am |  2 +-
 src/mesa/drivers/dri/common/SConscript  |  4 ----
 src/mesa/drivers/dri/common/dri_util.c  |  4 ++--
 4 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk
index 6986f5e8cb4..6e29bafe4d7 100644
--- a/src/mesa/drivers/dri/common/Android.mk
+++ b/src/mesa/drivers/dri/common/Android.mk
@@ -43,10 +43,8 @@ LOCAL_EXPORT_C_INCLUDE_DIRS := \
     $(LOCAL_PATH) \
     $(intermediates)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS := -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
@@ -110,13 +108,6 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_C_INCLUDES := \
     $(MESA_DRI_C_INCLUDES)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H
-else
-LOCAL_SHARED_LIBRARIES := libdrm
-endif
-
 LOCAL_SRC_FILES := $(megadriver_stub_FILES)
 
 include $(MESA_COMMON_MK)
diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am
index ae19fcb3565..ee6c6914499 100644
--- a/src/mesa/drivers/dri/common/Makefile.am
+++ b/src/mesa/drivers/dri/common/Makefile.am
@@ -58,5 +58,5 @@ if DRICOMMON_NEED_LIBDRM
 AM_CFLAGS += $(LIBDRM_CFLAGS)
 libdricommon_la_LIBADD = $(LIBDRM_LIBS)
 else
-AM_CFLAGS += -D__NOT_HAVE_DRM_H
+AM_CFLAGS += -UHAVE_LIBDRM
 endif
diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript
index b402736db69..adaac291662 100644
--- a/src/mesa/drivers/dri/common/SConscript
+++ b/src/mesa/drivers/dri/common/SConscript
@@ -32,10 +32,7 @@ drienv.AppendUnique(LIBS = [
     'expat',
 ])
 
-# if HAVE_DRI2
 drienv.PkgUseModules('DRM')
-# else
-#env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
 
 sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ])
 
@@ -57,7 +54,6 @@ env.Append(CPPPATH = [
 ])
 
 env.Append(CPPDEFINES = [
-    '__NOT_HAVE_DRM_H',
     'HAVE_DLADDR',
 ])
 
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index ae4592c739d..100a7272369 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -40,7 +40,7 @@
 
 
 #include <stdbool.h>
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 #include <xf86drm.h>
 #endif
 #include "dri_util.h"
@@ -137,7 +137,7 @@ driCreateNewScreen2(int scrn, int fd,
 
     setupLoaderExtensions(psp, extensions);
 
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
     if (fd != -1) {
        drmVersionPtr version = drmGetVersion(fd);
        if (version) {

From 787995bffb52d955f3046618286d831b76b72119 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 15:59:32 +0100
Subject: [PATCH 0438/1208] swrast: remove unneeded __NOT_HAVE_DRM_H define

No longer applicable since the cleanup of dri_interface.h.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/mesa/drivers/dri/swrast/Makefile.am | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/drivers/dri/swrast/Makefile.am b/src/mesa/drivers/dri/swrast/Makefile.am
index 9c86d504477..9d21d9ea4dc 100644
--- a/src/mesa/drivers/dri/swrast/Makefile.am
+++ b/src/mesa/drivers/dri/swrast/Makefile.am
@@ -24,7 +24,6 @@
 include Makefile.sources
 
 AM_CFLAGS = \
-	-D__NOT_HAVE_DRM_H \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/ \
 	-I$(top_srcdir)/src/mapi \

From fe1503fe38602c91e030ca206cb392a26a343f91 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 16:04:27 +0100
Subject: [PATCH 0439/1208] android: dri: correctly set HAVE_LIBDRM

Set the macro if we're not building swrast alone.

Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Chih-Wei Huang <cwhuang@linux.org.tw>
---
 src/gallium/targets/dri/Android.mk | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index 5ba129b7961..bccc91aa526 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -35,17 +35,15 @@ endif
 
 LOCAL_SRC_FILES := target.c
 
-LOCAL_CFLAGS := -DDRI_TARGET -DHAVE_LIBDRM
+LOCAL_CFLAGS := -DDRI_TARGET
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
 	libglapi \
 	libexpat \
 
-# swrast only?
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
 

From a29a8b92ff05f3a63dd3b6ae8c7d0e07f039c0ad Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 7 Jul 2015 21:02:40 +0100
Subject: [PATCH 0440/1208] android: don't build the kms-dri winsys

GBM (the only user of kms-dri) is currently not available under Android.
Considering we have no way of testing/using this let's not bother
building it for now.

Cc: Chih-Wei Huang <cwhuang@linux.org.tw>
Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/Android.mk                    |  2 +-
 src/gallium/state_trackers/dri/Android.mk |  1 -
 src/gallium/targets/dri/Android.mk        |  2 +-
 src/gallium/winsys/sw/kms-dri/Android.mk  | 37 -----------------------
 4 files changed, 2 insertions(+), 40 deletions(-)
 delete mode 100644 src/gallium/winsys/sw/kms-dri/Android.mk

diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index b946681840c..0703148af08 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -34,7 +34,7 @@ SUBDIRS := auxiliary
 
 # swrast
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-SUBDIRS += winsys/sw/dri winsys/sw/kms-dri drivers/softpipe
+SUBDIRS += winsys/sw/dri drivers/softpipe
 endif
 
 # freedreno
diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
index 4ee43385352..43f0de9b464 100644
--- a/src/gallium/state_trackers/dri/Android.mk
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -44,7 +44,6 @@ LOCAL_STATIC_LIBRARIES := \
 	libmesa_dri_common \
 
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
 LOCAL_SRC_FILES += $(drisw_SOURCES)
 endif
 
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index bccc91aa526..7168e1dbfb3 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -85,7 +85,7 @@ gallium_DRIVERS += libmesa_winsys_radeon libmesa_pipe_radeon
 LOCAL_SHARED_LIBRARIES += libdrm_radeon
 endif
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri libmesa_winsys_sw_kms_dri
+gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri
 LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
 endif
 ifneq ($(filter vc4,$(MESA_GPU_DRIVERS)),)
diff --git a/src/gallium/winsys/sw/kms-dri/Android.mk b/src/gallium/winsys/sw/kms-dri/Android.mk
deleted file mode 100644
index b065242aaf3..00000000000
--- a/src/gallium/winsys/sw/kms-dri/Android.mk
+++ /dev/null
@@ -1,37 +0,0 @@
-# Mesa 3-D graphics library
-#
-# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
-# Copyright (C) 2015 Android-x86 Open Source Project
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-LOCAL_PATH := $(call my-dir)
-
-include $(LOCAL_PATH)/Makefile.sources
-
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := $(C_SOURCES)
-
-LOCAL_MODULE := libmesa_winsys_sw_kms_dri
-
-LOCAL_SHARED_LIBRARIES := libdrm
-
-include $(GALLIUM_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)

From 66d77cd71c6359cddfd21c128afe95bad860e231 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 8 Jul 2015 01:44:31 +0100
Subject: [PATCH 0441/1208] scons: don't build the kms-dri winsys

Same as previous commit - unused (gbm is not a thing outside the
autotools build).

v2: Remove trailing HAVE_LIBDRM.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/SConscript                    |  1 -
 src/gallium/state_trackers/dri/SConscript |  1 -
 src/gallium/targets/dri/SConscript        |  5 -----
 src/gallium/winsys/sw/kms-dri/SConscript  | 23 -----------------------
 4 files changed, 30 deletions(-)
 delete mode 100644 src/gallium/winsys/sw/kms-dri/SConscript

diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index eeb1c780fcd..fa5fa6e8734 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -46,7 +46,6 @@ if env['platform'] == 'haiku':
 if env['dri']:
     SConscript([
         'winsys/sw/dri/SConscript',
-        'winsys/sw/kms-dri/SConscript',
         'winsys/svga/drm/SConscript',
     ])
 
diff --git a/src/gallium/state_trackers/dri/SConscript b/src/gallium/state_trackers/dri/SConscript
index 7a7060c4c2d..657300baf13 100644
--- a/src/gallium/state_trackers/dri/SConscript
+++ b/src/gallium/state_trackers/dri/SConscript
@@ -17,7 +17,6 @@ env.Append(CPPPATH = [
 
 env.Append(CPPDEFINES = [
     ('GALLIUM_STATIC_TARGETS', '1'),
-    'GALLIUM_SOFTPIPE',
 ])
 
 sources = env.ParseSourceList('Makefile.sources', 'common_SOURCES')
diff --git a/src/gallium/targets/dri/SConscript b/src/gallium/targets/dri/SConscript
index a51ed564344..8d29f3beb18 100644
--- a/src/gallium/targets/dri/SConscript
+++ b/src/gallium/targets/dri/SConscript
@@ -29,7 +29,6 @@ env.Append(CPPDEFINES = [
     'GALLIUM_VMWGFX',
     'GALLIUM_SOFTPIPE',
     'DRI_TARGET',
-    'HAVE_LIBDRM',
 ])
 
 env.Prepend(LIBS = [
@@ -37,7 +36,6 @@ env.Prepend(LIBS = [
     svgadrm,
     svga,
     ws_dri,
-    ws_kms_dri,
     softpipe,
     libloader,
     mesautil,
@@ -58,9 +56,6 @@ module = env.LoadableModule(
 env.Command('vmwgfx_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
 # swrast_dri.so
 env.Command('swrast_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
-# kms_swrast_dri.so
-env.Command('kms_swrast_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
 
 env.Alias('dri-vmwgfx', module)
 env.Alias('dri-swrast', module)
-env.Alias('dri-kms-swrast', module)
diff --git a/src/gallium/winsys/sw/kms-dri/SConscript b/src/gallium/winsys/sw/kms-dri/SConscript
deleted file mode 100644
index e7dd721dd13..00000000000
--- a/src/gallium/winsys/sw/kms-dri/SConscript
+++ /dev/null
@@ -1,23 +0,0 @@
-#######################################################################
-# SConscript for kms-dri winsys
-
-
-Import('*')
-
-if env['platform'] not in ('linux'):
-    Return()
-
-env = env.Clone()
-
-env.PkgUseModules('DRM')
-
-env.Append(CPPPATH = [
-    '#/src/gallium/include',
-    '#/src/gallium/auxiliary',
-])
-
-ws_kms_dri = env.ConvenienceLibrary(
-    target = 'ws_kms_dri',
-    source = env.ParseSourceList('Makefile.sources', 'C_SOURCES'),
-)
-Export('ws_kms_dri')

From bf6247f608969c3f1fa987e297c6063c02896b5a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 9 Jul 2015 17:29:57 +0100
Subject: [PATCH 0442/1208] radeon,r200: remove unused variable texmicrotile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dead since at least 2009 with commit ccf7814a315(radeon: major cleanups
removing old dead codepaths.)

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/drivers/dri/r200/r200_context.c     | 3 ---
 src/mesa/drivers/dri/r200/r200_context.h     | 1 -
 src/mesa/drivers/dri/radeon/radeon_context.c | 3 ---
 src/mesa/drivers/dri/radeon/radeon_context.h | 1 -
 4 files changed, 8 deletions(-)

diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index fb15082114f..40cc50a6302 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -234,9 +234,6 @@ GLboolean r200CreateContext( gl_api api,
 	 rmesa->using_hyperz = GL_TRUE;
    }
  
-   if ( sPriv->drm_version.minor >= 15 )
-      rmesa->texmicrotile = GL_TRUE;
-
    /* Init default driver functions then plug in our R200-specific functions
     * (the texture functions are especially important)
     */
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index e8784aeeed5..c02a4f399ee 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -614,7 +614,6 @@ struct r200_context {
    struct r200_swtcl_info swtcl;
 
    GLboolean using_hyperz;
-   GLboolean texmicrotile;
 
   struct ati_fragment_shader *afs_loaded;
 };
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index d4d19354b6d..edb154c1341 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -199,9 +199,6 @@ r100CreateContext( gl_api api,
 	 rmesa->using_hyperz = GL_TRUE;
    }
 
-   if ( sPriv->drm_version.minor >= 15 )
-      rmesa->texmicrotile = GL_TRUE;
-
    /* Init default driver functions then plug in our Radeon-specific functions
     * (the texture functions are especially important)
     */
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 40325327813..badabd9508c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -426,7 +426,6 @@ struct r100_context {
 	struct r100_swtcl_info swtcl;
 
 	GLboolean using_hyperz;
-	GLboolean texmicrotile;
 
 	/* Performance counters
 	 */

From 48926da0f7a1d1656bfbaf9d5344cc1fa0b6e089 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 9 Jul 2015 17:34:30 +0100
Subject: [PATCH 0443/1208] radeon,r200: remove support for UMS radeon DRM
 module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As mentioned by Michel Dänzer
 "FWIW though, any code which is specific to radeon DRM major version 1
  can be removed, because that's the UMS major version."

and Marek Olšák
 "major != 2" can't occur. You don't have to check the major version at
  all and you can just assume it's always 2."

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/radeon/radeon_screen.c | 44 ++++++++-------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 45d9b2b8c0b..98b4741b456 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -135,36 +135,26 @@ DRI_CONF_END
 static int
 radeonGetParam(__DRIscreen *sPriv, int param, void *value)
 {
-  int ret;
-  drm_radeon_getparam_t gp = { 0 };
   struct drm_radeon_info info = { 0 };
 
-  if (sPriv->drm_version.major >= 2) {
-      info.value = (uint64_t)(uintptr_t)value;
-      switch (param) {
-      case RADEON_PARAM_DEVICE_ID:
-          info.request = RADEON_INFO_DEVICE_ID;
-          break;
-      case RADEON_PARAM_NUM_GB_PIPES:
-          info.request = RADEON_INFO_NUM_GB_PIPES;
-          break;
-      case RADEON_PARAM_NUM_Z_PIPES:
-          info.request = RADEON_INFO_NUM_Z_PIPES;
-          break;
-      case RADEON_INFO_TILE_CONFIG:
-	  info.request = RADEON_INFO_TILE_CONFIG;
-          break;
-      default:
-          return -EINVAL;
-      }
-      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
-  } else {
-      gp.param = param;
-      gp.value = value;
-
-      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+  info.value = (uint64_t)(uintptr_t)value;
+  switch (param) {
+  case RADEON_PARAM_DEVICE_ID:
+    info.request = RADEON_INFO_DEVICE_ID;
+    break;
+  case RADEON_PARAM_NUM_GB_PIPES:
+    info.request = RADEON_INFO_NUM_GB_PIPES;
+    break;
+  case RADEON_PARAM_NUM_Z_PIPES:
+    info.request = RADEON_INFO_NUM_Z_PIPES;
+    break;
+  case RADEON_INFO_TILE_CONFIG:
+    info.request = RADEON_INFO_TILE_CONFIG;
+    break;
+  default:
+    return -EINVAL;
   }
-  return ret;
+  return drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
 }
 
 #if defined(RADEON_R100)

From 5284e9e2c4922479b28db96ae88121a053a6e66b Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 9 Jul 2015 18:06:14 +0100
Subject: [PATCH 0444/1208] radeon,r200: allow hyperz for radeon DRM module v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original code only half considered hyperz as an option. As per
previous commit "major != 2 cannot occur" we can simply things, and
allow users to set the option if they choose to do so.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/r200/r200_context.c     | 10 ++--------
 src/mesa/drivers/dri/radeon/radeon_context.c |  9 ++-------
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 40cc50a6302..2a42ab3f4c8 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -225,14 +225,8 @@ GLboolean r200CreateContext( gl_api api,
    rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
 							"def_max_anisotropy");
 
-   if ( sPriv->drm_version.major == 1
-       && driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
-      if ( sPriv->drm_version.minor < 13 )
-	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
-			  "disabling.\n", sPriv->drm_version.minor );
-      else
-	 rmesa->using_hyperz = GL_TRUE;
-   }
+   if (driQueryOptionb( &rmesa->radeon.optionCache, "hyperz"))
+      rmesa->using_hyperz = GL_TRUE;
  
    /* Init default driver functions then plug in our R200-specific functions
     * (the texture functions are especially important)
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index edb154c1341..a9e2ab563d3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -191,13 +191,8 @@ r100CreateContext( gl_api api,
    rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
                                                  "def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
-      if ( sPriv->drm_version.minor < 13 )
-	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
-			  "disabling.\n", sPriv->drm_version.minor );
-      else
-	 rmesa->using_hyperz = GL_TRUE;
-   }
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "hyperz"))
+      rmesa->using_hyperz = GL_TRUE;
 
    /* Init default driver functions then plug in our Radeon-specific functions
     * (the texture functions are especially important)

From ce2a4bd541241dade00a36e9f2d8e5ca16c6ff03 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 21:12:01 +0100
Subject: [PATCH 0445/1208] dri/common: remove unused drm_version variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As of last commit the only user of it (radeon/r200) no longer uses it.
As such let's remove it and cleanup the nasty hacks that we had in place
to support this.

v2: Leave LIBDRM_CFLAGS around.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
---
 configure.ac                            | 21 ---------------------
 src/mesa/drivers/dri/common/Android.mk  |  5 -----
 src/mesa/drivers/dri/common/Makefile.am |  8 +-------
 src/mesa/drivers/dri/common/SConscript  |  2 --
 src/mesa/drivers/dri/common/dri_util.c  | 15 ---------------
 src/mesa/drivers/dri/common/dri_util.h  |  5 -----
 6 files changed, 1 insertion(+), 55 deletions(-)

diff --git a/configure.ac b/configure.ac
index 530fc64dafe..c25dc5d1962 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1375,26 +1375,6 @@ if test "x$enable_dri" = xyes; then
                      [AC_MSG_ERROR([Expat library required for DRI not found])])
          EXPAT_LIBS="-lexpat"])
 
-    DRICOMMON_NEED_LIBDRM=no
-    # If we are building any DRI driver other than swrast.
-    if test -n "$with_dri_drivers"; then
-        if test "x$with_dri_drivers" != xswrast; then
-            # ... libdrm is required
-            if test "x$have_libdrm" != xyes; then
-                AC_MSG_ERROR([DRI drivers requires libdrm >= $LIBDRM_REQUIRED])
-            fi
-            DRICOMMON_NEED_LIBDRM=yes
-        fi
-    fi
-
-    # If we're building any gallium DRI driver other than swrast
-    if test -n "$with_gallium_drivers" -a "x$DRICOMMON_NEED_LIBDRM" = xno; then
-        if test "x$with_gallium_drivers" != xswrast; then
-            # ... build a libdrm aware dricommon
-            DRICOMMON_NEED_LIBDRM=yes
-        fi
-    fi
-
     # put all the necessary libs together
     DRI_LIB_DEPS="$DRI_LIB_DEPS $SELINUX_LIBS $LIBDRM_LIBS $EXPAT_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS"
 fi
@@ -2264,7 +2244,6 @@ fi
 
 AC_SUBST([ELF_LIB])
 
-AM_CONDITIONAL(DRICOMMON_NEED_LIBDRM, test "x$DRICOMMON_NEED_LIBDRM" = xyes)
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk
index 6e29bafe4d7..f1a733011b9 100644
--- a/src/mesa/drivers/dri/common/Android.mk
+++ b/src/mesa/drivers/dri/common/Android.mk
@@ -43,11 +43,6 @@ LOCAL_EXPORT_C_INCLUDE_DIRS := \
     $(LOCAL_PATH) \
     $(intermediates)
 
-ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS := -DHAVE_LIBDRM
-LOCAL_SHARED_LIBRARIES := libdrm
-endif
-
 LOCAL_SRC_FILES := \
 	$(DRI_COMMON_FILES) \
 	$(XMLCONFIG_FILES)
diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am
index ee6c6914499..b307f10f56b 100644
--- a/src/mesa/drivers/dri/common/Makefile.am
+++ b/src/mesa/drivers/dri/common/Makefile.am
@@ -32,6 +32,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
 
@@ -53,10 +54,3 @@ libdri_test_stubs_la_CFLAGS = $(AM_CFLAGS) -DNO_MAIN
 libmegadriver_stub_la_SOURCES = $(megadriver_stub_FILES)
 
 sysconf_DATA = drirc
-
-if DRICOMMON_NEED_LIBDRM
-AM_CFLAGS += $(LIBDRM_CFLAGS)
-libdricommon_la_LIBADD = $(LIBDRM_LIBS)
-else
-AM_CFLAGS += -UHAVE_LIBDRM
-endif
diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript
index adaac291662..52d201f8913 100644
--- a/src/mesa/drivers/dri/common/SConscript
+++ b/src/mesa/drivers/dri/common/SConscript
@@ -32,8 +32,6 @@ drienv.AppendUnique(LIBS = [
     'expat',
 ])
 
-drienv.PkgUseModules('DRM')
-
 sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ])
 
 dri_common = drienv.ConvenienceLibrary(
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 100a7272369..884a7e0f650 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -40,9 +40,6 @@
 
 
 #include <stdbool.h>
-#ifdef HAVE_LIBDRM
-#include <xf86drm.h>
-#endif
 #include "dri_util.h"
 #include "utils.h"
 #include "xmlpool.h"
@@ -137,18 +134,6 @@ driCreateNewScreen2(int scrn, int fd,
 
     setupLoaderExtensions(psp, extensions);
 
-#ifdef HAVE_LIBDRM
-    if (fd != -1) {
-       drmVersionPtr version = drmGetVersion(fd);
-       if (version) {
-          psp->drm_version.major = version->version_major;
-          psp->drm_version.minor = version->version_minor;
-          psp->drm_version.patch = version->version_patchlevel;
-          drmFreeVersion(version);
-       }
-    }
-#endif
-
     psp->loaderPrivate = data;
 
     psp->extensions = emptyExtensionList;
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index 1138bf106de..6987f555e66 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -148,11 +148,6 @@ struct __DRIscreenRec {
      */
     int fd;
 
-    /**
-     * DRM (kernel module) version information.
-     */
-    __DRIversion drm_version;
-
     /**
      * Device-dependent private information (not stored in the SAREA).
      * 

From 78674631a2d0ff1eb538470e2a1d516201361f03 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 01:52:51 +0100
Subject: [PATCH 0446/1208] egl: remove the non-haiku scons build

It has been broken since 2011 with commit c98ea26e16b(egl: Make
egl_dri2 and egl_glx built-in drivers.). When the backends got merged
into the main library each entry point was guarded by a
_EGL_BUILT_IN_DRIVER_* define.

As the define was missing, the linker kindly removed the whole of the
dri2 backend, thus we did not notice any errors due to the unresolved
link to xcb and friends.

Cc: Chia-I Wu <olv@lunarg.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/SConscript                   |  4 ----
 src/egl/drivers/dri2/Makefile.am |  2 --
 src/egl/drivers/dri2/SConscript  | 40 --------------------------------
 src/egl/main/SConscript          | 31 +++++++------------------
 4 files changed, 8 insertions(+), 69 deletions(-)
 delete mode 100644 src/egl/drivers/dri2/SConscript

diff --git a/src/SConscript b/src/SConscript
index b0578e89258..46482fbd62e 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -31,10 +31,6 @@ SConscript('mesa/SConscript')
 if not env['embedded']:
     if env['platform'] not in ('cygwin', 'darwin', 'freebsd', 'haiku', 'windows'):
         SConscript('glx/SConscript')
-    if env['platform'] not in ['darwin', 'haiku', 'sunos', 'windows']:
-        if env['dri']:
-            SConscript('egl/drivers/dri2/SConscript')
-        SConscript('egl/main/SConscript')
     if env['platform'] == 'haiku':
         SConscript('egl/drivers/haiku/SConscript')
         SConscript('egl/main/SConscript')
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
index 55be4a75ba5..f4649de18df 100644
--- a/src/egl/drivers/dri2/Makefile.am
+++ b/src/egl/drivers/dri2/Makefile.am
@@ -69,5 +69,3 @@ if HAVE_EGL_PLATFORM_SURFACELESS
 libegl_dri2_la_SOURCES += platform_surfaceless.c
 AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
 endif
-
-EXTRA_DIST = SConscript
diff --git a/src/egl/drivers/dri2/SConscript b/src/egl/drivers/dri2/SConscript
deleted file mode 100644
index 5b03107cbb3..00000000000
--- a/src/egl/drivers/dri2/SConscript
+++ /dev/null
@@ -1,40 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPDEFINES = [
-	'DEFAULT_DRIVER_DIR=\\"\\"'
-])
-
-env.Append(CPPPATH = [
-	'#/include',
-	'#/src/egl/main',
-	'#/src/loader',
-])
-
-sources = [
-	'egl_dri2.c',
-]
-
-if env['x11']:
-	sources.append('platform_x11.c')
-	env.Append(CPPDEFINES = [
-		'HAVE_X11_PLATFORM',
-	])
-	#env.Append(CPPPATH = [
-	#	'XCB_DRI2_CFLAGS',
-	#])
-
-if env['drm']:
-	env.PkgUseModules('DRM')
-
-env.Prepend(LIBS = [
-	libloader,
-])
-
-egl_dri2 = env.ConvenienceLibrary(
-	target = 'egl_dri2',
-	source = sources,
-)
-
-Export('egl_dri2')
diff --git a/src/egl/main/SConscript b/src/egl/main/SConscript
index c0012831bb9..6fc13416a5d 100644
--- a/src/egl/main/SConscript
+++ b/src/egl/main/SConscript
@@ -10,29 +10,14 @@ env.Append(CPPDEFINES = [
     '_EGL_DRIVER_SEARCH_DIR=\\"\\"',
 ])
 
-if env['platform'] == 'haiku':
-    env.Append(CPPDEFINES = [
-        '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
-        '_EGL_OS_UNIX',
-        '_EGL_BUILT_IN_DRIVER_HAIKU',
-    ])
-    env.Prepend(LIBS = [
-        egl_haiku,
-        libloader,
-    ])
-else:
-    env.Append(CPPDEFINES = [
-        '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_X11',
-        '_EGL_OS_UNIX',
-    ])
-    if env['dri']:
-        env.Prepend(LIBS = [
-            egl_dri2,
-            libloader,
-        ])
-    # Disallow undefined symbols
-    if env['platform'] != 'darwin':
-        env.Append(SHLINKFLAGS = ['-Wl,-z,defs'])
+env.Append(CPPDEFINES = [
+    '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
+    '_EGL_OS_UNIX',
+    '_EGL_BUILT_IN_DRIVER_HAIKU',
+])
+env.Prepend(LIBS = [
+    egl_haiku,
+])
 
 env.Append(CPPPATH = [
     '#/include',

From dc1ece3748f3a97d685c9c72ad26684fd35f1944 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 22:20:38 +0100
Subject: [PATCH 0447/1208] egl: remove flatten HAVE_SHARED_GLAPI

It is simply not possible to use the dri backend without shared glapi,
as the alternative provider (libGL) is not always present. We have fixed
the build for a while now, so we can rip this out.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/drivers/dri2/Android.mk  | 1 -
 src/egl/drivers/dri2/Makefile.am | 4 ----
 src/egl/drivers/dri2/egl_dri2.c  | 8 --------
 3 files changed, 13 deletions(-)

diff --git a/src/egl/drivers/dri2/Android.mk b/src/egl/drivers/dri2/Android.mk
index 109e4d4a0d8..76be3b246ba 100644
--- a/src/egl/drivers/dri2/Android.mk
+++ b/src/egl/drivers/dri2/Android.mk
@@ -32,7 +32,6 @@ LOCAL_SRC_FILES := \
 	platform_android.c
 
 LOCAL_CFLAGS := \
-	-DHAVE_SHARED_GLAPI \
 	-DHAVE_ANDROID_PLATFORM
 
 ifeq ($(MESA_LOLLIPOP_BUILD),true)
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
index f4649de18df..b59360de123 100644
--- a/src/egl/drivers/dri2/Makefile.am
+++ b/src/egl/drivers/dri2/Makefile.am
@@ -44,10 +44,6 @@ libegl_dri2_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la \
 	$(EGL_LIB_DEPS)
 
-if HAVE_SHARED_GLAPI
-AM_CFLAGS += -DHAVE_SHARED_GLAPI
-endif
-
 if HAVE_EGL_PLATFORM_X11
 libegl_dri2_la_SOURCES += platform_x11.c
 AM_CFLAGS += -DHAVE_X11_PLATFORM
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 65194cb990c..5600e8212d0 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -2365,20 +2365,12 @@ static EGLBoolean
 dri2_load(_EGLDriver *drv)
 {
    struct dri2_egl_driver *dri2_drv = dri2_egl_driver(drv);
-#ifdef HAVE_SHARED_GLAPI
 #ifdef HAVE_ANDROID_PLATFORM
    const char *libname = "libglapi.so";
 #elif defined(__APPLE__)
    const char *libname = "libglapi.0.dylib";
 #else
    const char *libname = "libglapi.so.0";
-#endif
-#else
-   /*
-    * Both libGL.so and libglapi.so are glapi providers.  There is no way to
-    * tell which one to load.
-    */
-   const char *libname = NULL;
 #endif
    void *handle;
 

From 32debea337755cf15550844955b14c29bb3006fa Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 22:05:46 +0100
Subject: [PATCH 0448/1208] egl: remove final references of platform_null

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/Makefile.am  | 4 ----
 src/egl/main/egldisplay.c | 1 -
 src/egl/main/egldisplay.h | 1 -
 3 files changed, 6 deletions(-)

diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index 9030d272b53..ec0f88e0424 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -64,10 +64,6 @@ AM_CFLAGS += -DHAVE_DRM_PLATFORM
 libEGL_la_LIBADD += ../../gbm/libgbm.la
 endif
 
-if HAVE_EGL_PLATFORM_NULL
-AM_CFLAGS += -DHAVE_NULL_PLATFORM
-endif
-
 if HAVE_EGL_PLATFORM_SURFACELESS
 AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
 endif
diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index 24a0c7e61a7..f29f54ebbc7 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -69,7 +69,6 @@ static const struct {
    { _EGL_PLATFORM_X11, "x11" },
    { _EGL_PLATFORM_WAYLAND, "wayland" },
    { _EGL_PLATFORM_DRM, "drm" },
-   { _EGL_PLATFORM_NULL, "null" },
    { _EGL_PLATFORM_ANDROID, "android" },
    { _EGL_PLATFORM_HAIKU, "haiku" },
    { _EGL_PLATFORM_SURFACELESS, "surfaceless" },
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 8d3393564bf..2d34fb383cc 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -48,7 +48,6 @@ enum _egl_platform_type {
    _EGL_PLATFORM_X11,
    _EGL_PLATFORM_WAYLAND,
    _EGL_PLATFORM_DRM,
-   _EGL_PLATFORM_NULL,
    _EGL_PLATFORM_ANDROID,
    _EGL_PLATFORM_HAIKU,
    _EGL_PLATFORM_SURFACELESS,

From d62879565a5c8479c3cfea513aa4e90f0d90b304 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 23:36:19 +0100
Subject: [PATCH 0449/1208] egl: remove _EGL_PLATFORM_WINDOWS enum

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/egldisplay.c | 1 -
 src/egl/main/egldisplay.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index f29f54ebbc7..f6db03ab50c 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -65,7 +65,6 @@ static const struct {
    _EGLPlatformType platform;
    const char *name;
 } egl_platforms[_EGL_NUM_PLATFORMS] = {
-   { _EGL_PLATFORM_WINDOWS, "gdi" },
    { _EGL_PLATFORM_X11, "x11" },
    { _EGL_PLATFORM_WAYLAND, "wayland" },
    { _EGL_PLATFORM_DRM, "drm" },
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 2d34fb383cc..6c64980cf20 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -44,7 +44,6 @@ extern "C" {
 #endif
 
 enum _egl_platform_type {
-   _EGL_PLATFORM_WINDOWS,
    _EGL_PLATFORM_X11,
    _EGL_PLATFORM_WAYLAND,
    _EGL_PLATFORM_DRM,

From 3593f37fd7b599e217bd1f894ac671a14a058b8d Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 00:19:54 +0100
Subject: [PATCH 0450/1208] egl: remove custom string functions

Support for Windows has been removed for a while now, and virtually
every POSIX compliant system provides strcasecmp, strdup and snprintf.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/Makefile.sources |  2 --
 src/egl/main/eglapi.c         |  3 +-
 src/egl/main/egldriver.c      |  3 +-
 src/egl/main/egllog.c         |  5 ++--
 src/egl/main/eglstring.c      | 54 -----------------------------------
 src/egl/main/eglstring.h      | 50 --------------------------------
 6 files changed, 5 insertions(+), 112 deletions(-)
 delete mode 100644 src/egl/main/eglstring.c
 delete mode 100644 src/egl/main/eglstring.h

diff --git a/src/egl/main/Makefile.sources b/src/egl/main/Makefile.sources
index e39a80f14a6..b90e88b9480 100644
--- a/src/egl/main/Makefile.sources
+++ b/src/egl/main/Makefile.sources
@@ -22,8 +22,6 @@ LIBEGL_C_FILES := \
 	eglimage.h \
 	egllog.c \
 	egllog.h \
-	eglstring.c \
-	eglstring.h \
 	eglsurface.c \
 	eglsurface.h \
 	eglsync.c \
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 824e51ea55e..96bd8856c25 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -100,7 +100,6 @@
 #include "eglconfig.h"
 #include "eglimage.h"
 #include "eglsync.h"
-#include "eglstring.h"
 
 
 /**
@@ -506,7 +505,7 @@ eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
       _eglComputeVersion(disp);
       _eglCreateExtensionsString(disp);
       _eglCreateAPIsString(disp);
-      _eglsnprintf(disp->VersionString, sizeof(disp->VersionString),
+      snprintf(disp->VersionString, sizeof(disp->VersionString),
               "%d.%d (%s)", disp->Version / 10, disp->Version % 10,
               disp->Driver->Name);
    }
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 6ef79d96502..05ccd0e3796 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -39,7 +39,6 @@
 #include <stdlib.h>
 #include "c11/threads.h"
 
-#include "eglstring.h"
 #include "egldefines.h"
 #include "egldisplay.h"
 #include "egldriver.h"
@@ -135,7 +134,7 @@ _eglAddModule(const char *name)
    /* allocate a new one */
    mod = calloc(1, sizeof(*mod));
    if (mod) {
-      mod->Name = _eglstrdup(name);
+      mod->Name = strdup(name);
       if (!mod->Name) {
          free(mod);
          mod = NULL;
diff --git a/src/egl/main/egllog.c b/src/egl/main/egllog.c
index 1877d8bfd10..956946532cd 100644
--- a/src/egl/main/egllog.c
+++ b/src/egl/main/egllog.c
@@ -38,10 +38,11 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <strings.h>
 #include "c11/threads.h"
 
 #include "egllog.h"
-#include "eglstring.h"
 
 #define MAXSTRING 1000
 #define FALLBACK_LOG_LEVEL _EGL_WARNING
@@ -146,7 +147,7 @@ _eglInitLogger(void)
    log_env = getenv("EGL_LOG_LEVEL");
    if (log_env) {
       for (i = 0; level_strings[i]; i++) {
-         if (_eglstrcasecmp(log_env, level_strings[i]) == 0) {
+         if (strcasecmp(log_env, level_strings[i]) == 0) {
             level = i;
             break;
          }
diff --git a/src/egl/main/eglstring.c b/src/egl/main/eglstring.c
deleted file mode 100644
index 8b4c491ac64..00000000000
--- a/src/egl/main/eglstring.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010-2011 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * String utils.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include "eglstring.h"
-
-
-char *
-_eglstrdup(const char *s)
-{
-   if (s) {
-      size_t l = strlen(s);
-      char *s2 = malloc(l + 1);
-      if (s2)
-         strcpy(s2, s);
-      return s2;
-   }
-   return NULL;
-}
-
-
-
diff --git a/src/egl/main/eglstring.h b/src/egl/main/eglstring.h
deleted file mode 100644
index 16baa477714..00000000000
--- a/src/egl/main/eglstring.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010-2011 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLSTRING_INCLUDED
-#define EGLSTRING_INCLUDED
-
-#include <string.h>
-#include <stdio.h>
-
-#ifdef _EGL_OS_WINDOWS
-#define _eglstrcasecmp _stricmp
-#define _eglsnprintf _snprintf
-#else
-#include <strings.h> // for strcasecmp
-#define _eglstrcasecmp strcasecmp
-#define _eglsnprintf snprintf
-#endif
-
-extern char *
-_eglstrdup(const char *s);
-
-
-#endif /* EGLSTRING_INCLUDED */

From c17e01748e9efc2d638b7b5bc9d4344521334f48 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 01:57:23 +0100
Subject: [PATCH 0451/1208] egl: remove final Windows specific workaround

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/Android.mk  | 3 +--
 src/egl/main/Makefile.am | 3 +--
 src/egl/main/SConscript  | 1 -
 src/egl/main/egldriver.c | 5 -----
 4 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/egl/main/Android.mk b/src/egl/main/Android.mk
index 0ba72953960..4856528ed54 100644
--- a/src/egl/main/Android.mk
+++ b/src/egl/main/Android.mk
@@ -40,8 +40,7 @@ LOCAL_SRC_FILES := $(SOURCES)
 
 LOCAL_CFLAGS := \
 	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \
-	-D_EGL_DRIVER_SEARCH_DIR=\"/system/lib/egl\" \
-	-D_EGL_OS_UNIX=1
+	-D_EGL_DRIVER_SEARCH_DIR=\"/system/lib/egl\"
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index ec0f88e0424..77f8b34acee 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -28,8 +28,7 @@ AM_CFLAGS = \
 	$(VISIBILITY_CFLAGS) \
 	$(EGL_CFLAGS) \
 	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM) \
-	-D_EGL_DRIVER_SEARCH_DIR=\"$(libdir)/egl\" \
-	-D_EGL_OS_UNIX=1
+	-D_EGL_DRIVER_SEARCH_DIR=\"$(libdir)/egl\"
 
 lib_LTLIBRARIES = libEGL.la
 
diff --git a/src/egl/main/SConscript b/src/egl/main/SConscript
index 6fc13416a5d..b77ae783941 100644
--- a/src/egl/main/SConscript
+++ b/src/egl/main/SConscript
@@ -12,7 +12,6 @@ env.Append(CPPDEFINES = [
 
 env.Append(CPPDEFINES = [
     '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
-    '_EGL_OS_UNIX',
     '_EGL_BUILT_IN_DRIVER_HAIKU',
 ])
 env.Prepend(LIBS = [
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 05ccd0e3796..b9b21dec5ea 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -96,15 +96,10 @@ _eglLoadModule(_EGLModule *mod)
 static void
 _eglUnloadModule(_EGLModule *mod)
 {
-#if defined(_EGL_OS_UNIX)
    /* destroy the driver */
    if (mod->Driver && mod->Driver->Unload)
       mod->Driver->Unload(mod->Driver);
 
-#elif defined(_EGL_OS_WINDOWS)
-   /* XXX Windows unloads DLLs before atexit */
-#endif
-
    mod->Driver = NULL;
 }
 

From 0b915856bac4871b90c101b44a2830b9a0e22e05 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 22:16:11 +0100
Subject: [PATCH 0452/1208] egl/haiku: remove unused DEFAULT_DRIVER_DIR define

Cc: Alexander von Gluck IV <kallisti5@unixzen.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/drivers/haiku/SConscript | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/egl/drivers/haiku/SConscript b/src/egl/drivers/haiku/SConscript
index ec6020ece77..9db7ecfdd07 100644
--- a/src/egl/drivers/haiku/SConscript
+++ b/src/egl/drivers/haiku/SConscript
@@ -2,10 +2,6 @@ Import('*')
 
 env = env.Clone()
 
-env.Append(CPPDEFINES = [
-	'DEFAULT_DRIVER_DIR=\\"\\"',
-])
-
 env.Append(CPPPATH = [
 	'#/include',
 	'#/src/egl/main',

From a1202807dccc6e2ac02ff52639a49f6c2d06648d Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 01:58:12 +0100
Subject: [PATCH 0453/1208] egl: remove unused _EGL_DRIVER_SEARCH_DIR define

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/Android.mk  | 3 +--
 src/egl/main/Makefile.am | 3 +--
 src/egl/main/SConscript  | 4 ----
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/egl/main/Android.mk b/src/egl/main/Android.mk
index 4856528ed54..270c165eb99 100644
--- a/src/egl/main/Android.mk
+++ b/src/egl/main/Android.mk
@@ -39,8 +39,7 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := $(SOURCES)
 
 LOCAL_CFLAGS := \
-	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \
-	-D_EGL_DRIVER_SEARCH_DIR=\"/system/lib/egl\"
+	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index 77f8b34acee..03fb826a5e6 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -27,8 +27,7 @@ AM_CFLAGS = \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(EGL_CFLAGS) \
-	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM) \
-	-D_EGL_DRIVER_SEARCH_DIR=\"$(libdir)/egl\"
+	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM)
 
 lib_LTLIBRARIES = libEGL.la
 
diff --git a/src/egl/main/SConscript b/src/egl/main/SConscript
index b77ae783941..631ba20826a 100644
--- a/src/egl/main/SConscript
+++ b/src/egl/main/SConscript
@@ -6,10 +6,6 @@ Import('*')
 
 env = env.Clone()
 
-env.Append(CPPDEFINES = [
-    '_EGL_DRIVER_SEARCH_DIR=\\"\\"',
-])
-
 env.Append(CPPDEFINES = [
     '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
     '_EGL_BUILT_IN_DRIVER_HAIKU',

From 8e5e18ac286f0782380c72cebffd4c54c98e4ccb Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Mon, 13 Jul 2015 22:35:25 +0100
Subject: [PATCH 0454/1208] egl: automake: remove unused HAVE_XCB_DRI2 define

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/main/Makefile.am | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index 03fb826a5e6..32fed81b0c0 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -68,7 +68,6 @@ endif
 
 if HAVE_EGL_DRIVER_DRI2
 AM_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-AM_CFLAGS += -DHAVE_XCB_DRI2
 libEGL_la_LIBADD += ../drivers/dri2/libegl_dri2.la
 libEGL_la_LIBADD += $(DLOPEN_LIBS) $(LIBDRM_LIBS)
 endif

From 0399d7ab3f69624b7f0b7b39e948432959fe270e Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 30 Jun 2015 22:43:50 +0100
Subject: [PATCH 0455/1208] gbm: do not build intermittent libgbm_dri static
 library

The only user of it (libgbm.la) immediately links it. Just build it
directly into the library.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/gbm/Makefile.am | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/gbm/Makefile.am b/src/gbm/Makefile.am
index da4195df31c..9a584cab352 100644
--- a/src/gbm/Makefile.am
+++ b/src/gbm/Makefile.am
@@ -39,18 +39,15 @@ libgbm_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.l
 endif
 
 if HAVE_DRI2
-noinst_LTLIBRARIES = libgbm_dri.la
-libgbm_dri_la_SOURCES = \
+libgbm_la_SOURCES += \
 	backends/dri/gbm_dri.c \
 	backends/dri/gbm_driint.h
 
-libgbm_dri_la_CFLAGS = \
-	$(AM_CFLAGS) \
+AM_CFLAGS += \
 	-DDEFAULT_DRIVER_DIR='"$(DRI_DRIVER_SEARCH_DIR)"' \
 	$(LIBDRM_CFLAGS)
 
 libgbm_la_LIBADD += \
-	libgbm_dri.la \
 	$(LIBDRM_LIBS)
 endif
 

From e3420396124c75ec9679c4d1cf3a42c185207e5a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 01:58:33 +0100
Subject: [PATCH 0456/1208] automake: rework the EGL build

Simplify things by merging the two makefiles. This way we can combine
the duplicated HAVE_PLATFORM_ checks, and build the library without
having a separate static library.

v2: use $() when referencing variables, use correct define (Matt)

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 configure.ac                     |  3 +-
 src/Makefile.am                  |  8 +---
 src/egl/{main => }/Makefile.am   | 43 +++++++++++++++-----
 src/egl/Makefile.sources         | 34 ++++++++++++++++
 src/egl/drivers/dri2/Makefile.am | 67 --------------------------------
 5 files changed, 70 insertions(+), 85 deletions(-)
 rename src/egl/{main => }/Makefile.am (69%)
 create mode 100644 src/egl/Makefile.sources
 delete mode 100644 src/egl/drivers/dri2/Makefile.am

diff --git a/configure.ac b/configure.ac
index c25dc5d1962..480018ac536 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2296,8 +2296,7 @@ CXXFLAGS="$CXXFLAGS $USER_CXXFLAGS"
 dnl Substitute the config
 AC_CONFIG_FILES([Makefile
 		src/Makefile
-		src/egl/drivers/dri2/Makefile
-		src/egl/main/Makefile
+		src/egl/Makefile
 		src/egl/main/egl.pc
 		src/egl/wayland/wayland-drm/Makefile
 		src/egl/wayland/wayland-egl/Makefile
diff --git a/src/Makefile.am b/src/Makefile.am
index 90bf94737cf..0d49bcd19ed 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -35,16 +35,12 @@ if HAVE_EGL_PLATFORM_WAYLAND
 SUBDIRS += egl/wayland/wayland-egl egl/wayland/wayland-drm
 endif
 
-if HAVE_EGL_DRIVER_DRI2
-SUBDIRS += egl/drivers/dri2
-endif
-
 if HAVE_GBM
 SUBDIRS += gbm
 endif
 
 if HAVE_EGL
-SUBDIRS += egl/main
+SUBDIRS += egl
 endif
 
 if HAVE_GALLIUM
@@ -52,8 +48,6 @@ SUBDIRS += gallium
 endif
 
 EXTRA_DIST = \
-	egl/drivers/haiku \
-	egl/docs \
 	getopt hgl SConscript
 
 AM_CFLAGS = $(VISIBILITY_CFLAGS)
diff --git a/src/egl/main/Makefile.am b/src/egl/Makefile.am
similarity index 69%
rename from src/egl/main/Makefile.am
rename to src/egl/Makefile.am
index 32fed81b0c0..6f9abcefc38 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/Makefile.am
@@ -23,16 +23,18 @@ include Makefile.sources
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/egl/main \
 	-I$(top_srcdir)/src/gbm/main \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
+	$(LIBDRM_CFLAGS) \
 	$(EGL_CFLAGS) \
 	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM)
 
 lib_LTLIBRARIES = libEGL.la
 
 libEGL_la_SOURCES = \
-	${LIBEGL_C_FILES}
+	$(LIBEGL_C_FILES)
 
 libEGL_la_LIBADD = \
 	$(EGL_LIB_DEPS)
@@ -43,10 +45,13 @@ libEGL_la_LDFLAGS = \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
+dri2_backend_FILES =
+
 if HAVE_EGL_PLATFORM_X11
 AM_CFLAGS += -DHAVE_X11_PLATFORM
 AM_CFLAGS += $(XCB_DRI2_CFLAGS)
 libEGL_la_LIBADD += $(XCB_DRI2_LIBS)
+dri2_backend_FILES += drivers/dri2/platform_x11.c
 endif
 
 if HAVE_EGL_PLATFORM_WAYLAND
@@ -54,21 +59,37 @@ AM_CFLAGS += -DHAVE_WAYLAND_PLATFORM
 AM_CFLAGS += $(WAYLAND_CFLAGS)
 libEGL_la_LIBADD += $(WAYLAND_LIBS)
 libEGL_la_LIBADD += $(LIBDRM_LIBS)
-libEGL_la_LIBADD += ../wayland/wayland-drm/libwayland-drm.la
+libEGL_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.la
+dri2_backend_FILES += drivers/dri2/platform_wayland.c
 endif
 
 if HAVE_EGL_PLATFORM_DRM
 AM_CFLAGS += -DHAVE_DRM_PLATFORM
-libEGL_la_LIBADD += ../../gbm/libgbm.la
+libEGL_la_LIBADD += $(top_builddir)/src/gbm/libgbm.la
+dri2_backend_FILES += drivers/dri2/platform_drm.c
 endif
 
 if HAVE_EGL_PLATFORM_SURFACELESS
 AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
+dri2_backend_FILES += drivers/dri2/platform_surfaceless.c
 endif
 
 if HAVE_EGL_DRIVER_DRI2
-AM_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-libEGL_la_LIBADD += ../drivers/dri2/libegl_dri2.la
+AM_CFLAGS += \
+	-I$(top_srcdir)/src/loader \
+	-I$(top_srcdir)/src/egl/drivers/dri2 \
+	-I$(top_srcdir)/src/gbm/backends/dri \
+	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
+	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
+	-I$(top_builddir)/src/egl/wayland/wayland-drm \
+	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
+	-D_EGL_BUILT_IN_DRIVER_DRI2
+
+libEGL_la_SOURCES += \
+	$(dri2_backend_core_FILES) \
+	$(dri2_backend_FILES)
+
+libEGL_la_LIBADD += $(top_builddir)/src/loader/libloader.la
 libEGL_la_LIBADD += $(DLOPEN_LIBS) $(LIBDRM_LIBS)
 endif
 
@@ -76,7 +97,7 @@ include $(top_srcdir)/install-lib-links.mk
 
 pkgconfigdir = $(libdir)/pkgconfig
 
-pkgconfig_DATA = egl.pc
+pkgconfig_DATA = main/egl.pc
 
 khrdir = $(includedir)/KHR
 khr_HEADERS = $(top_srcdir)/include/KHR/khrplatform.h
@@ -90,6 +111,10 @@ egl_HEADERS = \
 	$(top_srcdir)/include/EGL/eglplatform.h
 
 EXTRA_DIST = \
-	egl.def \
-	README.txt \
-	SConscript
+	drivers/haiku \
+	docs \
+	main/egl.def \
+	main/README.txt \
+	main/SConscript \
+	main/Makefile.sources \
+	drivers/dri2/SConscript
diff --git a/src/egl/Makefile.sources b/src/egl/Makefile.sources
new file mode 100644
index 00000000000..48db8518f8a
--- /dev/null
+++ b/src/egl/Makefile.sources
@@ -0,0 +1,34 @@
+LIBEGL_C_FILES := \
+	main/eglapi.c \
+	main/eglapi.h \
+	main/eglarray.c \
+	main/eglarray.h \
+	main/eglcompiler.h \
+	main/eglconfig.c \
+	main/eglconfig.h \
+	main/eglcontext.c \
+	main/eglcontext.h \
+	main/eglcurrent.c \
+	main/eglcurrent.h \
+	main/egldefines.h \
+	main/egldisplay.c \
+	main/egldisplay.h \
+	main/egldriver.c \
+	main/egldriver.h \
+	main/eglfallbacks.c \
+	main/eglglobals.c \
+	main/eglglobals.h \
+	main/eglimage.c \
+	main/eglimage.h \
+	main/egllog.c \
+	main/egllog.h \
+	main/eglsurface.c \
+	main/eglsurface.h \
+	main/eglsync.c \
+	main/eglsync.h \
+	main/egltypedefs.h
+
+dri2_backend_core_FILES := \
+	drivers/dri2/egl_dri2.c \
+	drivers/dri2/egl_dri2.h \
+	drivers/dri2/egl_dri2_fallbacks.h
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
deleted file mode 100644
index b59360de123..00000000000
--- a/src/egl/drivers/dri2/Makefile.am
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright © 2012 Intel Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-AM_CFLAGS = \
-	-I$(top_srcdir)/include \
-	-I$(top_srcdir)/src/egl/main \
-	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/gbm/main \
-	-I$(top_srcdir)/src/gbm/backends/dri \
-	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
-	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
-	-I$(top_builddir)/src/egl/wayland/wayland-drm \
-	$(DEFINES) \
-	$(VISIBILITY_CFLAGS) \
-	$(LIBDRM_CFLAGS) \
-	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\"
-
-noinst_LTLIBRARIES = libegl_dri2.la
-
-libegl_dri2_la_SOURCES = \
-	egl_dri2.c \
-	egl_dri2.h \
-	egl_dri2_fallbacks.h
-
-libegl_dri2_la_LIBADD = \
-	$(top_builddir)/src/loader/libloader.la \
-	$(EGL_LIB_DEPS)
-
-if HAVE_EGL_PLATFORM_X11
-libegl_dri2_la_SOURCES += platform_x11.c
-AM_CFLAGS += -DHAVE_X11_PLATFORM
-AM_CFLAGS += $(XCB_DRI2_CFLAGS)
-endif
-
-if HAVE_EGL_PLATFORM_WAYLAND
-libegl_dri2_la_SOURCES += platform_wayland.c
-AM_CFLAGS += -DHAVE_WAYLAND_PLATFORM
-AM_CFLAGS += $(WAYLAND_CFLAGS)
-endif
-
-if HAVE_EGL_PLATFORM_DRM
-libegl_dri2_la_SOURCES += platform_drm.c
-AM_CFLAGS += -DHAVE_DRM_PLATFORM
-endif
-
-if HAVE_EGL_PLATFORM_SURFACELESS
-libegl_dri2_la_SOURCES += platform_surfaceless.c
-AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
-endif

From e7e29189e27bb404bf84d757a8f1dd617126808a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 02:04:30 +0100
Subject: [PATCH 0457/1208] scons: rework the EGL build

The scons equivalent of the previous commit - just fold the almost
identical driver + main Sconscripts.

Cc: Alexander von Gluck IV <kallisti5@unixzen.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/SConscript                   |  3 +--
 src/egl/Makefile.am              |  5 ++---
 src/egl/{main => }/SConscript    | 17 +++++++++--------
 src/egl/drivers/haiku/SConscript | 25 -------------------------
 4 files changed, 12 insertions(+), 38 deletions(-)
 rename src/egl/{main => }/SConscript (74%)
 delete mode 100644 src/egl/drivers/haiku/SConscript

diff --git a/src/SConscript b/src/SConscript
index 46482fbd62e..106b87d4251 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -32,8 +32,7 @@ if not env['embedded']:
     if env['platform'] not in ('cygwin', 'darwin', 'freebsd', 'haiku', 'windows'):
         SConscript('glx/SConscript')
     if env['platform'] == 'haiku':
-        SConscript('egl/drivers/haiku/SConscript')
-        SConscript('egl/main/SConscript')
+        SConscript('egl/SConscript')
 
     if env['gles']:
         SConscript('mapi/shared-glapi/SConscript')
diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index 6f9abcefc38..10eb1d56bdd 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -111,10 +111,9 @@ egl_HEADERS = \
 	$(top_srcdir)/include/EGL/eglplatform.h
 
 EXTRA_DIST = \
+	SConscript \
 	drivers/haiku \
 	docs \
 	main/egl.def \
 	main/README.txt \
-	main/SConscript \
-	main/Makefile.sources \
-	drivers/dri2/SConscript
+	main/Makefile.sources
diff --git a/src/egl/main/SConscript b/src/egl/SConscript
similarity index 74%
rename from src/egl/main/SConscript
rename to src/egl/SConscript
index 631ba20826a..a7f62824e14 100644
--- a/src/egl/main/SConscript
+++ b/src/egl/SConscript
@@ -6,21 +6,22 @@ Import('*')
 
 env = env.Clone()
 
-env.Append(CPPDEFINES = [
-    '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
-    '_EGL_BUILT_IN_DRIVER_HAIKU',
-])
-env.Prepend(LIBS = [
-    egl_haiku,
-])
-
 env.Append(CPPPATH = [
     '#/include',
+    '#/src/egl/main',
 ])
 
 
 # parse Makefile.sources
 egl_sources = env.ParseSourceList('Makefile.sources', 'LIBEGL_C_FILES')
+egl_sources.append(env.ParseSourceList('Makefile.sources', 'dri2_backend_core_FILES'))
+
+env.Append(CPPDEFINES = [
+    '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
+    '_EGL_BUILT_IN_DRIVER_HAIKU',
+    'HAVE_HAIKU_PLATFORM',
+])
+egl_sources.append('drivers/haiku/egl_haiku.cpp')
 
 egl = env.SharedLibrary(
     target = 'EGL',
diff --git a/src/egl/drivers/haiku/SConscript b/src/egl/drivers/haiku/SConscript
deleted file mode 100644
index 9db7ecfdd07..00000000000
--- a/src/egl/drivers/haiku/SConscript
+++ /dev/null
@@ -1,25 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPPATH = [
-	'#/include',
-	'#/src/egl/main',
-])
-
-sources = [
-	'egl_haiku.cpp'
-]
-
-if env['platform'] == 'haiku':
-	env.Append(CPPDEFINES = [
-		'HAVE_HAIKU_PLATFORM',
-		'_EGL_NATIVE_PLATFORM=haiku',
-	])
-
-egl_haiku = env.ConvenienceLibrary(
-	target = 'egl_haiku',
-	source = sources,
-)
-
-Export('egl_haiku')

From 1040a861a80486502c7ac86a258741d5cdf6459a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 00:28:10 +0100
Subject: [PATCH 0458/1208] android: rework the EGL build

See previous two commits for details.

v2: Don't forget git mv, bring back DRM_GRALLOC_TOP. Spotted by Varad.

Cc: Chih-Wei Huang <cwhuang@android-x86.org>
Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Matt Turner <mattst88@gmail.com>
Tested-by: Varad Gautam <varadgautam@gmail.com>
---
 Android.mk                      |  3 +-
 src/egl/{main => }/Android.mk   | 36 +++++++++++++------
 src/egl/drivers/dri2/Android.mk | 63 ---------------------------------
 3 files changed, 26 insertions(+), 76 deletions(-)
 rename src/egl/{main => }/Android.mk (76%)
 delete mode 100644 src/egl/drivers/dri2/Android.mk

diff --git a/Android.mk b/Android.mk
index 69e0d33f1aa..17fd5f54496 100644
--- a/Android.mk
+++ b/Android.mk
@@ -91,8 +91,7 @@ SUBDIRS := \
 	src/glsl \
 	src/mesa \
 	src/util \
-	src/egl/main \
-	src/egl/drivers/dri2 \
+	src/egl \
 	src/mesa/drivers/dri
 
 ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
diff --git a/src/egl/main/Android.mk b/src/egl/Android.mk
similarity index 76%
rename from src/egl/main/Android.mk
rename to src/egl/Android.mk
index 270c165eb99..516b53c6255 100644
--- a/src/egl/main/Android.mk
+++ b/src/egl/Android.mk
@@ -27,19 +27,37 @@ LOCAL_PATH := $(call my-dir)
 
 include $(LOCAL_PATH)/Makefile.sources
 
-SOURCES := \
-	${LIBEGL_C_FILES}
-
 # ---------------------------------------
 # Build libGLES_mesa
 # ---------------------------------------
 
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := $(SOURCES)
+LOCAL_SRC_FILES := \
+	$(LIBEGL_C_FILES) \
+	$(dri2_backend_core_FILES) \
+	drivers/dri2/platform_android.c
 
 LOCAL_CFLAGS := \
-	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID
+	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \
+	-D_EGL_BUILT_IN_DRIVER_DRI2 \
+	-DHAVE_ANDROID_PLATFORM
+
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_CFLAGS_arm := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+LOCAL_CFLAGS_x86 := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+LOCAL_CFLAGS_x86_64 := -DDEFAULT_DRIVER_DIR=\"/system/lib64/dri\"
+else
+LOCAL_CFLAGS += -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+endif
+
+LOCAL_C_INCLUDES := \
+	$(DRM_GRALLOC_TOP) \
+	$(MESA_TOP)/src/egl/main \
+	$(MESA_TOP)/src/egl/drivers/dri2 \
+
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_loader
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
@@ -53,12 +71,11 @@ LOCAL_SHARED_LIBRARIES += libsync
 endif
 
 # add libdrm if there are hardware drivers
-ifneq ($(MESA_GPU_DRIVERS),swrast)
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
 
-LOCAL_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-
 ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
 # require i915_dri and/or i965_dri
 LOCAL_REQUIRED_MODULES += \
@@ -69,9 +86,6 @@ ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
 LOCAL_REQUIRED_MODULES += gallium_dri
 endif # MESA_BUILD_GALLIUM
 
-LOCAL_STATIC_LIBRARIES := \
-	libmesa_egl_dri2 \
-	libmesa_loader
 
 LOCAL_MODULE := libGLES_mesa
 ifeq ($(MESA_LOLLIPOP_BUILD),true)
diff --git a/src/egl/drivers/dri2/Android.mk b/src/egl/drivers/dri2/Android.mk
deleted file mode 100644
index 76be3b246ba..00000000000
--- a/src/egl/drivers/dri2/Android.mk
+++ /dev/null
@@ -1,63 +0,0 @@
-# Mesa 3-D graphics library
-#
-# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
-# Copyright (C) 2010-2011 LunarG Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-# Android.mk for egl_dri2
-
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := \
-	egl_dri2.c \
-	platform_android.c
-
-LOCAL_CFLAGS := \
-	-DHAVE_ANDROID_PLATFORM
-
-ifeq ($(MESA_LOLLIPOP_BUILD),true)
-LOCAL_CFLAGS_arm := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-LOCAL_CFLAGS_x86 := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-LOCAL_CFLAGS_x86_64 := -DDEFAULT_DRIVER_DIR=\"/system/lib64/dri\"
-else
-LOCAL_CFLAGS += -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-endif
-
-LOCAL_C_INCLUDES := \
-	$(MESA_TOP)/src/mapi \
-	$(MESA_TOP)/src/egl/main \
-	$(DRM_GRALLOC_TOP)
-
-LOCAL_STATIC_LIBRARIES := \
-	libmesa_loader
-
-LOCAL_SHARED_LIBRARIES := libdrm
-
-ifeq ($(shell echo "$(MESA_ANDROID_VERSION) >= 4.2" | bc),1)
-LOCAL_SHARED_LIBRARIES += \
-	libsync
-endif
-
-LOCAL_MODULE := libmesa_egl_dri2
-
-include $(MESA_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)

From e2ef659c2ed36cca5a5c4a09440edb227eedcf60 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 14 Jul 2015 00:28:46 +0100
Subject: [PATCH 0459/1208] egl: remove old makefile.sources

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/Makefile.am           |  3 +--
 src/egl/main/Makefile.sources | 29 -----------------------------
 2 files changed, 1 insertion(+), 31 deletions(-)
 delete mode 100644 src/egl/main/Makefile.sources

diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index 10eb1d56bdd..be7bfe9c3ab 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -115,5 +115,4 @@ EXTRA_DIST = \
 	drivers/haiku \
 	docs \
 	main/egl.def \
-	main/README.txt \
-	main/Makefile.sources
+	main/README.txt
diff --git a/src/egl/main/Makefile.sources b/src/egl/main/Makefile.sources
deleted file mode 100644
index b90e88b9480..00000000000
--- a/src/egl/main/Makefile.sources
+++ /dev/null
@@ -1,29 +0,0 @@
-LIBEGL_C_FILES := \
-	eglapi.c \
-	eglapi.h \
-	eglarray.c \
-	eglarray.h \
-	eglcompiler.h \
-	eglconfig.c \
-	eglconfig.h \
-	eglcontext.c \
-	eglcontext.h \
-	eglcurrent.c \
-	eglcurrent.h \
-	egldefines.h \
-	egldisplay.c \
-	egldisplay.h \
-	egldriver.c \
-	egldriver.h \
-	eglfallbacks.c \
-	eglglobals.c \
-	eglglobals.h \
-	eglimage.c \
-	eglimage.h \
-	egllog.c \
-	egllog.h \
-	eglsurface.c \
-	eglsurface.h \
-	eglsync.c \
-	eglsync.h \
-	egltypedefs.h

From 461b4b103f545027beb59c1d747c85892c6c1f63 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 21 Jul 2015 15:34:19 +0100
Subject: [PATCH 0460/1208] egl: android: remove DRM_GRALLOC_TOP hack

Now that the drm_gralloc module exports the correct includes we can get
rid of this hack.

Cc: Chih-Wei Huang <cwhuang@android-x86.org>
Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Tested-by: Varad Gautam <varadgautam@gmail.com>
---
 Android.mk         | 2 --
 src/egl/Android.mk | 1 -
 2 files changed, 3 deletions(-)

diff --git a/Android.mk b/Android.mk
index 17fd5f54496..ed160fb3d0e 100644
--- a/Android.mk
+++ b/Android.mk
@@ -45,8 +45,6 @@ endif
 MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk
 MESA_PYTHON2 := python
 
-DRM_GRALLOC_TOP := hardware/drm_gralloc
-
 classic_drivers := i915 i965
 gallium_drivers := swrast freedreno i915g ilo nouveau r300g r600g radeonsi vmwgfx vc4
 
diff --git a/src/egl/Android.mk b/src/egl/Android.mk
index 516b53c6255..ebd67af34cc 100644
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -52,7 +52,6 @@ LOCAL_CFLAGS += -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
 endif
 
 LOCAL_C_INCLUDES := \
-	$(DRM_GRALLOC_TOP) \
 	$(MESA_TOP)/src/egl/main \
 	$(MESA_TOP)/src/egl/drivers/dri2 \
 

From b06a6852ff782bb20d9e91a3a67eccb92e856ed3 Mon Sep 17 00:00:00 2001
From: Dylan Baker <baker.dylan.c@gmail.com>
Date: Thu, 2 Jul 2015 10:25:41 -0700
Subject: [PATCH 0461/1208] glapi: fix argument parsing in glX_proto_recv.py

One of the plugins I use with vim "helpfully" added an underscore to the
front of mode for kicks.

Obviously this isn't a feature used very often because it's been broken
since d986cb7c70db (since May 20th), and no one has noticed.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
---
 src/mapi/glapi/gen/glX_proto_recv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index da468dc5876..5d95f278a91 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -549,9 +549,9 @@ def main():
     """Main function."""
     args = _parser()
 
-    if args._mode == "dispatch_c":
+    if args.mode == "dispatch_c":
         printer = PrintGlxDispatchFunctions(args.swap)
-    elif args._mode == "dispatch_h":
+    elif args.mode == "dispatch_h":
         printer = PrintGlxDispatch_h()
 
     api = gl_XML.parse_GL_API(

From 956ebf41aca6b74052cf6876cc479b404777700c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 10 Jun 2015 01:49:36 +0200
Subject: [PATCH 0462/1208] st/dri: expose sRGB visuals (v2)

v2: The fix for the darkness in Ubuntu Unity is in the hunk
    with the 4-line comment.

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/state_trackers/dri/dri_drawable.c |  7 +++++-
 src/gallium/state_trackers/dri/dri_screen.c   | 23 +++++++++++++++----
 src/mesa/state_tracker/st_manager.c           |  1 +
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index b8afe6c4d23..0d2929aaaa1 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -279,7 +279,12 @@ dri_drawable_get_format(struct dri_drawable *drawable,
    case ST_ATTACHMENT_BACK_LEFT:
    case ST_ATTACHMENT_FRONT_RIGHT:
    case ST_ATTACHMENT_BACK_RIGHT:
-      *format = drawable->stvis.color_format;
+      /* Other pieces of the driver stack get confused and behave incorrectly
+       * when they get an sRGB drawable. st/mesa receives "drawable->stvis"
+       * though other means and handles it correctly, so we don't really need
+       * to use an sRGB format here.
+       */
+      *format = util_format_linear(drawable->stvis.color_format);
       *bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
       break;
    case ST_ATTACHMENT_DEPTH_STENCIL:
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index 85393d867e4..c4c2d9c8fb1 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -103,14 +103,18 @@ dri_fill_st_options(struct st_config_options *options,
 static const __DRIconfig **
 dri_fill_in_modes(struct dri_screen *screen)
 {
-   static const mesa_format mesa_formats[3] = {
+   static const mesa_format mesa_formats[] = {
       MESA_FORMAT_B8G8R8A8_UNORM,
       MESA_FORMAT_B8G8R8X8_UNORM,
+      MESA_FORMAT_B8G8R8A8_SRGB,
+      MESA_FORMAT_B8G8R8X8_SRGB,
       MESA_FORMAT_B5G6R5_UNORM,
    };
-   static const enum pipe_format pipe_formats[3] = {
+   static const enum pipe_format pipe_formats[] = {
       PIPE_FORMAT_BGRA8888_UNORM,
       PIPE_FORMAT_BGRX8888_UNORM,
+      PIPE_FORMAT_BGRA8888_SRGB,
+      PIPE_FORMAT_BGRX8888_SRGB,
       PIPE_FORMAT_B5G6R5_UNORM,
    };
    mesa_format format;
@@ -186,6 +190,11 @@ dri_fill_in_modes(struct dri_screen *screen)
       unsigned num_msaa_modes = 0; /* includes a single-sample mode */
       uint8_t msaa_modes[MSAA_VISUAL_MAX_SAMPLES];
 
+      if (!p_screen->is_format_supported(p_screen, pipe_formats[format],
+                                         PIPE_TEXTURE_2D, 0,
+                                         PIPE_BIND_RENDER_TARGET))
+         continue;
+
       for (i = 1; i <= msaa_samples_max; i++) {
          int samples = i > 1 ? i : 0;
 
@@ -241,9 +250,15 @@ dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
 
    if (mode->redBits == 8) {
       if (mode->alphaBits == 8)
-         stvis->color_format = PIPE_FORMAT_BGRA8888_UNORM;
+         if (mode->sRGBCapable)
+            stvis->color_format = PIPE_FORMAT_BGRA8888_SRGB;
+         else
+            stvis->color_format = PIPE_FORMAT_BGRA8888_UNORM;
       else
-         stvis->color_format = PIPE_FORMAT_BGRX8888_UNORM;
+         if (mode->sRGBCapable)
+            stvis->color_format = PIPE_FORMAT_BGRX8888_SRGB;
+         else
+            stvis->color_format = PIPE_FORMAT_BGRX8888_UNORM;
    } else {
       stvis->color_format = PIPE_FORMAT_B5G6R5_UNORM;
    }
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index a2dee6298fa..2e2c8ffaed9 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -368,6 +368,7 @@ st_visual_to_context_mode(const struct st_visual *visual,
 
       mode->rgbBits = mode->redBits +
          mode->greenBits + mode->blueBits + mode->alphaBits;
+      mode->sRGBCapable = util_format_is_srgb(visual->color_format);
    }
 
    if (visual->depth_stencil_format != PIPE_FORMAT_NONE) {

From c2c2e9ab604793c6e01f85497f3f5bf645f962fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 10 Jun 2015 02:49:29 +0200
Subject: [PATCH 0463/1208] egl: implement EGL_KHR_gl_colorspace (v2)

v2: add missing "break"

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/egl_dri2.c         | 41 ++++++++++++++++++++-----
 src/egl/drivers/dri2/egl_dri2.h         |  6 ++++
 src/egl/drivers/dri2/platform_android.c |  7 +++--
 src/egl/drivers/dri2/platform_drm.c     |  9 ++++--
 src/egl/drivers/dri2/platform_wayland.c | 11 ++++---
 src/egl/drivers/dri2/platform_x11.c     | 12 ++++----
 src/egl/main/eglconfig.c                |  3 +-
 src/egl/main/eglsurface.c               | 24 +++++++++++++++
 src/egl/main/eglsurface.h               |  1 +
 9 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 5600e8212d0..bec894c50dd 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -28,6 +28,7 @@
 #define WL_HIDE_DEPRECATED
 
 #include <stdint.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -109,6 +110,18 @@ EGLint dri2_to_egl_attribute_map[] = {
    0,				/* __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE */
 };
 
+const __DRIconfig *
+dri2_get_dri_config(struct dri2_egl_config *conf, EGLint surface_type,
+                    EGLenum colorspace)
+{
+   if (colorspace == EGL_GL_COLORSPACE_SRGB_KHR)
+      return surface_type == EGL_WINDOW_BIT ? conf->dri_srgb_double_config :
+                                              conf->dri_srgb_single_config;
+   else
+      return surface_type == EGL_WINDOW_BIT ? conf->dri_double_config :
+                                              conf->dri_single_config;
+}
+
 static EGLBoolean
 dri2_match_config(const _EGLConfig *conf, const _EGLConfig *criteria)
 {
@@ -130,6 +143,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
    struct dri2_egl_display *dri2_dpy;
    _EGLConfig base;
    unsigned int attrib, value, double_buffer;
+   bool srgb = false;
    EGLint key, bind_to_texture_rgb, bind_to_texture_rgba;
    unsigned int dri_masks[4] = { 0, 0, 0, 0 };
    _EGLConfig *matching_config;
@@ -204,6 +218,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
             return NULL;
          break;
 
+      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
+         srgb = value != 0;
+         break;
+
       default:
 	 key = dri2_to_egl_attribute_map[attrib];
 	 if (key != 0)
@@ -249,28 +267,35 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
    if (num_configs == 1) {
       conf = (struct dri2_egl_config *) matching_config;
 
-      if (double_buffer && !conf->dri_double_config)
+      if (double_buffer && srgb && !conf->dri_srgb_double_config)
+         conf->dri_srgb_double_config = dri_config;
+      else if (double_buffer && !srgb && !conf->dri_double_config)
          conf->dri_double_config = dri_config;
-      else if (!double_buffer && !conf->dri_single_config)
+      else if (!double_buffer && srgb && !conf->dri_srgb_single_config)
+         conf->dri_srgb_single_config = dri_config;
+      else if (!double_buffer && !srgb && !conf->dri_single_config)
          conf->dri_single_config = dri_config;
       else
          /* a similar config type is already added (unlikely) => discard */
          return NULL;
    }
    else if (num_configs == 0) {
-      conf = malloc(sizeof *conf);
+      conf = calloc(1, sizeof *conf);
       if (conf == NULL)
          return NULL;
 
       memcpy(&conf->base, &base, sizeof base);
       if (double_buffer) {
-         conf->dri_double_config = dri_config;
-         conf->dri_single_config = NULL;
+         if (srgb)
+            conf->dri_srgb_double_config = dri_config;
+         else
+            conf->dri_double_config = dri_config;
       } else {
-         conf->dri_single_config = dri_config;
-         conf->dri_double_config = NULL;
+         if (srgb)
+            conf->dri_srgb_single_config = dri_config;
+         else
+            conf->dri_single_config = dri_config;
       }
-      conf->base.SurfaceType = 0;
       conf->base.ConfigID = config_id;
 
       _eglLinkConfig(&conf->base);
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index f0cc6da1867..0dfbc752656 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -285,6 +285,8 @@ struct dri2_egl_config
    _EGLConfig         base;
    const __DRIconfig *dri_single_config;
    const __DRIconfig *dri_double_config;
+   const __DRIconfig *dri_srgb_single_config;
+   const __DRIconfig *dri_srgb_double_config;
 };
 
 struct dri2_egl_image
@@ -357,4 +359,8 @@ dri2_initialize_surfaceless(_EGLDriver *drv, _EGLDisplay *disp);
 void
 dri2_flush_drawable_for_swapbuffers(_EGLDisplay *disp, _EGLSurface *draw);
 
+const __DRIconfig *
+dri2_get_dri_config(struct dri2_egl_config *conf, EGLint surface_type,
+                    EGLenum colorspace);
+
 #endif /* EGL_DRI2_INCLUDED */
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index fed3073088a..4abe82f63a0 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -199,6 +199,7 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
    struct dri2_egl_surface *dri2_surf;
    struct ANativeWindow *window = native_window;
+   const __DRIconfig *config;
 
    dri2_surf = calloc(1, sizeof *dri2_surf);
    if (!dri2_surf) {
@@ -230,9 +231,11 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
       window->query(window, NATIVE_WINDOW_HEIGHT, &dri2_surf->base.Height);
    }
 
+   config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                                dri2_surf->base.GLColorspace);
+
    dri2_surf->dri_drawable =
-      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen,
-					   dri2_conf->dri_double_config,
+      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
                                            dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index 0d1f4c6e0a7..a439a3be6b6 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -131,10 +131,13 @@ dri2_drm_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    }
 
    if (dri2_dpy->dri2) {
+      const __DRIconfig *config =
+         dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                             dri2_surf->base.GLColorspace);
+
       dri2_surf->dri_drawable =
-         (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-                                               dri2_conf->dri_double_config,
-                                               dri2_surf->gbm_surf);
+         (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+                                              dri2_surf->gbm_surf);
 
    } else {
       assert(dri2_dpy->swrast != NULL);
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 1e127607cdd..f4cb1857a37 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -130,6 +130,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
    struct wl_egl_window *window = native_window;
    struct dri2_egl_surface *dri2_surf;
+   const __DRIconfig *config;
 
    (void) drv;
 
@@ -162,10 +163,12 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    dri2_surf->base.Width =  -1;
    dri2_surf->base.Height = -1;
 
-   dri2_surf->dri_drawable =
-      (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-					    dri2_conf->dri_double_config,
-					    dri2_surf);
+   config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                                dri2_surf->base.GLColorspace);
+
+   dri2_surf->dri_drawable = 
+      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+                                           dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
       goto cleanup_surf;
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index ad40bd57aa6..fecd36b3e7e 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -243,12 +243,12 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    }
 
    if (dri2_dpy->dri2) {
-      dri2_surf->dri_drawable = 
-	 (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-					       type == EGL_WINDOW_BIT ?
-					       dri2_conf->dri_double_config : 
-					       dri2_conf->dri_single_config,
-					       dri2_surf);
+      const __DRIconfig *config =
+         dri2_get_dri_config(dri2_conf, type, dri2_surf->base.GLColorspace);
+
+      dri2_surf->dri_drawable =
+	 (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+					      dri2_surf);
    } else {
       assert(dri2_dpy->swrast);
       dri2_surf->dri_drawable = 
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index cf65c69b7b4..c445d9b0c92 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -83,7 +83,8 @@ _eglLinkConfig(_EGLConfig *conf)
    _EGLDisplay *dpy = conf->Display;
 
    /* sanity check */
-   assert(dpy && conf->ConfigID > 0);
+   assert(dpy);
+   assert(conf->ConfigID > 0);
 
    if (!dpy->Configs) {
       dpy->Configs = _eglCreateArray("Config", 16);
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 76c60e940dc..541353f9e0a 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -84,6 +84,22 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
 
       switch (attr) {
       /* common attributes */
+      case EGL_GL_COLORSPACE_KHR:
+         if (!dpy->Extensions.KHR_gl_colorspace) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+         switch (val) {
+         case EGL_GL_COLORSPACE_SRGB_KHR:
+         case EGL_GL_COLORSPACE_LINEAR_KHR:
+            break;
+         default:
+            err = EGL_BAD_ATTRIBUTE;
+         }
+         if (err != EGL_SUCCESS)
+            break;
+         surf->GLColorspace = val;
+         break;
       case EGL_VG_COLORSPACE:
          switch (val) {
          case EGL_VG_COLORSPACE_sRGB:
@@ -272,6 +288,7 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
    surf->RenderBuffer = renderBuffer;
    surf->VGAlphaFormat = EGL_VG_ALPHA_FORMAT_NONPRE;
    surf->VGColorspace = EGL_VG_COLORSPACE_sRGB;
+   surf->GLColorspace = EGL_GL_COLORSPACE_LINEAR_KHR;
 
    surf->MipmapLevel = 0;
    surf->MultisampleResolve = EGL_MULTISAMPLE_RESOLVE_DEFAULT;
@@ -352,6 +369,13 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
    case EGL_VG_COLORSPACE:
       *value = surface->VGColorspace;
       break;
+   case EGL_GL_COLORSPACE_KHR:
+      if (!dpy->Extensions.KHR_gl_colorspace) {
+         _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
+         return EGL_FALSE;
+      }
+      *value = surface->GLColorspace;
+      break;
    case EGL_POST_SUB_BUFFER_SUPPORTED_NV:
       *value = surface->PostSubBufferSupportedNV;
       break;
diff --git a/src/egl/main/eglsurface.h b/src/egl/main/eglsurface.h
index 74c429a9628..fc799ee43dc 100644
--- a/src/egl/main/eglsurface.h
+++ b/src/egl/main/eglsurface.h
@@ -65,6 +65,7 @@ struct _egl_surface
    EGLenum RenderBuffer;
    EGLenum VGAlphaFormat;
    EGLenum VGColorspace;
+   EGLenum GLColorspace;
 
    /* attributes set by eglSurfaceAttrib */
    EGLint MipmapLevel;

From 4f57ccd02d4c5f214c7e59e7302c1dc650cff31d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 10 Jun 2015 14:45:58 +0200
Subject: [PATCH 0464/1208] egl,dri_interface: use DRI2rendererQueryExtension
 to enable 3D textures & sRGB

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 include/GL/internal/dri_interface.h |  5 +++++
 src/egl/drivers/dri2/egl_dri2.c     | 23 +++++++++++++++++++++++
 src/egl/drivers/dri2/egl_dri2.h     |  1 +
 3 files changed, 29 insertions(+)

diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index 72660718bd8..f17b7b1e080 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1437,6 +1437,11 @@ typedef struct __DRIDriverVtableExtensionRec {
 #define __DRI2_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION  0x0008
 #define __DRI2_RENDERER_OPENGL_ES_PROFILE_VERSION             0x0009
 #define __DRI2_RENDERER_OPENGL_ES2_PROFILE_VERSION            0x000a
+#define __DRI2_RENDERER_HAS_TEXTURE_3D                        0x000b
+/* Whether there is an sRGB format support for every supported 32-bit UNORM
+ * color format.
+ */
+#define __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB                  0x000c
 
 typedef struct __DRI2rendererQueryExtensionRec __DRI2rendererQueryExtension;
 struct __DRI2rendererQueryExtensionRec {
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index bec894c50dd..e5aa396b08a 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -525,6 +525,19 @@ dri2_load_driver_swrast(_EGLDisplay *disp)
    return EGL_TRUE;
 }
 
+static unsigned
+dri2_renderer_query_integer(struct dri2_egl_display *dri2_dpy, int param)
+{
+   const __DRI2rendererQueryExtension *rendererQuery = dri2_dpy->rendererQuery;
+   unsigned int value = 0;
+
+   if (!rendererQuery ||
+       rendererQuery->queryInteger(dri2_dpy->dri_screen, param, &value) == -1)
+      return 0;
+
+   return value;
+}
+
 void
 dri2_setup_screen(_EGLDisplay *disp)
 {
@@ -555,6 +568,10 @@ dri2_setup_screen(_EGLDisplay *disp)
    disp->Extensions.KHR_surfaceless_context = EGL_TRUE;
    disp->Extensions.MESA_configless_context = EGL_TRUE;
 
+   if (dri2_renderer_query_integer(dri2_dpy,
+                                   __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB))
+      disp->Extensions.KHR_gl_colorspace = EGL_TRUE;
+
    if (dri2_dpy->dri2 && dri2_dpy->dri2->base.version >= 3) {
       disp->Extensions.KHR_create_context = EGL_TRUE;
 
@@ -592,6 +609,9 @@ dri2_setup_screen(_EGLDisplay *disp)
          disp->Extensions.KHR_gl_texture_2D_image = EGL_TRUE;
          disp->Extensions.KHR_gl_texture_cubemap_image = EGL_TRUE;
       }
+      if (dri2_renderer_query_integer(dri2_dpy,
+                                      __DRI2_RENDERER_HAS_TEXTURE_3D))
+         disp->Extensions.KHR_gl_texture_3D_image = EGL_TRUE;
 #ifdef HAVE_LIBDRM
       if (dri2_dpy->image->base.version >= 8 &&
           dri2_dpy->image->createImageFromDmaBufs) {
@@ -669,6 +689,9 @@ dri2_create_screen(_EGLDisplay *disp)
       if (strcmp(extensions[i]->name, __DRI2_FENCE) == 0) {
          dri2_dpy->fence = (__DRI2fenceExtension *) extensions[i];
       }
+      if (strcmp(extensions[i]->name, __DRI2_RENDERER_QUERY) == 0) {
+         dri2_dpy->rendererQuery = (__DRI2rendererQueryExtension *) extensions[i];
+      }
    }
 
    dri2_setup_screen(disp);
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 0dfbc752656..9aa2a8c1003 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -166,6 +166,7 @@ struct dri2_egl_display
    const __DRIrobustnessExtension *robustness;
    const __DRI2configQueryExtension *config;
    const __DRI2fenceExtension *fence;
+   const __DRI2rendererQueryExtension *rendererQuery;
    int                       fd;
 
    int                       own_device;

From 1828357629721e53a305a29047c0eb18be10915b Mon Sep 17 00:00:00 2001
From: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
Date: Wed, 10 Jun 2015 14:47:03 +0200
Subject: [PATCH 0465/1208] st/dri: enable 3D textures and sRGB colorspace for
 EGL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 .../state_trackers/dri/dri_query_renderer.c        | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/gallium/state_trackers/dri/dri_query_renderer.c b/src/gallium/state_trackers/dri/dri_query_renderer.c
index 4a28ac37b70..ea31b6c1e10 100644
--- a/src/gallium/state_trackers/dri/dri_query_renderer.c
+++ b/src/gallium/state_trackers/dri/dri_query_renderer.c
@@ -42,6 +42,20 @@ dri2_query_renderer_integer(__DRIscreen *_screen, int param,
                                                       PIPE_CAP_UMA);
       return 0;
 
+   case __DRI2_RENDERER_HAS_TEXTURE_3D:
+      value[0] =
+         screen->base.screen->get_param(screen->base.screen,
+                                        PIPE_CAP_MAX_TEXTURE_3D_LEVELS) != 0;
+      return 0;
+
+   case __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB:
+      value[0] =
+         screen->base.screen->is_format_supported(screen->base.screen,
+                                                  PIPE_FORMAT_B8G8R8A8_SRGB,
+                                                  PIPE_TEXTURE_2D, 0,
+                                                  PIPE_BIND_RENDER_TARGET);
+      return 0;
+
    default:
       return driQueryRendererIntegerCommon(_screen, param, value);
    }

From 2f50fc040c223339dc14f2975c45d35dd4513c13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 29 Apr 2015 17:57:46 +0200
Subject: [PATCH 0466/1208] docs/relnotes: document new EGL extensions and EGL
 1.5

---
 docs/relnotes/10.7.0.html | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index f39e272ffde..0d993313f34 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -54,6 +54,9 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
 <li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
+<li>EGL_KHR_gl_colorspace on r600, radeonsi, nv50, nvc0</li>
+<li>EGL_KHR_gl_texture_3D_image on r600, radeonsi, nv50, nvc0</li>
+<li>EGL 1.5 on r600, radeonsi, nv50, nvc0</li>
 </ul>
 
 <h2>Bug fixes</h2>

From d082c5324914212f76e45be497229c7a0681f706 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 17:14:07 +0200
Subject: [PATCH 0467/1208] st/mesa: don't call st_validate_state in
 BlitFramebuffer

None of the draw states are used here.
This fixes a crash in piglit: ext_framebuffer_blit/blit-early

Calling st_manager_validate_framebuffers is the minimum requirement here.

Cc: mesa-stable@lists.freedesktop.org
---
 src/mesa/state_tracker/st_cb_blit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index 6d9371852c5..139690615d6 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -39,7 +39,7 @@
 #include "st_cb_bitmap.h"
 #include "st_cb_blit.h"
 #include "st_cb_fbo.h"
-#include "st_atom.h"
+#include "st_manager.h"
 
 #include "util/u_format.h"
 
@@ -92,7 +92,7 @@ st_BlitFramebuffer(struct gl_context *ctx,
    } clip;
    struct pipe_blit_info blit;
 
-   st_validate_state(st);
+   st_manager_validate_framebuffers(st);
 
    /* Make sure bitmap rendering has landed in the framebuffers */
    st_flush_bitmap_cache(st);

From 8141b4cee514bb673e394f6fbe2cbe02e5b0faf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 4 Jul 2015 13:17:07 +0200
Subject: [PATCH 0468/1208] tgsi: allow dumping to a file directly

---
 src/gallium/auxiliary/tgsi/tgsi_dump.c    | 19 +++++++++++++++----
 src/gallium/auxiliary/tgsi/tgsi_dump.h    |  5 +++++
 src/gallium/auxiliary/util/u_dump_state.c |  7 +++----
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index c80d7a20481..8ceb5b47584 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -48,6 +48,7 @@ struct dump_ctx
    int indent;
    
    uint indentation;
+   FILE *file;
 
    void (*dump_printf)(struct dump_ctx *ctx, const char *format, ...);
 };
@@ -58,7 +59,10 @@ dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
    va_list ap;
    (void)ctx;
    va_start(ap, format);
-   _debug_vprintf(format, ap);
+   if (ctx->file)
+      vfprintf(ctx->file, format, ap);
+   else
+      _debug_vprintf(format, ap);
    va_end(ap);
 }
 
@@ -659,9 +663,7 @@ prolog(
 }
 
 void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   uint flags )
+tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file)
 {
    struct dump_ctx ctx;
 
@@ -677,10 +679,17 @@ tgsi_dump(
    ctx.indent = 0;
    ctx.dump_printf = dump_ctx_printf;
    ctx.indentation = 0;
+   ctx.file = file;
 
    tgsi_iterate_shader( tokens, &ctx.iter );
 }
 
+void
+tgsi_dump(const struct tgsi_token *tokens, uint flags)
+{
+   tgsi_dump_to_file(tokens, flags, NULL);
+}
+
 struct str_dump_ctx
 {
    struct dump_ctx base;
@@ -733,6 +742,7 @@ tgsi_dump_str(
    ctx.base.indent = 0;
    ctx.base.dump_printf = &str_dump_ctx_printf;
    ctx.base.indentation = 0;
+   ctx.base.file = NULL;
 
    ctx.str = str;
    ctx.str[0] = 0;
@@ -756,6 +766,7 @@ tgsi_dump_instruction_str(
    ctx.base.indent = 0;
    ctx.base.dump_printf = &str_dump_ctx_printf;
    ctx.base.indentation = 0;
+   ctx.base.file = NULL;
 
    ctx.str = str;
    ctx.str[0] = 0;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index bc873a54ae9..7c8f92ee7bc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -32,6 +32,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
+#include <stdio.h>
+
 #if defined __cplusplus
 extern "C" {
 #endif
@@ -43,6 +45,9 @@ tgsi_dump_str(
    char *str,
    size_t size);
 
+void
+tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file);
+
 void
 tgsi_dump(
    const struct tgsi_token *tokens,
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 58ccfba2dc4..c8713ddd711 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -426,7 +426,6 @@ util_dump_clip_state(FILE *stream, const struct pipe_clip_state *state)
 void
 util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
 {
-   char str[8192];
    unsigned i;
 
    if(!state) {
@@ -434,12 +433,12 @@ util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
       return;
    }
 
-   tgsi_dump_str(state->tokens, 0, str, sizeof(str));
-
    util_dump_struct_begin(stream, "pipe_shader_state");
 
    util_dump_member_begin(stream, "tokens");
-   util_dump_string(stream, str);
+   fprintf(stream, "\"\n");
+   tgsi_dump_to_file(state->tokens, 0, stream);
+   fprintf(stream, "\"");
    util_dump_member_end(stream);
 
    util_dump_member_begin(stream, "stream_output");

From 2d8213bfa9023b47a5fd6599596e1b02fdcdd4f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 4 Jul 2015 13:18:11 +0200
Subject: [PATCH 0469/1208] gallium/util: improve dump functions

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/util/u_dump.h         |  20 +-
 src/gallium/auxiliary/util/u_dump_defines.c |  41 ++++
 src/gallium/auxiliary/util/u_dump_state.c   | 233 +++++++++++++++-----
 3 files changed, 241 insertions(+), 53 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index 3ddf518fa2b..2598851152b 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -88,14 +88,16 @@ util_dump_tex_filter(unsigned value, boolean shortened);
 const char *
 util_dump_query_type(unsigned value, boolean shortened);
 
+const char *
+util_dump_prim_mode(unsigned value, boolean shortened);
+
 
 /*
  * p_state.h, through a FILE
  */
 
 void
-util_dump_template(FILE *stream,
-                   const struct pipe_resource *templat);
+util_dump_resource(FILE *stream, const struct pipe_resource *state);
 
 void
 util_dump_rasterizer_state(FILE *stream,
@@ -156,10 +158,20 @@ util_dump_surface(FILE *stream,
 void
 util_dump_image_view(FILE *stream, const struct pipe_image_view *state);
 
+void
+util_dump_sampler_view(FILE *stream, const struct pipe_sampler_view *state);
+
 void
 util_dump_transfer(FILE *stream,
                    const struct pipe_transfer *state);
 
+void
+util_dump_constant_buffer(FILE *stream,
+                          const struct pipe_constant_buffer *state);
+
+void
+util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state);
+
 void
 util_dump_vertex_buffer(FILE *stream,
                         const struct pipe_vertex_buffer *state);
@@ -168,6 +180,10 @@ void
 util_dump_vertex_element(FILE *stream,
                          const struct pipe_vertex_element *state);
 
+void
+util_dump_stream_output_target(FILE *stream,
+                               const struct pipe_stream_output_target *state);
+
 void
 util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state);
 
diff --git a/src/gallium/auxiliary/util/u_dump_defines.c b/src/gallium/auxiliary/util/u_dump_defines.c
index 03fd15d0c44..3ddc9554b50 100644
--- a/src/gallium/auxiliary/util/u_dump_defines.c
+++ b/src/gallium/auxiliary/util/u_dump_defines.c
@@ -392,3 +392,44 @@ util_dump_query_type_short_names[] = {
 };
 
 DEFINE_UTIL_DUMP_CONTINUOUS(query_type)
+
+
+static const char *
+util_dump_prim_mode_names[] = {
+   "PIPE_PRIM_POINTS",
+   "PIPE_PRIM_LINES",
+   "PIPE_PRIM_LINE_LOOP",
+   "PIPE_PRIM_LINE_STRIP",
+   "PIPE_PRIM_TRIANGLES",
+   "PIPE_PRIM_TRIANGLE_STRIP",
+   "PIPE_PRIM_TRIANGLE_FAN",
+   "PIPE_PRIM_QUADS",
+   "PIPE_PRIM_QUAD_STRIP",
+   "PIPE_PRIM_POLYGON",
+   "PIPE_PRIM_LINES_ADJACENCY",
+   "PIPE_PRIM_LINE_STRIP_ADJACENCY",
+   "PIPE_PRIM_TRIANGLES_ADJACENCY",
+   "PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY",
+   "PIPE_PRIM_PATCHES",
+};
+
+static const char *
+util_dump_prim_mode_short_names[] = {
+   "points",
+   "lines",
+   "line_loop",
+   "line_strip",
+   "triangles",
+   "triangle_strip",
+   "triangle_fan",
+   "quads",
+   "quad_strip",
+   "polygon",
+   "lines_adjacency",
+   "line_strip_adjacency",
+   "triangles_adjacency",
+   "triangle_strip_adjacency",
+   "patches",
+};
+
+DEFINE_UTIL_DUMP_CONTINUOUS(prim_mode)
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index c8713ddd711..441d16236b5 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -247,6 +247,42 @@ util_dump_enum_func(FILE *stream, unsigned value)
    util_dump_enum(stream, util_dump_func(value, TRUE));
 }
 
+static void
+util_dump_enum_prim_mode(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_prim_mode(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_target(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_target(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_filter(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_filter(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_mipfilter(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_mipfilter(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_wrap(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_wrap(value, TRUE));
+}
+
+static void
+util_dump_enum_stencil_op(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_stencil_op(value, TRUE));
+}
+
 
 /*
  * Public functions
@@ -254,38 +290,28 @@ util_dump_enum_func(FILE *stream, unsigned value)
 
 
 void
-util_dump_template(FILE *stream, const struct pipe_resource *templat)
+util_dump_resource(FILE *stream, const struct pipe_resource *state)
 {
-   if(!templat) {
+   if (!state) {
       util_dump_null(stream);
       return;
    }
 
    util_dump_struct_begin(stream, "pipe_resource");
 
-   util_dump_member(stream, int, templat, target);
-   util_dump_member(stream, format, templat, format);
+   util_dump_member(stream, enum_tex_target, state, target);
+   util_dump_member(stream, format, state, format);
 
-   util_dump_member_begin(stream, "width");
-   util_dump_uint(stream, templat->width0);
-   util_dump_member_end(stream);
+   util_dump_member(stream, uint, state, width0);
+   util_dump_member(stream, uint, state, height0);
+   util_dump_member(stream, uint, state, depth0);
+   util_dump_member(stream, uint, state, array_size);
 
-   util_dump_member_begin(stream, "height");
-   util_dump_uint(stream, templat->height0);
-   util_dump_member_end(stream);
-
-   util_dump_member_begin(stream, "depth");
-   util_dump_uint(stream, templat->depth0);
-   util_dump_member_end(stream);
-
-   util_dump_member_begin(stream, "array_size");
-   util_dump_uint(stream, templat->array_size);
-   util_dump_member_end(stream);
-
-   util_dump_member(stream, uint, templat, last_level);
-   util_dump_member(stream, uint, templat, usage);
-   util_dump_member(stream, uint, templat, bind);
-   util_dump_member(stream, uint, templat, flags);
+   util_dump_member(stream, uint, state, last_level);
+   util_dump_member(stream, uint, state, nr_samples);
+   util_dump_member(stream, uint, state, usage);
+   util_dump_member(stream, uint, state, bind);
+   util_dump_member(stream, uint, state, flags);
 
    util_dump_struct_end(stream);
 }
@@ -319,6 +345,7 @@ util_dump_rasterizer_state(FILE *stream, const struct pipe_rasterizer_state *sta
    util_dump_member(stream, uint, state, sprite_coord_enable);
    util_dump_member(stream, bool, state, sprite_coord_mode);
    util_dump_member(stream, bool, state, point_quad_rasterization);
+   util_dump_member(stream, bool, state, point_tri_clip);
    util_dump_member(stream, bool, state, point_size_per_vertex);
    util_dump_member(stream, bool, state, multisample);
    util_dump_member(stream, bool, state, line_smooth);
@@ -331,6 +358,7 @@ util_dump_rasterizer_state(FILE *stream, const struct pipe_rasterizer_state *sta
    util_dump_member(stream, bool, state, bottom_edge_rule);
    util_dump_member(stream, bool, state, rasterizer_discard);
    util_dump_member(stream, bool, state, depth_clip);
+   util_dump_member(stream, bool, state, clip_halfz);
    util_dump_member(stream, uint, state, clip_plane_enable);
 
    util_dump_member(stream, float, state, line_width);
@@ -441,25 +469,27 @@ util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
    fprintf(stream, "\"");
    util_dump_member_end(stream);
 
-   util_dump_member_begin(stream, "stream_output");
-   util_dump_struct_begin(stream, "pipe_stream_output_info");
-   util_dump_member(stream, uint, &state->stream_output, num_outputs);
-   util_dump_array(stream, uint, state->stream_output.stride,
-                   Elements(state->stream_output.stride));
-   util_dump_array_begin(stream);
-   for(i = 0; i < state->stream_output.num_outputs; ++i) {
-      util_dump_elem_begin(stream);
-      util_dump_struct_begin(stream, ""); /* anonymous */
-      util_dump_member(stream, uint, &state->stream_output.output[i], register_index);
-      util_dump_member(stream, uint, &state->stream_output.output[i], start_component);
-      util_dump_member(stream, uint, &state->stream_output.output[i], num_components);
-      util_dump_member(stream, uint, &state->stream_output.output[i], output_buffer);
+   if (state->stream_output.num_outputs) {
+      util_dump_member_begin(stream, "stream_output");
+      util_dump_struct_begin(stream, "pipe_stream_output_info");
+      util_dump_member(stream, uint, &state->stream_output, num_outputs);
+      util_dump_array(stream, uint, state->stream_output.stride,
+                      Elements(state->stream_output.stride));
+      util_dump_array_begin(stream);
+      for(i = 0; i < state->stream_output.num_outputs; ++i) {
+         util_dump_elem_begin(stream);
+         util_dump_struct_begin(stream, ""); /* anonymous */
+         util_dump_member(stream, uint, &state->stream_output.output[i], register_index);
+         util_dump_member(stream, uint, &state->stream_output.output[i], start_component);
+         util_dump_member(stream, uint, &state->stream_output.output[i], num_components);
+         util_dump_member(stream, uint, &state->stream_output.output[i], output_buffer);
+         util_dump_struct_end(stream);
+         util_dump_elem_end(stream);
+      }
+      util_dump_array_end(stream);
       util_dump_struct_end(stream);
-      util_dump_elem_end(stream);
+      util_dump_member_end(stream);
    }
-   util_dump_array_end(stream);
-   util_dump_struct_end(stream);
-   util_dump_member_end(stream);
 
    util_dump_struct_end(stream);
 }
@@ -495,9 +525,12 @@ util_dump_depth_stencil_alpha_state(FILE *stream, const struct pipe_depth_stenci
       util_dump_member(stream, bool, &state->stencil[i], enabled);
       if (state->stencil[i].enabled) {
          util_dump_member(stream, enum_func, &state->stencil[i], func);
-         util_dump_member(stream, uint, &state->stencil[i], fail_op);
-         util_dump_member(stream, uint, &state->stencil[i], zpass_op);
-         util_dump_member(stream, uint, &state->stencil[i], zfail_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], fail_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], zpass_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], zfail_op);
          util_dump_member(stream, uint, &state->stencil[i], valuemask);
          util_dump_member(stream, uint, &state->stencil[i], writemask);
       }
@@ -554,6 +587,8 @@ util_dump_blend_state(FILE *stream, const struct pipe_blend_state *state)
    util_dump_struct_begin(stream, "pipe_blend_state");
 
    util_dump_member(stream, bool, state, dither);
+   util_dump_member(stream, bool, state, alpha_to_coverage);
+   util_dump_member(stream, bool, state, alpha_to_one);
 
    util_dump_member(stream, bool, state, logicop_enable);
    if (state->logicop_enable) {
@@ -628,16 +663,17 @@ util_dump_sampler_state(FILE *stream, const struct pipe_sampler_state *state)
 
    util_dump_struct_begin(stream, "pipe_sampler_state");
 
-   util_dump_member(stream, uint, state, wrap_s);
-   util_dump_member(stream, uint, state, wrap_t);
-   util_dump_member(stream, uint, state, wrap_r);
-   util_dump_member(stream, uint, state, min_img_filter);
-   util_dump_member(stream, uint, state, min_mip_filter);
-   util_dump_member(stream, uint, state, mag_img_filter);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_s);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_t);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_r);
+   util_dump_member(stream, enum_tex_filter, state, min_img_filter);
+   util_dump_member(stream, enum_tex_mipfilter, state, min_mip_filter);
+   util_dump_member(stream, enum_tex_filter, state, mag_img_filter);
    util_dump_member(stream, uint, state, compare_mode);
    util_dump_member(stream, enum_func, state, compare_func);
    util_dump_member(stream, bool, state, normalized_coords);
    util_dump_member(stream, uint, state, max_anisotropy);
+   util_dump_member(stream, bool, state, seamless_cube_map);
    util_dump_member(stream, float, state, lod_bias);
    util_dump_member(stream, float, state, min_lod);
    util_dump_member(stream, float, state, max_lod);
@@ -697,6 +733,40 @@ util_dump_image_view(FILE *stream, const struct pipe_image_view *state)
 }
 
 
+void
+util_dump_sampler_view(FILE *stream, const struct pipe_sampler_view *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_sampler_view");
+
+   util_dump_member(stream, enum_tex_target, state, target);
+   util_dump_member(stream, format, state, format);
+   util_dump_member(stream, ptr, state, texture);
+
+   if (state->target == PIPE_BUFFER) {
+      util_dump_member(stream, uint, state, u.buf.first_element);
+      util_dump_member(stream, uint, state, u.buf.last_element);
+   }
+   else {
+      util_dump_member(stream, uint, state, u.tex.first_layer);
+      util_dump_member(stream, uint, state, u.tex.last_layer);
+      util_dump_member(stream, uint, state, u.tex.last_level);
+      util_dump_member(stream, uint, state, u.tex.last_level);
+   }
+
+   util_dump_member(stream, uint, state, swizzle_r);
+   util_dump_member(stream, uint, state, swizzle_g);
+   util_dump_member(stream, uint, state, swizzle_b);
+   util_dump_member(stream, uint, state, swizzle_a);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_transfer(FILE *stream, const struct pipe_transfer *state)
 {
@@ -720,6 +790,45 @@ util_dump_transfer(FILE *stream, const struct pipe_transfer *state)
 }
 
 
+void
+util_dump_constant_buffer(FILE *stream,
+                          const struct pipe_constant_buffer *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_constant_buffer");
+
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, uint, state, buffer_offset);
+   util_dump_member(stream, uint, state, buffer_size);
+   util_dump_member(stream, ptr, state, user_buffer);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_index_buffer");
+
+   util_dump_member(stream, uint, state, index_size);
+   util_dump_member(stream, uint, state, offset);
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, ptr, state, user_buffer);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_vertex_buffer(FILE *stream, const struct pipe_vertex_buffer *state)
 {
@@ -733,6 +842,7 @@ util_dump_vertex_buffer(FILE *stream, const struct pipe_vertex_buffer *state)
    util_dump_member(stream, uint, state, stride);
    util_dump_member(stream, uint, state, buffer_offset);
    util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, ptr, state, user_buffer);
 
    util_dump_struct_end(stream);
 }
@@ -757,6 +867,25 @@ util_dump_vertex_element(FILE *stream, const struct pipe_vertex_element *state)
 }
 
 
+void
+util_dump_stream_output_target(FILE *stream,
+                               const struct pipe_stream_output_target *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_stream_output_target");
+
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, uint, state, buffer_offset);
+   util_dump_member(stream, uint, state, buffer_size);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state)
 {
@@ -769,7 +898,7 @@ util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state)
 
    util_dump_member(stream, bool, state, indexed);
 
-   util_dump_member(stream, uint, state, mode);
+   util_dump_member(stream, enum_prim_mode, state, mode);
    util_dump_member(stream, uint, state, start);
    util_dump_member(stream, uint, state, count);
 
@@ -856,12 +985,14 @@ void util_dump_blit_info(FILE *stream, const struct pipe_blit_info *info)
    util_dump_member_begin(stream, "mask");
    util_dump_string(stream, mask);
    util_dump_member_end(stream);
-   util_dump_member(stream, uint, info, filter);
+   util_dump_member(stream, enum_tex_filter, info, filter);
 
    util_dump_member(stream, bool, info, scissor_enable);
    util_dump_member_begin(stream, "scissor");
    util_dump_scissor_state(stream, &info->scissor);
    util_dump_member_end(stream);
 
+   util_dump_member(stream, bool, info, render_condition_enable);
+
    util_dump_struct_end(stream);
 }

From 3639d66a473591e21aa2ec7692c95c827b479632 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 15:53:10 +0200
Subject: [PATCH 0470/1208] cso: only allow saving and restoring fragment
 sampler views

Not needed for other shader stages.
---
 src/gallium/auxiliary/cso_cache/cso_context.c | 103 +++++++++---------
 src/gallium/auxiliary/cso_cache/cso_context.h |   4 +-
 src/gallium/auxiliary/hud/hud_context.c       |   4 +-
 src/gallium/auxiliary/postprocess/pp_run.c    |   4 +-
 src/gallium/auxiliary/util/u_blit.c           |   4 +-
 src/mesa/state_tracker/st_cb_bitmap.c         |   4 +-
 src/mesa/state_tracker/st_cb_drawpixels.c     |   4 +-
 7 files changed, 63 insertions(+), 64 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 5aef2d5c487..1705175d660 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -66,12 +66,6 @@ struct sampler_info
 
    void *samplers_saved[PIPE_MAX_SAMPLERS];
    unsigned nr_samplers_saved;
-
-   struct pipe_sampler_view *views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   unsigned nr_views;
-
-   struct pipe_sampler_view *views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   unsigned nr_views_saved;
 };
 
 
@@ -85,6 +79,12 @@ struct cso_context {
    boolean has_tessellation;
    boolean has_streamout;
 
+   struct pipe_sampler_view *fragment_views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+   unsigned nr_fragment_views;
+
+   struct pipe_sampler_view *fragment_views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+   unsigned nr_fragment_views_saved;
+
    struct sampler_info samplers[PIPE_SHADER_TYPES];
 
    struct pipe_vertex_buffer aux_vertex_buffer_current;
@@ -297,7 +297,7 @@ out:
  */
 void cso_destroy_context( struct cso_context *ctx )
 {
-   unsigned i, shader;
+   unsigned i;
 
    if (ctx->pipe) {
       ctx->pipe->set_index_buffer(ctx->pipe, NULL);
@@ -347,13 +347,9 @@ void cso_destroy_context( struct cso_context *ctx )
          ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, NULL);
    }
 
-   /* free sampler views for each shader stage */
-   for (shader = 0; shader < Elements(ctx->samplers); shader++) {
-      struct sampler_info *info = &ctx->samplers[shader];
-      for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
-         pipe_sampler_view_reference(&info->views[i], NULL);
-         pipe_sampler_view_reference(&info->views_saved[i], NULL);
-      }
+   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_views_saved[i], NULL);
    }
 
    util_unreference_framebuffer_state(&ctx->fb);
@@ -1281,71 +1277,74 @@ cso_set_sampler_views(struct cso_context *ctx,
                       unsigned count,
                       struct pipe_sampler_view **views)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i;
-   boolean any_change = FALSE;
+   if (shader_stage == PIPE_SHADER_FRAGMENT) {
+      unsigned i;
+      boolean any_change = FALSE;
 
-   /* reference new views */
-   for (i = 0; i < count; i++) {
-      any_change |= info->views[i] != views[i];
-      pipe_sampler_view_reference(&info->views[i], views[i]);
-   }
-   /* unref extra old views, if any */
-   for (; i < info->nr_views; i++) {
-      any_change |= info->views[i] != NULL;
-      pipe_sampler_view_reference(&info->views[i], NULL);
-   }
+      /* reference new views */
+      for (i = 0; i < count; i++) {
+         any_change |= ctx->fragment_views[i] != views[i];
+         pipe_sampler_view_reference(&ctx->fragment_views[i], views[i]);
+      }
+      /* unref extra old views, if any */
+      for (; i < ctx->nr_fragment_views; i++) {
+         any_change |= ctx->fragment_views[i] != NULL;
+         pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+      }
 
-   /* bind the new sampler views */
-   if (any_change) {
-      ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0,
-                                   MAX2(info->nr_views, count),
-                                   info->views);
-   }
+      /* bind the new sampler views */
+      if (any_change) {
+         ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0,
+                                      MAX2(ctx->nr_fragment_views, count),
+                                      ctx->fragment_views);
+      }
 
-   info->nr_views = count;
+      ctx->nr_fragment_views = count;
+   }
+   else
+      ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0, count, views);
 }
 
 
 void
-cso_save_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+cso_save_fragment_sampler_views(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
    unsigned i;
 
-   info->nr_views_saved = info->nr_views;
+   ctx->nr_fragment_views_saved = ctx->nr_fragment_views;
 
-   for (i = 0; i < info->nr_views; i++) {
-      assert(!info->views_saved[i]);
-      pipe_sampler_view_reference(&info->views_saved[i], info->views[i]);
+   for (i = 0; i < ctx->nr_fragment_views; i++) {
+      assert(!ctx->fragment_views_saved[i]);
+      pipe_sampler_view_reference(&ctx->fragment_views_saved[i],
+                                  ctx->fragment_views[i]);
    }
 }
 
 
 void
-cso_restore_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+cso_restore_fragment_sampler_views(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i, nr_saved = info->nr_views_saved;
+   unsigned i, nr_saved = ctx->nr_fragment_views_saved;
    unsigned num;
 
    for (i = 0; i < nr_saved; i++) {
-      pipe_sampler_view_reference(&info->views[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
       /* move the reference from one pointer to another */
-      info->views[i] = info->views_saved[i];
-      info->views_saved[i] = NULL;
+      ctx->fragment_views[i] = ctx->fragment_views_saved[i];
+      ctx->fragment_views_saved[i] = NULL;
    }
-   for (; i < info->nr_views; i++) {
-      pipe_sampler_view_reference(&info->views[i], NULL);
+   for (; i < ctx->nr_fragment_views; i++) {
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
    }
 
-   num = MAX2(info->nr_views, nr_saved);
+   num = MAX2(ctx->nr_fragment_views, nr_saved);
 
    /* bind the old/saved sampler views */
-   ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0, num, info->views);
+   ctx->pipe->set_sampler_views(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, num,
+                                ctx->fragment_views);
 
-   info->nr_views = nr_saved;
-   info->nr_views_saved = 0;
+   ctx->nr_fragment_views = nr_saved;
+   ctx->nr_fragment_views_saved = 0;
 }
 
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index cc50b60c6cd..9d12aaf45be 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -210,10 +210,10 @@ cso_set_sampler_views(struct cso_context *cso,
                       struct pipe_sampler_view **views);
 
 void
-cso_save_sampler_views(struct cso_context *cso, unsigned shader_stage);
+cso_save_fragment_sampler_views(struct cso_context *ctx);
 
 void
-cso_restore_sampler_views(struct cso_context *cso, unsigned shader_stage);
+cso_restore_fragment_sampler_views(struct cso_context *ctx);
 
 
 /* constant buffers */
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index bd571907f2f..4602b7c23c9 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -437,7 +437,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_save_blend(cso);
    cso_save_depth_stencil_alpha(cso);
    cso_save_fragment_shader(cso);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(cso);
    cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
@@ -567,7 +567,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_restore_blend(cso);
    cso_restore_depth_stencil_alpha(cso);
    cso_restore_fragment_shader(cso);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c
index e76ce854442..04f92c99236 100644
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -126,7 +126,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_save_sample_mask(cso);
    cso_save_min_samples(cso);
    cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(cso);
    cso_save_stencil_ref(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_elements(cso);
@@ -197,7 +197,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_restore_sample_mask(cso);
    cso_restore_min_samples(cso);
    cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_stencil_ref(cso);
    cso_restore_stream_outputs(cso);
    cso_restore_vertex_elements(cso);
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 0ba00a3997a..049ab35955a 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -547,7 +547,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_sample_mask(ctx->cso);
    cso_save_min_samples(ctx->cso);
    cso_save_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(ctx->cso);
    cso_save_stream_outputs(ctx->cso);
    cso_save_viewport(ctx->cso);
    cso_save_framebuffer(ctx->cso);
@@ -629,7 +629,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_sample_mask(ctx->cso);
    cso_restore_min_samples(ctx->cso);
    cso_restore_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_viewport(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index c881e194f70..e1445585b53 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -447,7 +447,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    cso_save_rasterizer(cso);
    cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(cso);
    cso_save_viewport(cso);
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
@@ -536,7 +536,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* restore state */
    cso_restore_rasterizer(cso);
    cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_viewport(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index e736d4b5083..9f1a37ff82c 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -690,7 +690,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
    cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(cso);
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
@@ -818,7 +818,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
    cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
    cso_restore_tessctrl_shader(cso);

From 4e8bbed926729fe280701412d85aff64ab79856c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 16:10:54 +0200
Subject: [PATCH 0471/1208] cso: drop inefficient checking for redundant
 sampler state changes

Drivers can do this better, because they can skip redundant state changes
at per-slot granularity.
---
 src/gallium/auxiliary/cso_cache/cso_context.c | 28 ++-----------------
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 1705175d660..969a9a41650 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -56,11 +56,6 @@
  */
 struct sampler_info
 {
-   struct {
-      void *samplers[PIPE_MAX_SAMPLERS];
-      unsigned nr_samplers;
-   } hw;
-
    void *samplers[PIPE_MAX_SAMPLERS];
    unsigned nr_samplers;
 
@@ -1187,27 +1182,8 @@ single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
    }
 
    info->nr_samplers = i;
-
-   if (info->hw.nr_samplers != info->nr_samplers ||
-       memcmp(info->hw.samplers,
-              info->samplers,
-              info->nr_samplers * sizeof(void *)) != 0)
-   {
-      memcpy(info->hw.samplers,
-             info->samplers,
-             info->nr_samplers * sizeof(void *));
-
-      /* set remaining slots/pointers to null */
-      for (i = info->nr_samplers; i < info->hw.nr_samplers; i++)
-         info->samplers[i] = NULL;
-
-      ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0,
-                                     MAX2(info->nr_samplers,
-                                          info->hw.nr_samplers),
-                                     info->samplers);
-
-      info->hw.nr_samplers = info->nr_samplers;
-   }
+   ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0, i,
+                                  info->samplers);
 }
 
 void

From b7492a1f45866a01b00263f9e252ddc3835304e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 16:32:49 +0200
Subject: [PATCH 0472/1208] cso: only allow saving and restoring fragment
 sampler states

---
 src/gallium/auxiliary/cso_cache/cso_context.c | 28 +++++++++++--------
 src/gallium/auxiliary/cso_cache/cso_context.h |  4 +--
 src/gallium/auxiliary/hud/hud_context.c       |  4 +--
 src/gallium/auxiliary/postprocess/pp_run.c    |  4 +--
 src/gallium/auxiliary/util/u_blit.c           |  4 +--
 src/mesa/state_tracker/st_cb_bitmap.c         |  4 +--
 src/mesa/state_tracker/st_cb_drawpixels.c     |  4 +--
 7 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 969a9a41650..46055a099dc 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -58,9 +58,6 @@ struct sampler_info
 {
    void *samplers[PIPE_MAX_SAMPLERS];
    unsigned nr_samplers;
-
-   void *samplers_saved[PIPE_MAX_SAMPLERS];
-   unsigned nr_samplers_saved;
 };
 
 
@@ -80,6 +77,9 @@ struct cso_context {
    struct pipe_sampler_view *fragment_views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    unsigned nr_fragment_views_saved;
 
+   void *fragment_samplers_saved[PIPE_MAX_SAMPLERS];
+   unsigned nr_fragment_samplers_saved;
+
    struct sampler_info samplers[PIPE_SHADER_TYPES];
 
    struct pipe_vertex_buffer aux_vertex_buffer_current;
@@ -1229,21 +1229,25 @@ cso_set_samplers(struct cso_context *ctx,
 }
 
 void
-cso_save_samplers(struct cso_context *ctx, unsigned shader_stage)
+cso_save_fragment_samplers(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   info->nr_samplers_saved = info->nr_samplers;
-   memcpy(info->samplers_saved, info->samplers, sizeof(info->samplers));
+   struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+
+   ctx->nr_fragment_samplers_saved = info->nr_samplers;
+   memcpy(ctx->fragment_samplers_saved, info->samplers,
+          sizeof(info->samplers));
 }
 
 
 void
-cso_restore_samplers(struct cso_context *ctx, unsigned shader_stage)
+cso_restore_fragment_samplers(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   info->nr_samplers = info->nr_samplers_saved;
-   memcpy(info->samplers, info->samplers_saved, sizeof(info->samplers));
-   single_sampler_done(ctx, shader_stage);
+   struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+
+   info->nr_samplers = ctx->nr_fragment_samplers_saved;
+   memcpy(info->samplers, ctx->fragment_samplers_saved,
+          sizeof(info->samplers));
+   single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 9d12aaf45be..c9a422698a2 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -72,10 +72,10 @@ cso_set_samplers(struct cso_context *cso,
                  const struct pipe_sampler_state **states);
 
 void
-cso_save_samplers(struct cso_context *cso, unsigned shader_stage);
+cso_save_fragment_samplers(struct cso_context *cso);
 
 void
-cso_restore_samplers(struct cso_context *cso, unsigned shader_stage);
+cso_restore_fragment_samplers(struct cso_context *cso);
 
 /* Alternate interface to support state trackers that like to modify
  * samplers one at a time:
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 4602b7c23c9..4631cd3b323 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -438,7 +438,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_save_depth_stencil_alpha(cso);
    cso_save_fragment_shader(cso);
    cso_save_fragment_sampler_views(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
    cso_save_stream_outputs(cso);
@@ -568,7 +568,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_restore_depth_stencil_alpha(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_fragment_sampler_views(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
    cso_restore_stream_outputs(cso);
diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c
index 04f92c99236..caa2062f4cf 100644
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -125,7 +125,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_save_rasterizer(cso);
    cso_save_sample_mask(cso);
    cso_save_min_samples(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
    cso_save_fragment_sampler_views(cso);
    cso_save_stencil_ref(cso);
    cso_save_stream_outputs(cso);
@@ -196,7 +196,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_restore_rasterizer(cso);
    cso_restore_sample_mask(cso);
    cso_restore_min_samples(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
    cso_restore_fragment_sampler_views(cso);
    cso_restore_stencil_ref(cso);
    cso_restore_stream_outputs(cso);
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 049ab35955a..70782a1aea9 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -546,7 +546,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_rasterizer(ctx->cso);
    cso_save_sample_mask(ctx->cso);
    cso_save_min_samples(ctx->cso);
-   cso_save_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(ctx->cso);
    cso_save_fragment_sampler_views(ctx->cso);
    cso_save_stream_outputs(ctx->cso);
    cso_save_viewport(ctx->cso);
@@ -628,7 +628,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_rasterizer(ctx->cso);
    cso_restore_sample_mask(ctx->cso);
    cso_restore_min_samples(ctx->cso);
-   cso_restore_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(ctx->cso);
    cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_viewport(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index e1445585b53..01a96c18264 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -446,7 +446,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    assert(height <= (GLsizei)maxSize);
 
    cso_save_rasterizer(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
    cso_save_fragment_sampler_views(cso);
    cso_save_viewport(cso);
    cso_save_fragment_shader(cso);
@@ -535,7 +535,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    /* restore state */
    cso_restore_rasterizer(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
    cso_restore_fragment_sampler_views(cso);
    cso_restore_viewport(cso);
    cso_restore_fragment_shader(cso);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 9f1a37ff82c..f67f00d504f 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -689,7 +689,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
    cso_save_fragment_sampler_views(cso);
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
@@ -817,7 +817,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* restore state */
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
    cso_restore_fragment_sampler_views(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);

From 4ef7d93a941257b18506eae056631e8f4a11f893 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 16:34:59 +0200
Subject: [PATCH 0473/1208] cso: remove clip state handling

There is no need for this.

v2: handle redundant clip state changes in st/mesa
---
 src/gallium/auxiliary/cso_cache/cso_context.c | 44 -------------------
 src/gallium/auxiliary/cso_cache/cso_context.h | 13 ------
 src/mesa/state_tracker/st_atom_clip.c         |  7 ++-
 3 files changed, 5 insertions(+), 59 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 46055a099dc..9ee5bdcc942 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -111,9 +111,6 @@ struct cso_context {
    uint render_condition_mode, render_condition_mode_saved;
    boolean render_condition_cond, render_condition_cond_saved;
 
-   struct pipe_clip_state clip;
-   struct pipe_clip_state clip_saved;
-
    struct pipe_framebuffer_state fb, fb_saved;
    struct pipe_viewport_state vp, vp_saved;
    struct pipe_blend_color blend_color;
@@ -910,47 +907,6 @@ void cso_restore_tesseval_shader(struct cso_context *ctx)
    ctx->tesseval_shader_saved = NULL;
 }
 
-/* clip state */
-
-static inline void
-clip_state_cpy(struct pipe_clip_state *dst,
-               const struct pipe_clip_state *src)
-{
-   memcpy(dst->ucp, src->ucp, sizeof(dst->ucp));
-}
-
-static inline int
-clip_state_cmp(const struct pipe_clip_state *a,
-               const struct pipe_clip_state *b)
-{
-   return memcmp(a->ucp, b->ucp, sizeof(a->ucp));
-}
-
-void
-cso_set_clip(struct cso_context *ctx,
-             const struct pipe_clip_state *clip)
-{
-   if (clip_state_cmp(&ctx->clip, clip)) {
-      clip_state_cpy(&ctx->clip, clip);
-      ctx->pipe->set_clip_state(ctx->pipe, clip);
-   }
-}
-
-void
-cso_save_clip(struct cso_context *ctx)
-{
-   clip_state_cpy(&ctx->clip_saved, &ctx->clip);
-}
-
-void
-cso_restore_clip(struct cso_context *ctx)
-{
-   if (clip_state_cmp(&ctx->clip, &ctx->clip_saved)) {
-      clip_state_cpy(&ctx->clip, &ctx->clip_saved);
-      ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip_saved);
-   }
-}
-
 enum pipe_error
 cso_set_vertex_elements(struct cso_context *ctx,
                         unsigned count,
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index c9a422698a2..3bee4298802 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -188,19 +188,6 @@ void cso_save_render_condition(struct cso_context *cso);
 void cso_restore_render_condition(struct cso_context *cso);
 
 
-/* clip state */
-
-void
-cso_set_clip(struct cso_context *cso,
-             const struct pipe_clip_state *clip);
-
-void
-cso_save_clip(struct cso_context *cso);
-
-void
-cso_restore_clip(struct cso_context *cso);
-
-
 /* sampler view state */
 
 void
diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c
index f82c1332afc..506a770499f 100644
--- a/src/mesa/state_tracker/st_atom_clip.c
+++ b/src/mesa/state_tracker/st_atom_clip.c
@@ -59,8 +59,11 @@ static void update_clip( struct st_context *st )
    memcpy(clip.ucp,
           use_eye ? ctx->Transform.EyeUserPlane
                   : ctx->Transform._ClipUserPlane, sizeof(clip.ucp));
-   st->state.clip = clip;
-   cso_set_clip(st->cso_context, &clip);
+
+   if (memcmp(&st->state.clip, &clip, sizeof(clip)) != 0) {
+      st->state.clip = clip;
+      st->pipe->set_clip_state(st->pipe, &clip);
+   }
 }
 
 

From 5ef1782b9ff8aa06f5b7fdbc7ade3e80131d1fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 16:54:44 +0200
Subject: [PATCH 0474/1208] st/mesa: use cso_set_samplers

---
 src/mesa/state_tracker/st_atom_sampler.c  | 9 ++++-----
 src/mesa/state_tracker/st_cb_drawpixels.c | 9 ++++-----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index b68eb16d7be..4ab82bc9acb 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -245,6 +245,7 @@ update_shader_samplers(struct st_context *st,
    GLuint unit;
    GLbitfield samplers_used;
    const GLuint old_max = *num_samplers;
+   const struct pipe_sampler_state *states[PIPE_MAX_SAMPLERS];
 
    samplers_used = prog->SamplersUsed;
 
@@ -261,13 +262,11 @@ update_shader_samplers(struct st_context *st,
          const GLuint texUnit = prog->SamplerUnits[unit];
 
          convert_sampler(st, sampler, texUnit);
-
+         states[unit] = sampler;
          *num_samplers = unit + 1;
-
-         cso_single_sampler(st->cso_context, shader_stage, unit, sampler);
       }
       else if (samplers_used != 0 || unit < old_max) {
-         cso_single_sampler(st->cso_context, shader_stage, unit, NULL);
+         states[unit] = NULL;
       }
       else {
          /* if we've reset all the old samplers and we have no more new ones */
@@ -275,7 +274,7 @@ update_shader_samplers(struct st_context *st,
       }
    }
 
-   cso_single_sampler_done(st->cso_context, shader_stage);
+   cso_set_samplers(st->cso_context, shader_stage, *num_samplers, states);
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index f67f00d504f..b372697026b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -757,6 +757,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* texture sampling state: */
    {
       struct pipe_sampler_state sampler;
+      const struct pipe_sampler_state *states[2] = {&sampler, &sampler};
+
       memset(&sampler, 0, sizeof(sampler));
       sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
       sampler.wrap_t = PIPE_TEX_WRAP_CLAMP;
@@ -766,11 +768,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
       sampler.normalized_coords = normalized;
 
-      cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 0, &sampler);
-      if (num_sampler_view > 1) {
-         cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 1, &sampler);
-      }
-      cso_single_sampler_done(cso, PIPE_SHADER_FRAGMENT);
+      cso_set_samplers(cso, PIPE_SHADER_FRAGMENT,
+                       num_sampler_view > 1 ? 2 : 1, states);
    }
 
    /* viewport state: viewport matching window dims */

From 85f5722f70075e7a93b7a6cc41abee1bc493f4e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 12 Jul 2015 15:52:44 +0200
Subject: [PATCH 0475/1208] gallium/util: use cso_set_samplers

---
 src/gallium/auxiliary/postprocess/pp_colors.c |  4 ++--
 src/gallium/auxiliary/postprocess/pp_mlaa.c   | 23 +++++++++++--------
 src/gallium/auxiliary/util/u_blit.c           |  6 +++--
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/postprocess/pp_colors.c b/src/gallium/auxiliary/postprocess/pp_colors.c
index 247e4df72a4..e6ea0102eac 100644
--- a/src/gallium/auxiliary/postprocess/pp_colors.c
+++ b/src/gallium/auxiliary/postprocess/pp_colors.c
@@ -37,6 +37,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in,
 {
 
    struct pp_program *p = ppq->p;
+   const struct pipe_sampler_state *samplers[] = {&p->sampler_point};
 
    pp_filter_setup_in(p, in);
    pp_filter_setup_out(p, out);
@@ -44,8 +45,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in,
    pp_filter_set_fb(p);
    pp_filter_misc_state(p);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]);
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.c b/src/gallium/auxiliary/postprocess/pp_mlaa.c
index 147d14de95d..024a24895c8 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -141,8 +141,10 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    p->pipe->clear(p->pipe, PIPE_CLEAR_STENCIL | PIPE_CLEAR_COLOR0,
                   &p->clear_color, 0, 0);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] = {&p->sampler_point};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
+   }
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]);    /* offsetvs */
@@ -168,10 +170,11 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
 
    pp_filter_set_clear_fb(p);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 2, &p->sampler);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] =
+         {&p->sampler_point, &p->sampler_point, &p->sampler};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 3, samplers);
+   }
 
    arr[0] = p->view;
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 3, arr);
@@ -199,9 +202,11 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    u_sampler_view_default_template(&v_tmp, in, in->format);
    arr[0] = p->pipe->create_sampler_view(p->pipe, in, &v_tmp);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] =
+         {&p->sampler_point, &p->sampler_point};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 2, samplers);
+   }
 
    arr[1] = p->view;
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 2, arr);
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 70782a1aea9..9737c940936 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -572,8 +572,10 @@ util_blit_pixels_tex(struct blit_state *ctx,
    ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
-   cso_single_sampler(ctx->cso, PIPE_SHADER_FRAGMENT, 0, &ctx->sampler);
-   cso_single_sampler_done(ctx->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] = {&ctx->sampler};
+      cso_set_samplers(ctx->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
+   }
 
    /* viewport */
    ctx->viewport.scale[0] = 0.5f * dst->width;

From 68dcbf4c4679ad4e62d55e4f2632311aeef38eed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 12 Jul 2015 16:12:59 +0200
Subject: [PATCH 0476/1208] gallium/tests: use cso_set_samplers

---
 src/gallium/tests/trivial/quad-tex.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index daae577ec4b..c019c7bb0a3 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -297,6 +297,8 @@ static void close_prog(struct program *p)
 
 static void draw(struct program *p)
 {
+	const struct pipe_sampler_state *samplers[] = {&p->sampler};
+
 	/* set the render target */
 	cso_set_framebuffer(p->cso, &p->framebuffer);
 
@@ -310,8 +312,7 @@ static void draw(struct program *p)
 	cso_set_viewport(p->cso, &p->viewport);
 
 	/* sampler */
-	cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler);
-	cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+	cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
 
 	/* texture sampler view */
 	cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);

From 2369dc83826c7b1f413ff78f55e460c38d7a0660 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Jul 2015 16:57:58 +0200
Subject: [PATCH 0477/1208] cso: eliminate some sampler function wrappers

---
 src/gallium/auxiliary/cso_cache/cso_context.c | 39 +++++--------------
 src/gallium/auxiliary/cso_cache/cso_context.h |  6 +--
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 9ee5bdcc942..00686d2af41 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -1069,11 +1069,9 @@ unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx)
 
 /**************** fragment/vertex sampler view state *************************/
 
-static enum pipe_error
-single_sampler(struct cso_context *ctx,
-               struct sampler_info *info,
-               unsigned idx,
-               const struct pipe_sampler_state *templ)
+enum pipe_error
+cso_single_sampler(struct cso_context *ctx, unsigned shader_stage,
+                   unsigned idx, const struct pipe_sampler_state *templ)
 {
    void *handle = NULL;
 
@@ -1109,24 +1107,13 @@ single_sampler(struct cso_context *ctx,
       }
    }
 
-   info->samplers[idx] = handle;
-
+   ctx->samplers[shader_stage].samplers[idx] = handle;
    return PIPE_OK;
 }
 
-enum pipe_error
-cso_single_sampler(struct cso_context *ctx,
-                   unsigned shader_stage,
-                   unsigned idx,
-                   const struct pipe_sampler_state *templ)
-{
-   return single_sampler(ctx, &ctx->samplers[shader_stage], idx, templ);
-}
 
-
-
-static void
-single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
+void
+cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
 {
    struct sampler_info *info = &ctx->samplers[shader_stage];
    unsigned i;
@@ -1142,12 +1129,6 @@ single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
                                   info->samplers);
 }
 
-void
-cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
-{
-   single_sampler_done(ctx, shader_stage);
-}
-
 
 /*
  * If the function encouters any errors it will return the
@@ -1168,18 +1149,18 @@ cso_set_samplers(struct cso_context *ctx,
     */
 
    for (i = 0; i < nr; i++) {
-      temp = single_sampler(ctx, info, i, templates[i]);
+      temp = cso_single_sampler(ctx, shader_stage, i, templates[i]);
       if (temp != PIPE_OK)
          error = temp;
    }
 
    for ( ; i < info->nr_samplers; i++) {
-      temp = single_sampler(ctx, info, i, NULL);
+      temp = cso_single_sampler(ctx, shader_stage, i, NULL);
       if (temp != PIPE_OK)
          error = temp;
    }
 
-   single_sampler_done(ctx, shader_stage);
+   cso_single_sampler_done(ctx, shader_stage);
 
    return error;
 }
@@ -1203,7 +1184,7 @@ cso_restore_fragment_samplers(struct cso_context *ctx)
    info->nr_samplers = ctx->nr_fragment_samplers_saved;
    memcpy(info->samplers, ctx->fragment_samplers_saved,
           sizeof(info->samplers));
-   single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
+   cso_single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 3bee4298802..f0a27390d17 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -81,10 +81,8 @@ cso_restore_fragment_samplers(struct cso_context *cso);
  * samplers one at a time:
  */
 enum pipe_error
-cso_single_sampler(struct cso_context *cso,
-                   unsigned shader_stage,
-                   unsigned count,
-                   const struct pipe_sampler_state *states);
+cso_single_sampler(struct cso_context *cso, unsigned shader_stage,
+                   unsigned idx, const struct pipe_sampler_state *states);
 
 void
 cso_single_sampler_done(struct cso_context *cso, unsigned shader_stage);

From e4d738f6c6b98a78830c10ab7b89704d847637a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 10 Jul 2015 23:29:04 +0200
Subject: [PATCH 0478/1208] radeonsi: remove redundant parameter in
 si_shader_binary_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 2 +-
 src/gallium/drivers/radeonsi/si_shader.c  | 8 +++-----
 src/gallium/drivers/radeonsi/si_shader.h  | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 89bef2e7afd..0361c99b549 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -144,7 +144,7 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader, &program->shader.binary);
+	si_shader_binary_read(sctx->screen, &program->shader);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 75a29aeebc9..b988f6d2c10 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2686,11 +2686,9 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	}
 }
 
-int si_shader_binary_read(struct si_screen *sscreen,
-			struct si_shader *shader,
-			const struct radeon_shader_binary *binary)
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 {
-
+	const struct radeon_shader_binary *binary = &shader->binary;
 	unsigned i;
 	unsigned code_size;
 	unsigned char *ptr;
@@ -2750,7 +2748,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 	if (r) {
 		return r;
 	}
-	r = si_shader_binary_read(sscreen, shader, &shader->binary);
+	r = si_shader_binary_read(sscreen, shader);
 
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 8d309b4eb37..1e8b52b8e58 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -191,8 +191,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-		const struct radeon_shader_binary *binary);
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From 50a957c5de842b18e10c361f7b0310aa46bb483f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 10 Jul 2015 23:35:55 +0200
Subject: [PATCH 0479/1208] radeonsi: upload shader rodata after updating
 scratch relocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: 10.5 10.6 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 49 ++++++++++---------
 src/gallium/drivers/radeonsi/si_shader.h      |  1 +
 .../drivers/radeonsi/si_state_shaders.c       |  8 +--
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b988f6d2c10..955e780faf2 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2686,16 +2686,41 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	}
 }
 
+int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
+{
+	const struct radeon_shader_binary *binary = &shader->binary;
+	unsigned code_size = binary->code_size + binary->rodata_size;
+	unsigned char *ptr;
+
+	r600_resource_reference(&shader->bo, NULL);
+	shader->bo = si_resource_create_custom(&sscreen->b.b,
+					       PIPE_USAGE_IMMUTABLE,
+					       code_size);
+	if (!shader->bo)
+		return -ENOMEM;
+
+	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
+					PIPE_TRANSFER_READ_WRITE);
+	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
+	if (binary->rodata_size > 0) {
+		ptr += binary->code_size;
+		util_memcpy_cpu_to_le32(ptr, binary->rodata,
+					binary->rodata_size);
+	}
+
+	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	return 0;
+}
+
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 	unsigned i;
-	unsigned code_size;
-	unsigned char *ptr;
 	bool dump  = r600_can_dump_shader(&sscreen->b,
 		shader->selector ? shader->selector->tokens : NULL);
 
 	si_shader_binary_read_config(sscreen, shader, 0);
+	si_shader_binary_upload(sscreen, shader);
 
 	if (dump) {
 		if (!binary->disassembled) {
@@ -2713,26 +2738,6 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 			shader->num_sgprs, shader->num_vgprs, binary->code_size,
 			shader->lds_size, shader->scratch_bytes_per_wave);
 	}
-
-	/* copy new shader */
-	code_size = binary->code_size + binary->rodata_size;
-	r600_resource_reference(&shader->bo, NULL);
-	shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE,
-					       code_size);
-	if (shader->bo == NULL) {
-		return -ENOMEM;
-	}
-
-
-	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE);
-	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
-	if (binary->rodata_size > 0) {
-		ptr += binary->code_size;
-		util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
-	}
-
-	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
-
 	return 0;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 1e8b52b8e58..c12782f288c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -191,6 +191,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
+int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 78be4d9baf2..b153228fb2c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -749,7 +749,6 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx,
 {
 	struct si_shader *shader;
 	uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
-	unsigned char *ptr;
 
 	if (!sel)
 		return 0;
@@ -770,12 +769,7 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx,
 	si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
 
 	/* Replace the shader bo with a new bo that has the relocs applied. */
-	r600_resource_reference(&shader->bo, NULL);
-	shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE,
-					       shader->binary.code_size);
-	ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
-	util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size);
-	sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	si_shader_binary_upload(sctx->screen, shader);
 
 	/* Update the shader state to use the new shader bo. */
 	si_shader_init_pm4_state(shader);

From 46b2b3bda8d962fce02838e09c742ac06fbec45f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 13:17:14 +0200
Subject: [PATCH 0480/1208] radeonsi: don't change pipe_resource in
 resource_copy_region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copied from r600g. pipe_resource can be shared by multiple threads, so we
shouldn't change it.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.h |   1 -
 src/gallium/drivers/radeonsi/si_blit.c        | 201 +++++-------------
 src/gallium/drivers/radeonsi/si_state.c       |  62 ++++--
 src/gallium/drivers/radeonsi/si_state.h       |   6 +
 4 files changed, 110 insertions(+), 160 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index f6d0380c35f..6f736eb17a5 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -214,7 +214,6 @@ struct r600_texture {
 	float				depth_clear_value;
 
 	bool				non_disp_tiling; /* R600-Cayman only */
-	unsigned			mipmap_shift;
 };
 
 struct r600_surface {
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 6c7b383a4a3..f6db3f54192 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -455,89 +455,6 @@ struct texture_orig_info {
 	unsigned npix0_y;
 };
 
-static void si_compressed_to_blittable(struct pipe_resource *tex,
-				       unsigned level,
-				       struct texture_orig_info *orig)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-	unsigned pixsize = util_format_get_blocksize(rtex->resource.b.b.format);
-	int new_format;
-	int new_height, new_width;
-
-	orig->format = tex->format;
-	orig->width0 = tex->width0;
-	orig->height0 = tex->height0;
-	orig->npix0_x = rtex->surface.level[0].npix_x;
-	orig->npix0_y = rtex->surface.level[0].npix_y;
-	orig->npix_x = rtex->surface.level[level].npix_x;
-	orig->npix_y = rtex->surface.level[level].npix_y;
-
-	if (pixsize == 8)
-		new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
-	else
-		new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
-
-	new_width = util_format_get_nblocksx(tex->format, orig->width0);
-	new_height = util_format_get_nblocksy(tex->format, orig->height0);
-
-	tex->width0 = new_width;
-	tex->height0 = new_height;
-	tex->format = new_format;
-	rtex->surface.level[0].npix_x = util_format_get_nblocksx(orig->format, orig->npix0_x);
-	rtex->surface.level[0].npix_y = util_format_get_nblocksy(orig->format, orig->npix0_y);
-	rtex->surface.level[level].npix_x = util_format_get_nblocksx(orig->format, orig->npix_x);
-	rtex->surface.level[level].npix_y = util_format_get_nblocksy(orig->format, orig->npix_y);
-
-	/* By dividing the dimensions by 4, we effectively decrement
-	 * last_level by 2, therefore the last 2 mipmap levels disappear and
-	 * aren't blittable. Note that the last 3 mipmap levels (4x4, 2x2,
-	 * 1x1) have equal slice sizes, which is an important assumption
-	 * for this to work.
-	 *
-	 * In order to make the last 2 mipmap levels blittable, we have to
-	 * add the slice size of the last mipmap level to the texture
-	 * address, so that even though the hw thinks it reads last_level-2,
-	 * it will actually read last_level-1, and if we add the slice size*2,
-	 * it will read last_level. That's how this workaround works.
-	 */
-	if (level > rtex->resource.b.b.last_level-2)
-		rtex->mipmap_shift = level - (rtex->resource.b.b.last_level-2);
-}
-
-static void si_change_format(struct pipe_resource *tex,
-			     unsigned level,
-			     struct texture_orig_info *orig,
-			     enum pipe_format format)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-
-	orig->format = tex->format;
-	orig->width0 = tex->width0;
-	orig->height0 = tex->height0;
-	orig->npix0_x = rtex->surface.level[0].npix_x;
-	orig->npix0_y = rtex->surface.level[0].npix_y;
-	orig->npix_x = rtex->surface.level[level].npix_x;
-	orig->npix_y = rtex->surface.level[level].npix_y;
-
-	tex->format = format;
-}
-
-static void si_reset_blittable_to_orig(struct pipe_resource *tex,
-				       unsigned level,
-				       struct texture_orig_info *orig)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-
-	tex->format = orig->format;
-	tex->width0 = orig->width0;
-	tex->height0 = orig->height0;
-	rtex->surface.level[0].npix_x = orig->npix0_x;
-	rtex->surface.level[0].npix_y = orig->npix0_y;
-	rtex->surface.level[level].npix_x = orig->npix_x;
-	rtex->surface.level[level].npix_y = orig->npix_y;
-	rtex->mipmap_shift = 0;
-}
-
 void si_resource_copy_region(struct pipe_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
@@ -547,114 +464,114 @@ void si_resource_copy_region(struct pipe_context *ctx,
 			     const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct r600_texture *rdst = (struct r600_texture*)dst;
 	struct pipe_surface *dst_view, dst_templ;
 	struct pipe_sampler_view src_templ, *src_view;
-	struct texture_orig_info orig_info[2];
+	unsigned dst_width, dst_height, src_width0, src_height0;
+	unsigned src_force_level = 0;
 	struct pipe_box sbox, dstbox;
-	boolean restore_orig[2];
 
-	/* Fallback for buffers. */
+	/* Handle buffers first. */
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, false);
 		return;
 	}
 
-	memset(orig_info, 0, sizeof(orig_info));
+	assert(u_max_sample(dst) == u_max_sample(src));
 
 	/* The driver doesn't decompress resources automatically while
 	 * u_blitter is rendering. */
 	si_decompress_subresource(ctx, src, src_level,
 				  src_box->z, src_box->z + src_box->depth - 1);
 
-	restore_orig[0] = restore_orig[1] = FALSE;
+	dst_width = u_minify(dst->width0, dst_level);
+	dst_height = u_minify(dst->height0, dst_level);
+	src_width0 = src->width0;
+	src_height0 = src->height0;
+
+	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
+	util_blitter_default_src_texture(&src_templ, src, src_level);
 
 	if (util_format_is_compressed(src->format) &&
 	    util_format_is_compressed(dst->format)) {
-		si_compressed_to_blittable(src, src_level, &orig_info[0]);
-		restore_orig[0] = TRUE;
-		sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x);
-		sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y);
+		unsigned blocksize = util_format_get_blocksize(src->format);
+
+		if (blocksize == 8)
+			src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+		else
+			src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+		dst_templ.format = src_templ.format;
+
+		dst_width = util_format_get_nblocksx(dst->format, dst_width);
+		dst_height = util_format_get_nblocksy(dst->format, dst_height);
+		src_width0 = util_format_get_nblocksx(src->format, src_width0);
+		src_height0 = util_format_get_nblocksy(src->format, src_height0);
+
+		dstx = util_format_get_nblocksx(dst->format, dstx);
+		dsty = util_format_get_nblocksy(dst->format, dsty);
+
+		sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+		sbox.y = util_format_get_nblocksy(src->format, src_box->y);
 		sbox.z = src_box->z;
-		sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width);
-		sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height);
+		sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+		sbox.height = util_format_get_nblocksy(src->format, src_box->height);
 		sbox.depth = src_box->depth;
 		src_box = &sbox;
 
-		si_compressed_to_blittable(dst, dst_level, &orig_info[1]);
-		restore_orig[1] = TRUE;
-		/* translate the dst box as well */
-		dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
-		dsty = util_format_get_nblocksy(orig_info[1].format, dsty);
+		src_force_level = src_level;
 	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
 		if (util_format_is_subsampled_422(src->format)) {
-			/* XXX untested */
-			si_change_format(src, src_level, &orig_info[0],
-					 PIPE_FORMAT_R8G8B8A8_UINT);
-			si_change_format(dst, dst_level, &orig_info[1],
-					 PIPE_FORMAT_R8G8B8A8_UINT);
+			src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+			dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+
+			dst_width = util_format_get_nblocksx(dst->format, dst_width);
+			src_width0 = util_format_get_nblocksx(src->format, src_width0);
+
+			dstx = util_format_get_nblocksx(dst->format, dstx);
 
 			sbox = *src_box;
-			sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x);
-			sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width);
+			sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+			sbox.width = util_format_get_nblocksx(src->format, src_box->width);
 			src_box = &sbox;
-			dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
-
-			restore_orig[0] = TRUE;
-			restore_orig[1] = TRUE;
 		} else {
 			unsigned blocksize = util_format_get_blocksize(src->format);
 
 			switch (blocksize) {
 			case 1:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8_UNORM;
 				break;
 			case 2:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8G8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8G8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8G8_UNORM;
 				break;
 			case 4:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8G8B8A8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8G8B8A8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
 				break;
 			case 8:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R16G16B16A16_UINT);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R16G16B16A16_UINT);
+				dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+				src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
 				break;
 			case 16:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R32G32B32A32_UINT);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R32G32B32A32_UINT);
+				dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+				src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
 				break;
 			default:
 				fprintf(stderr, "Unhandled format %s with blocksize %u\n",
 					util_format_short_name(src->format), blocksize);
 				assert(0);
 			}
-			restore_orig[0] = TRUE;
-			restore_orig[1] = TRUE;
 		}
 	}
 
 	/* Initialize the surface. */
-	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
 	dst_view = r600_create_surface_custom(ctx, dst, &dst_templ,
-					      rdst->surface.level[dst_level].npix_x,
-					      rdst->surface.level[dst_level].npix_y);
+					      dst_width, dst_height);
 
 	/* Initialize the sampler view. */
-	util_blitter_default_src_texture(&src_templ, src, src_level);
-	src_view = ctx->create_sampler_view(ctx, src, &src_templ);
+	src_view = si_create_sampler_view_custom(ctx, src, &src_templ,
+						 src_width0, src_height0,
+						 src_force_level);
 
 	u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height),
 		 abs(src_box->depth), &dstbox);
@@ -662,18 +579,12 @@ void si_resource_copy_region(struct pipe_context *ctx,
 	/* Copy. */
 	si_blitter_begin(ctx, SI_COPY);
 	util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
-				  src_view, src_box, src->width0, src->height0,
+				  src_view, src_box, src_width0, src_height0,
 				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
 	si_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
 	pipe_sampler_view_reference(&src_view, NULL);
-
-	if (restore_orig[0])
-		si_reset_blittable_to_orig(src, src_level, &orig_info[0]);
-
-	if (restore_orig[1])
-		si_reset_blittable_to_orig(dst, dst_level, &orig_info[1]);
 }
 
 /* For MSAA integer resolving to work, we change the format to NORM using this function. */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 0dd08a248f4..1d4a4e8dc3e 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2272,15 +2272,28 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * Samplers
  */
 
-static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
-							struct pipe_resource *texture,
-							const struct pipe_sampler_view *state)
+/**
+ * Create a sampler view.
+ *
+ * @param ctx		context
+ * @param texture	texture
+ * @param state		sampler view template
+ * @param width0	width0 override (for compressed textures as int)
+ * @param height0	height0 override (for compressed textures as int)
+ * @param force_level   set the base address to the level (for compressed textures)
+ */
+struct pipe_sampler_view *
+si_create_sampler_view_custom(struct pipe_context *ctx,
+			      struct pipe_resource *texture,
+			      const struct pipe_sampler_view *state,
+			      unsigned width0, unsigned height0,
+			      unsigned force_level)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
 	struct r600_texture *tmp = (struct r600_texture*)texture;
 	const struct util_format_description *desc;
-	unsigned format, num_format;
+	unsigned format, num_format, base_level, first_level, last_level;
 	uint32_t pitch = 0;
 	unsigned char state_swizzle[4], swizzle[4];
 	unsigned height, depth, width;
@@ -2453,13 +2466,25 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 		format = 0;
 	}
 
-	/* not supported any more */
-	//endian = si_colorformat_endian_swap(format);
+	base_level = 0;
+	first_level = state->u.tex.first_level;
+	last_level = state->u.tex.last_level;
+	width = width0;
+	height = height0;
+	depth = texture->depth0;
 
-	width = surflevel[0].npix_x;
-	height = surflevel[0].npix_y;
-	depth = surflevel[0].npix_z;
-	pitch = surflevel[0].nblk_x * util_format_get_blockwidth(pipe_format);
+	if (force_level) {
+		assert(force_level == first_level &&
+		       force_level == last_level);
+		base_level = force_level;
+		first_level = 0;
+		last_level = 0;
+		width = u_minify(width, force_level);
+		height = u_minify(height, force_level);
+		depth = u_minify(depth, force_level);
+	}
+
+	pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format);
 
 	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
@@ -2469,8 +2494,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
-	va = tmp->resource.gpu_address + surflevel[0].offset;
-	va += tmp->mipmap_shift * surflevel[texture->last_level].slice_size * tmp->surface.array_size;
+	va = tmp->resource.gpu_address + surflevel[base_level].offset;
 
 	view->state[0] = va >> 8;
 	view->state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) |
@@ -2483,10 +2507,10 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 			  S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
 			  S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
 			  S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ?
-						      0 : state->u.tex.first_level - tmp->mipmap_shift) |
+						      0 : first_level) |
 			  S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ?
 						      util_logbase2(texture->nr_samples) :
-						      state->u.tex.last_level - tmp->mipmap_shift) |
+						      last_level) |
 			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, 0, false)) |
 			  S_008F1C_POW2_PAD(texture->last_level > 0) |
 			  S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples)));
@@ -2539,6 +2563,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	return &view->base;
 }
 
+static struct pipe_sampler_view *
+si_create_sampler_view(struct pipe_context *ctx,
+		       struct pipe_resource *texture,
+		       const struct pipe_sampler_view *state)
+{
+	return si_create_sampler_view_custom(ctx, texture, state,
+					     texture ? texture->width0 : 0,
+					     texture ? texture->height0 : 0, 0);
+}
+
 static void si_sampler_view_destroy(struct pipe_context *ctx,
 				    struct pipe_sampler_view *state)
 {
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index d1f2dff2c3f..634f947792f 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -264,6 +264,12 @@ unsigned cik_tile_split(unsigned tile_split);
 unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
+struct pipe_sampler_view *
+si_create_sampler_view_custom(struct pipe_context *ctx,
+			      struct pipe_resource *texture,
+			      const struct pipe_sampler_view *state,
+			      unsigned width0, unsigned height0,
+			      unsigned force_level);
 
 /* si_state_shader.c */
 void si_update_shaders(struct si_context *sctx);

From 0aa2446e2c18e4a54ccf8555a8ff3426e4eb3ded Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 14:42:38 +0200
Subject: [PATCH 0481/1208] radeonsi: remove switch statement in
 si_create_context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

and make si_init_config static

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c  | 13 ++-----------
 src/gallium/drivers/radeonsi/si_state.c |  6 +++++-
 src/gallium/drivers/radeonsi/si_state.h |  1 -
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2b6a6ff5522..0878b8887c9 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -128,17 +128,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	sctx->atoms.s.streamout_begin = &sctx->b.streamout.begin_atom;
 	sctx->atoms.s.streamout_enable = &sctx->b.streamout.enable_atom;
 
-	switch (sctx->b.chip_class) {
-	case SI:
-	case CIK:
-		si_init_state_functions(sctx);
-		si_init_shader_functions(sctx);
-		si_init_config(sctx);
-		break;
-	default:
-		R600_ERR("Unsupported chip class %d.\n", sctx->b.chip_class);
-		goto fail;
-	}
+	si_init_state_functions(sctx);
+	si_init_shader_functions(sctx);
 
 	if (sscreen->b.debug_flags & DBG_FORCE_DMA)
 		sctx->b.b.resource_copy_region = sctx->b.dma_copy;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1d4a4e8dc3e..316c689357b 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2920,6 +2920,8 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
 	si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo);
 }
 
+static void si_init_config(struct si_context *sctx);
+
 void si_init_state_functions(struct si_context *sctx)
 {
 	si_init_atom(&sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state, 0);
@@ -2981,6 +2983,8 @@ void si_init_state_functions(struct si_context *sctx)
 	} else {
 		sctx->b.dma_copy = si_dma_copy;
 	}
+
+	si_init_config(sctx);
 }
 
 static void
@@ -3087,7 +3091,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 		       INSTANCE_BROADCAST_WRITES);
 }
 
-void si_init_config(struct si_context *sctx)
+static void si_init_config(struct si_context *sctx)
 {
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 634f947792f..0c1fdb41408 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -256,7 +256,6 @@ boolean si_is_format_supported(struct pipe_screen *screen,
                                unsigned sample_count,
                                unsigned usage);
 void si_init_state_functions(struct si_context *sctx);
-void si_init_config(struct si_context *sctx);
 unsigned cik_bank_wh(unsigned bankwh);
 unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode);
 unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect);

From 5e3974338ed7ea49a41405f8c2e4bcd5fd1f5c80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 14:54:50 +0200
Subject: [PATCH 0482/1208] gallium/radeon: remove buffer_unmap calls that can
 potentially decrease perf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

buffer_unmap is currently a no-op on radeon and done correctly on amdgpu.
I plan to fix it for radeon, but before that, all occurences of buffer_unmap
that can negatively affect performance in the future must be removed.

There are 2 reasons for removing buffer_unmap calls:
- There is a likelihood that buffer_map will be called again, so we don't
  want to unmap yet.
- The buffer is being released, which automatically unmaps it.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/r300/r300_query.c         | 2 --
 src/gallium/drivers/r300/r300_transfer.c      | 4 ----
 src/gallium/drivers/r600/r600_state_common.c  | 1 -
 src/gallium/drivers/radeon/r600_pipe_common.c | 4 +---
 src/gallium/drivers/radeon/r600_query.c       | 5 -----
 src/gallium/drivers/radeon/r600_texture.c     | 9 ---------
 src/gallium/drivers/radeonsi/si_compute.c     | 2 --
 7 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 01b83b87fcf..6a4cd71b169 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -168,8 +168,6 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
         map++;
     }
 
-    r300->rws->buffer_unmap(q->cs_buf);
-
     if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
         vresult->b = temp != 0;
     } else {
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index bc0835d1875..4fa360a042d 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -251,16 +251,12 @@ void r300_texture_transfer_unmap(struct pipe_context *ctx,
     struct r300_resource *tex = r300_resource(transfer->resource);
 
     if (trans->linear_texture) {
-        rws->buffer_unmap(trans->linear_texture->cs_buf);
-
         if (transfer->usage & PIPE_TRANSFER_WRITE) {
             r300_copy_into_tiled_texture(ctx, trans);
         }
 
         pipe_resource_reference(
             (struct pipe_resource**)&trans->linear_texture, NULL);
-    } else {
-        rws->buffer_unmap(tex->cs_buf);
     }
     FREE(transfer);
 }
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 26184e0b517..0c78b50aa0c 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1409,7 +1409,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					data += info.indirect_offset / sizeof(unsigned);
 					start = data[2] * ib.index_size;
 					count = data[0];
-					rctx->b.ws->buffer_unmap(indirect_resource->cs_buf);
 				}
 				else {
 					start = 0;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index bcbf0b95373..94a7535f531 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -939,10 +939,8 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 	pipe_mutex_destroy(rscreen->aux_context_lock);
 	rscreen->aux_context->destroy(rscreen->aux_context);
 
-	if (rscreen->trace_bo) {
-		rscreen->ws->buffer_unmap(rscreen->trace_bo->cs_buf);
+	if (rscreen->trace_bo)
 		pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL);
-	}
 
 	rscreen->ws->destroy(rscreen->ws);
 	FREE(rscreen);
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 71f4a1522f9..a1d8241c513 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -118,7 +118,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 			}
 			results += 4 * ctx->max_db;
 		}
-		ctx->ws->buffer_unmap(buf->cs_buf);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 	case PIPE_QUERY_TIMESTAMP:
@@ -130,7 +129,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
 		memset(results, 0, buf_size);
-		ctx->ws->buffer_unmap(buf->cs_buf);
 		break;
 	default:
 		assert(0);
@@ -751,7 +749,6 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
 		assert(0);
 	}
 
-	ctx->ws->buffer_unmap(qbuf->buf->cs_buf);
 	return TRUE;
 }
 
@@ -919,7 +916,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
 	if (results) {
 		memset(results, 0, ctx->max_db * 4 * 4);
-		ctx->ws->buffer_unmap(buffer->cs_buf);
 
 		/* emit EVENT_WRITE for ZPASS_DONE */
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -937,7 +933,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 				if (results[i*4 + 1])
 					mask |= (1<<i);
 			}
-			ctx->ws->buffer_unmap(buffer->cs_buf);
 		}
 	}
 
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index dc510c99749..e63c37e890c 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -1059,18 +1059,9 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx,
 					struct pipe_transfer* transfer)
 {
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
-	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
-	struct radeon_winsys_cs_handle *buf;
 	struct pipe_resource *texture = transfer->resource;
 	struct r600_texture *rtex = (struct r600_texture*)texture;
 
-	if (rtransfer->staging) {
-		buf = rtransfer->staging->cs_buf;
-	} else {
-		buf = r600_resource(transfer->resource)->cs_buf;
-	}
-	rctx->ws->buffer_unmap(buf);
-
 	if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) {
 		if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) {
 			ctx->resource_copy_region(ctx, texture, transfer->level,
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 0361c99b549..5e4225bd04c 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -309,8 +309,6 @@ static void si_launch_grid(
 			kernel_args[i]);
 	}
 
-	sctx->b.ws->buffer_unmap(input_buffer->cs_buf);
-
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 

From 6b37643b820b32c3e15e4a8661448a11af8321dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 15:27:34 +0200
Subject: [PATCH 0483/1208] winsys/radeon: implement buffer_unmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This has been a no-op due to performance concerns. From now on, drivers
should decide when they don't want to unmap, not the winsys.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 29 +++++++++++++++----
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h |  1 +
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index e4009be816c..ac37b24b6bb 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -351,14 +351,11 @@ void *radeon_bo_do_map(struct radeon_bo *bo)
     if (bo->user_ptr)
         return bo->user_ptr;
 
-    /* Return the pointer if it's already mapped. */
-    if (bo->ptr)
-        return bo->ptr;
-
     /* Map the buffer. */
     pipe_mutex_lock(bo->map_mutex);
-    /* Return the pointer if it's already mapped (in case of a race). */
+    /* Return the pointer if it's already mapped. */
     if (bo->ptr) {
+        bo->map_count++;
         pipe_mutex_unlock(bo->map_mutex);
         return bo->ptr;
     }
@@ -383,6 +380,7 @@ void *radeon_bo_do_map(struct radeon_bo *bo)
         return NULL;
     }
     bo->ptr = ptr;
+    bo->map_count = 1;
     pipe_mutex_unlock(bo->map_mutex);
 
     return bo->ptr;
@@ -467,7 +465,26 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
 
 static void radeon_bo_unmap(struct radeon_winsys_cs_handle *_buf)
 {
-    /* NOP */
+    struct radeon_bo *bo = (struct radeon_bo*)_buf;
+
+    if (bo->user_ptr)
+        return;
+
+    pipe_mutex_lock(bo->map_mutex);
+    if (!bo->ptr) {
+        pipe_mutex_unlock(bo->map_mutex);
+        return; /* it's not been mapped */
+    }
+
+    assert(bo->map_count);
+    if (--bo->map_count) {
+        pipe_mutex_unlock(bo->map_mutex);
+        return; /* it's been mapped multiple times */
+    }
+
+    os_munmap(bo->ptr, bo->base.size);
+    bo->ptr = NULL;
+    pipe_mutex_unlock(bo->map_mutex);
 }
 
 static void radeon_bo_get_base_buffer(struct pb_buffer *buf,
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index 0255313af7a..f8f50cc5d5b 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -54,6 +54,7 @@ struct radeon_bo {
 
     void *ptr;
     pipe_mutex map_mutex;
+    unsigned map_count;
 
     uint32_t handle;
     uint32_t flink_name;

From 5ead448719f39d27bfbf4cabf138324dfee34a4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 16 Jun 2015 22:13:34 +0200
Subject: [PATCH 0484/1208] drirc: drop support for Heaven 3.0, fixes
 tessellation in 4.0

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/common/drirc | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index 145e707a64c..97d961b6597 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -4,24 +4,15 @@
 Application bugs worked around in this file:
 ============================================
 
+* Unigine Heaven 3.0 and older contain too many bugs and can't be supported
+  by drivers that want to be compliant.
+
 * Various Unigine products don't use the #version and #extension GLSL
   directives, meaning they only get GLSL 1.10 and no extensions for their
   shaders.
   Enabling all extensions for Unigine fixes most issues, but the GLSL version
   is still 1.10.
 
-* Unigine Heaven 3.0 with ARB_texture_multisample uses a "ivec4 * vec4"
-  expression, which is illegal in GLSL 1.10.
-  Adding "#version 130" fixes this.
-
-* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses the uint keyword, which
-  is illegal in GLSL 1.10.
-  Adding "#version 130" fixes this.
-
-* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses a "uint & int"
-  expression, which is illegal in any GLSL version.
-  Disabling ARB_shader_bit_encoding fixes this.
-
 * If ARB_sample_shading is supported, Unigine Heaven 4.0 and Valley 1.0 uses
   an #extension directive in the middle of its shaders, which is illegal
   in GLSL.
@@ -45,18 +36,10 @@ TODO: document the other workarounds.
 	</application>
 
         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
-            <option name="force_glsl_extensions_warn" value="true" />
-            <option name="disable_blend_func_extended" value="true" />
-            <option name="force_glsl_version" value="130" />
-            <option name="disable_shader_bit_encoding" value="true" />
             <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 
         <application name="Unigine Heaven (64-bit)" executable="heaven_x64">
-            <option name="force_glsl_extensions_warn" value="true" />
-            <option name="disable_blend_func_extended" value="true" />
-            <option name="force_glsl_version" value="130" />
-            <option name="disable_shader_bit_encoding" value="true" />
             <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 

From e2b59a39cbb64f6759f463f7bad162f5f03807b4 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 09:59:11 +0100
Subject: [PATCH 0485/1208] mapi: add ARB_tessellation_shader

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../glapi/gen/ARB_tessellation_shader.xml     | 62 +++++++++++++++++++
 src/mapi/glapi/gen/gl_API.xml                 |  6 +-
 src/mapi/glapi/gen/gl_enums.py                |  1 +
 src/mesa/main/mtypes.h                        |  2 +-
 src/mesa/main/shaderapi.c                     | 18 ++++++
 src/mesa/main/shaderapi.h                     |  8 +++
 src/mesa/main/tests/dispatch_sanity.cpp       |  2 +
 7 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 src/mapi/glapi/gen/ARB_tessellation_shader.xml

diff --git a/src/mapi/glapi/gen/ARB_tessellation_shader.xml b/src/mapi/glapi/gen/ARB_tessellation_shader.xml
new file mode 100644
index 00000000000..16a213933ef
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_tessellation_shader.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- Note: no GLX protocol info yet. -->
+
+<OpenGLAPI>
+
+
+<category name="GL_ARB_tessellation_shader" number="91">
+
+    <!--<enum value="0" name="FALSE"/>
+    <enum value="1" name="TRUE"/>
+    <enum value="0x0004" name="TRIANGLES"/>
+    <enum value="0x0007" name="QUADS"/>
+    <enum value="0x0202" name="EQUAL"/>
+    <enum value="0x0900" name="CW"/>
+    <enum value="0x0901" name="CCW"/>-->
+
+    <enum value="0x000E" name="PATCHES"/>
+    <enum value="0x84F0" name="UNIFORM_BLOCK_REFERENCED_BY_TESS_CONTROL_SHADER"/>
+    <enum value="0x84F1" name="UNIFORM_BLOCK_REFERENCED_BY_TESS_EVALUATION_SHADER"/>
+    <enum value="0x886C" name="MAX_TESS_CONTROL_INPUT_COMPONENTS"/>
+    <enum value="0x886D" name="MAX_TESS_EVALUATION_INPUT_COMPONENTS"/>
+    <enum value="0x8E1E" name="MAX_COMBINED_TESS_CONTROL_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E1F" name="MAX_COMBINED_TESS_EVALUATION_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E72" name="PATCH_VERTICES"/>
+    <enum value="0x8E73" name="PATCH_DEFAULT_INNER_LEVEL"/>
+    <enum value="0x8E74" name="PATCH_DEFAULT_OUTER_LEVEL"/>
+    <enum value="0x8E75" name="TESS_CONTROL_OUTPUT_VERTICES"/>
+    <enum value="0x8E76" name="TESS_GEN_MODE"/>
+    <enum value="0x8E77" name="TESS_GEN_SPACING"/>
+    <enum value="0x8E78" name="TESS_GEN_VERTEX_ORDER"/>
+    <enum value="0x8E79" name="TESS_GEN_POINT_MODE"/>
+    <enum value="0x8E7A" name="ISOLINES"/>
+    <enum value="0x8E7B" name="FRACTIONAL_ODD"/>
+    <enum value="0x8E7C" name="FRACTIONAL_EVEN"/>
+    <enum value="0x8E7D" name="MAX_PATCH_VERTICES"/>
+    <enum value="0x8E7E" name="MAX_TESS_GEN_LEVEL"/>
+    <enum value="0x8E7F" name="MAX_TESS_CONTROL_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E80" name="MAX_TESS_EVALUATION_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E81" name="MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS"/>
+    <enum value="0x8E82" name="MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS"/>
+    <enum value="0x8E83" name="MAX_TESS_CONTROL_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E84" name="MAX_TESS_PATCH_COMPONENTS"/>
+    <enum value="0x8E85" name="MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E86" name="MAX_TESS_EVALUATION_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E87" name="TESS_EVALUATION_SHADER"/>
+    <enum value="0x8E88" name="TESS_CONTROL_SHADER"/>
+    <enum value="0x8E89" name="MAX_TESS_CONTROL_UNIFORM_BLOCKS"/>
+    <enum value="0x8E8A" name="MAX_TESS_EVALUATION_UNIFORM_BLOCKS"/>
+
+    <function name="PatchParameteri" offset="assign">
+        <param name="pname" type="GLenum"/>
+        <param name="value" type="GLint"/>
+    </function>
+    <function name="PatchParameterfv" offset="assign">
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="const GLfloat *"/>
+    </function>
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index eb8c72a5e15..4bcce3c3240 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8072,7 +8072,11 @@
 
 <xi:include href="ARB_vertex_type_2_10_10_10_rev.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions #86...#93 -->
+<!-- ARB extensions #86...#90 -->
+
+<xi:include href="ARB_tessellation_shader.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions #92...#93 -->
 
 <xi:include href="ARB_draw_indirect.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mapi/glapi/gen/gl_enums.py b/src/mapi/glapi/gen/gl_enums.py
index a2d6c7ca2fd..041c2f8ddb8 100644
--- a/src/mapi/glapi/gen/gl_enums.py
+++ b/src/mapi/glapi/gen/gl_enums.py
@@ -118,6 +118,7 @@ static const char *prim_names[PRIM_MAX+3] = {
    "GL_LINE_STRIP_ADJACENCY",
    "GL_TRIANGLES_ADJACENCY",
    "GL_TRIANGLE_STRIP_ADJACENCY",
+   "GL_PATCHES",
    "outside begin/end",
    "unknown state"
 };
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 4b0a995aec9..711f031a0fd 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -90,7 +90,7 @@ struct vbo_context;
 
 
 /** Extra draw modes beyond GL_POINTS, GL_TRIANGLE_FAN, etc */
-#define PRIM_MAX                 GL_TRIANGLE_STRIP_ADJACENCY
+#define PRIM_MAX                 GL_PATCHES
 #define PRIM_OUTSIDE_BEGIN_END   (PRIM_MAX + 1)
 #define PRIM_UNKNOWN             (PRIM_MAX + 2)
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 3365c7a6727..ccf008a914b 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1984,3 +1984,21 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count,
 
    return _mesa_create_shader_program(ctx, GL_TRUE, type, count, strings);
 }
+
+
+/**
+ * For GL_ARB_tessellation_shader
+ */
+extern void GLAPIENTRY
+_mesa_PatchParameteri(GLenum pname, GLint value)
+{
+   /* STUB */
+}
+
+
+extern void GLAPIENTRY
+_mesa_PatchParameterfv(GLenum pname, const GLfloat *values)
+{
+   /* STUB */
+}
+
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index aba6d5d8306..90e2e2d25d4 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -264,6 +264,14 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
                              GLsizei bufSize, GLsizei *length,
                              GLint *params);
 
+/* GL_ARB_tessellation_shader */
+extern void GLAPIENTRY
+_mesa_PatchParameteri(GLenum pname, GLint value);
+
+extern void GLAPIENTRY
+_mesa_PatchParameterfv(GLenum pname, const GLfloat *values);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 901e6fc53b8..7e3a97baea8 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -563,6 +563,8 @@ const struct function common_desktop_functions_possible[] = {
 
    /* GL 4.0 */
    { "glMinSampleShading", 40, -1 },
+   { "glPatchParameteri", 40, -1 },
+   { "glPatchParameterfv", 40, -1 },
    { "glBlendEquationi", 40, -1 },
    { "glBlendEquationSeparatei", 40, -1 },
    { "glBlendFunci", 40, -1 },

From df3860a3e3269bfe77562058fd87b39ae2f57fcc Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:13:16 +0100
Subject: [PATCH 0486/1208] mesa: add tessellation shader structs

Marek: remove unused members, cleanup

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/mtypes.h | 105 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 711f031a0fd..fc001c2db15 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2164,6 +2164,29 @@ struct gl_vertex_program
 };
 
 
+/** Tessellation control program object */
+struct gl_tess_ctrl_program
+{
+   struct gl_program Base;   /**< base class */
+
+   /* output layout */
+   GLint VerticesOut;
+};
+
+
+/** Tessellation evaluation program object */
+struct gl_tess_eval_program
+{
+   struct gl_program Base;   /**< base class */
+
+   /* input layout */
+   GLenum PrimitiveMode; /* GL_TRIANGLES, GL_QUADS or GL_ISOLINES */
+   GLenum Spacing;       /* GL_EQUAL, GL_FRACTIONAL_EVEN, GL_FRACTIONAL_ODD */
+   GLenum VertexOrder;   /* GL_CW or GL_CCW */
+   bool PointMode;
+};
+
+
 /** Geometry program object */
 struct gl_geometry_program
 {
@@ -2266,6 +2289,27 @@ struct gl_vertex_program_state
    GLboolean _Overriden;
 };
 
+/**
+ * Context state for tessellation control programs.
+ */
+struct gl_tess_ctrl_program_state
+{
+   /** Currently bound and valid shader. */
+   struct gl_tess_ctrl_program *_Current;
+
+   GLint patch_vertices;
+   GLfloat patch_default_outer_level[4];
+   GLfloat patch_default_inner_level[2];
+};
+
+/**
+ * Context state for tessellation evaluation programs.
+ */
+struct gl_tess_eval_program_state
+{
+   /** Currently bound and valid shader. */
+   struct gl_tess_eval_program *_Current;
+};
 
 /**
  * Context state for geometry programs.
@@ -2445,6 +2489,41 @@ struct gl_shader
    bool origin_upper_left;
    bool pixel_center_integer;
 
+   /**
+    * Tessellation Control shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * 0 - vertices not declared in shader, or
+       * 1 .. GL_MAX_PATCH_VERTICES
+       */
+      GLint VerticesOut;
+   } TessCtrl;
+
+   /**
+    * Tessellation Evaluation shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * GL_TRIANGLES, GL_QUADS, GL_ISOLINES or PRIM_UNKNOWN if it's not set
+       * in this shader.
+       */
+      GLenum PrimitiveMode;
+      /**
+       * GL_EQUAL, GL_FRACTIONAL_ODD, GL_FRACTIONAL_EVEN, or 0 if it's not set
+       * in this shader.
+       */
+      GLenum Spacing;
+      /**
+       * GL_CW, GL_CCW, or 0 if it's not set in this shader.
+       */
+      GLenum VertexOrder;
+      /**
+       * 1, 0, or -1 if it's not set in this shader.
+       */
+      int PointMode;
+   } TessEval;
+
    /**
     * Geometry shader state from GLSL 1.50 layout qualifiers.
     */
@@ -2673,6 +2752,30 @@ struct gl_shader_program
    /** Post-link gl_FragDepth layout for ARB_conservative_depth. */
    enum gl_frag_depth_layout FragDepthLayout;
 
+   /**
+    * Tessellation Control shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * 0 - vertices not declared in shader, or
+       * 1 .. GL_MAX_PATCH_VERTICES
+       */
+      GLint VerticesOut;
+   } TessCtrl;
+
+   /**
+    * Tessellation Evaluation shader state from layout qualifiers.
+    */
+   struct {
+      /** GL_TRIANGLES, GL_QUADS or GL_ISOLINES */
+      GLenum PrimitiveMode;
+      /** GL_EQUAL, GL_FRACTIONAL_ODD or GL_FRACTIONAL_EVEN */
+      GLenum Spacing;
+      /** GL_CW or GL_CCW */
+      GLenum VertexOrder;
+      bool PointMode;
+   } TessEval;
+
    /**
     * Geometry shader state - copied into gl_geometry_program by
     * _mesa_copy_linked_program_data().
@@ -4287,6 +4390,8 @@ struct gl_context
    struct gl_fragment_program_state FragmentProgram;
    struct gl_geometry_program_state GeometryProgram;
    struct gl_compute_program_state ComputeProgram;
+   struct gl_tess_ctrl_program_state TessCtrlProgram;
+   struct gl_tess_eval_program_state TessEvalProgram;
    struct gl_ati_fragment_shader_state ATIFragmentShader;
 
    struct gl_pipeline_shader_state Pipeline; /**< GLSL pipeline shader object state */

From a2af956963b6bc4d29f37485e44c98008d2ef077 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:19:09 +0100
Subject: [PATCH 0487/1208] mesa: add tessellation shader enums

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/shader_enums.h                 | 8 +++++---
 src/mesa/drivers/common/meta.c          | 2 ++
 src/mesa/drivers/dri/i965/intel_debug.c | 4 +++-
 src/mesa/main/context.c                 | 2 ++
 src/mesa/main/shaderobj.h               | 4 ++++
 src/mesa/program/prog_print.c           | 6 ++++++
 src/mesa/program/program.h              | 8 ++++++++
 7 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/glsl/shader_enums.h b/src/glsl/shader_enums.h
index 79e0f6b5f78..42a30ae8fb9 100644
--- a/src/glsl/shader_enums.h
+++ b/src/glsl/shader_enums.h
@@ -36,9 +36,11 @@
 typedef enum
 {
    MESA_SHADER_VERTEX = 0,
-   MESA_SHADER_GEOMETRY = 1,
-   MESA_SHADER_FRAGMENT = 2,
-   MESA_SHADER_COMPUTE = 3,
+   MESA_SHADER_TESS_CTRL = 1,
+   MESA_SHADER_TESS_EVAL = 2,
+   MESA_SHADER_GEOMETRY = 3,
+   MESA_SHADER_FRAGMENT = 4,
+   MESA_SHADER_COMPUTE = 5,
 } gl_shader_stage;
 
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 12045ebbf8e..ca10bbe81f6 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -945,6 +945,8 @@ _mesa_meta_end(struct gl_context *ctx)
    if (state & MESA_META_SHADER) {
       static const GLenum targets[] = {
          GL_VERTEX_SHADER,
+         GL_TESS_CONTROL_SHADER,
+         GL_TESS_EVALUATION_SHADER,
          GL_GEOMETRY_SHADER,
          GL_FRAGMENT_SHADER,
       };
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index b68c2127f8d..a0777310e2a 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -79,11 +79,13 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage)
 {
    uint64_t flags[] = {
       [MESA_SHADER_VERTEX] = DEBUG_VS,
+      [MESA_SHADER_TESS_CTRL] = 0,
+      [MESA_SHADER_TESS_EVAL] = 0,
       [MESA_SHADER_GEOMETRY] = DEBUG_GS,
       [MESA_SHADER_FRAGMENT] = DEBUG_WM,
       [MESA_SHADER_COMPUTE] = DEBUG_CS,
    };
-   STATIC_ASSERT(MESA_SHADER_STAGES == 4);
+   STATIC_ASSERT(MESA_SHADER_STAGES == 6);
    return flags[stage];
 }
 
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index f4dc4e3c4da..380c21fb2bf 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -478,6 +478,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
       prog->MaxInputComponents = 16 * 4; /* old limit not to break tnl and swrast */
       prog->MaxOutputComponents = 0; /* value not used */
       break;
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       prog->MaxParameters = MAX_VERTEX_PROGRAM_PARAMS;
       prog->MaxAttribs = MAX_VERTEX_GENERIC_ATTRIBS;
diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index 3d696a1887e..75f019dbd3d 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -111,6 +111,10 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_SHADER:
       return MESA_SHADER_COMPUTE;
    default:
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index e4faa63c06f..336a5ef4448 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -1015,6 +1015,12 @@ _mesa_write_shader_to_file(const struct gl_shader *shader)
    case MESA_SHADER_FRAGMENT:
       type = "frag";
       break;
+   case MESA_SHADER_TESS_CTRL:
+      type = "tesc";
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      type = "tese";
+      break;
    case MESA_SHADER_VERTEX:
       type = "vert";
       break;
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 2d92ab2f118..396a5c89db3 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -216,6 +216,10 @@ _mesa_program_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_PROGRAM_NV:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_PROGRAM_NV:
       return MESA_SHADER_COMPUTE;
    default:
@@ -235,6 +239,10 @@ _mesa_shader_stage_to_program(unsigned stage)
       return GL_FRAGMENT_PROGRAM_ARB;
    case MESA_SHADER_GEOMETRY:
       return GL_GEOMETRY_PROGRAM_NV;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_PROGRAM_NV;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_PROGRAM_NV;
    case MESA_SHADER_COMPUTE:
       return GL_COMPUTE_PROGRAM_NV;
    }

From bb97cc66c149d0782ec269aab29700252fda9db0 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 21 Sep 2014 12:41:07 +1200
Subject: [PATCH 0488/1208] mesa: add tessellation shader state and limits

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/config.h    | 8 ++++++++
 src/mesa/main/context.c   | 8 ++++++++
 src/mesa/main/mtypes.h    | 6 ++++++
 src/mesa/main/shaderapi.c | 7 +++++++
 4 files changed, 29 insertions(+)

diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 177f1766016..66eb6cb206d 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -296,6 +296,14 @@
 /** For GL_ARB_pipeline_statistics_query */
 #define MAX_PIPELINE_STATISTICS             11
 
+/** For GL_ARB_tessellation_shader */
+/*@{*/
+#define MAX_TESS_GEN_LEVEL 64
+#define MAX_PATCH_VERTICES 32
+#define MAX_TESS_PATCH_COMPONENTS 120
+#define MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS 4096
+/*@}*/
+
 /*
  * Color channel component order
  * 
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 380c21fb2bf..a08c0d84f6b 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -716,6 +716,14 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api)
 
    /** GL_KHR_context_flush_control */
    consts->ContextReleaseBehavior = GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH;
+
+   /** GL_ARB_tessellation_shader */
+   consts->MaxTessGenLevel = MAX_TESS_GEN_LEVEL;
+   consts->MaxPatchVertices = MAX_PATCH_VERTICES;
+   consts->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS;
+   consts->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS;
+   consts->MaxTessPatchComponents = MAX_TESS_PATCH_COMPONENTS;
+   consts->MaxTessControlTotalOutputComponents = MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS;
 }
 
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index fc001c2db15..e4342e0308c 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3745,6 +3745,12 @@ struct gl_constants
    GLenum ContextReleaseBehavior;
 
    struct gl_shader_compiler_options ShaderCompilerOptions[MESA_SHADER_STAGES];
+
+   /** GL_ARB_tessellation_shader */
+   GLuint MaxPatchVertices;
+   GLuint MaxTessGenLevel;
+   GLuint MaxTessPatchComponents;
+   GLuint MaxTessControlTotalOutputComponents;
 };
 
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index ccf008a914b..a666fdf3d7e 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -110,6 +110,7 @@ _mesa_init_shader_state(struct gl_context *ctx)
     */
    struct gl_shader_compiler_options options;
    gl_shader_stage sh;
+   int i;
 
    memset(&options, 0, sizeof(options));
    options.MaxUnrollIterations = 32;
@@ -126,6 +127,12 @@ _mesa_init_shader_state(struct gl_context *ctx)
    /* Extended for ARB_separate_shader_objects */
    ctx->Shader.RefCount = 1;
    mtx_init(&ctx->Shader.Mutex, mtx_plain);
+
+   ctx->TessCtrlProgram.patch_vertices = 3;
+   for (i = 0; i < 4; ++i)
+      ctx->TessCtrlProgram.patch_default_outer_level[i] = 1.0;
+   for (i = 0; i < 2; ++i)
+      ctx->TessCtrlProgram.patch_default_inner_level[i] = 1.0;
 }
 
 

From 78d3054980edd1a12e56ad0362e889915cff335b Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:28:03 +0100
Subject: [PATCH 0489/1208] mesa: add tessellation shader init functions.

Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/program/program.c | 32 ++++++++++++++++++++++++++++++++
 src/mesa/program/program.h | 10 ++++++++++
 2 files changed, 42 insertions(+)

diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index c13e61b1630..ffad395f324 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -285,6 +285,38 @@ _mesa_init_compute_program(struct gl_context *ctx,
 }
 
 
+/**
+ * Initialize a new tessellation control program object.
+ */
+struct gl_program *
+_mesa_init_tess_ctrl_program(struct gl_context *ctx,
+                             struct gl_tess_ctrl_program *prog,
+                             GLenum target, GLuint id)
+{
+   if (prog) {
+      init_program_struct(&prog->Base, target, id);
+      return &prog->Base;
+   }
+   return NULL;
+}
+
+
+/**
+ * Initialize a new tessellation evaluation program object.
+ */
+struct gl_program *
+_mesa_init_tess_eval_program(struct gl_context *ctx,
+                             struct gl_tess_eval_program *prog,
+                             GLenum target, GLuint id)
+{
+   if (prog) {
+      init_program_struct(&prog->Base, target, id);
+      return &prog->Base;
+   }
+   return NULL;
+}
+
+
 /**
  * Initialize a new geometry program object.
  */
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 396a5c89db3..eafb969d2a5 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -78,6 +78,16 @@ _mesa_init_fragment_program(struct gl_context *ctx,
                             struct gl_fragment_program *prog,
                             GLenum target, GLuint id);
 
+extern struct gl_program *
+_mesa_init_tess_ctrl_program(struct gl_context *ctx,
+                            struct gl_tess_ctrl_program *prog,
+                            GLenum target, GLuint id);
+
+extern struct gl_program *
+_mesa_init_tess_eval_program(struct gl_context *ctx,
+                            struct gl_tess_eval_program *prog,
+                            GLenum target, GLuint id);
+
 extern struct gl_program *
 _mesa_init_geometry_program(struct gl_context *ctx,
                             struct gl_geometry_program *prog,

From a894ed82931840713aac25634ed469ac65889bfa Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Mon, 10 Mar 2014 11:58:37 +0100
Subject: [PATCH 0490/1208] mesa: add misc tessellation shader support

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/program/program.c | 27 +++++++++++++++++
 src/mesa/program/program.h | 62 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index ffad395f324..2d03bba3d12 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -365,6 +365,16 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id)
                                          CALLOC_STRUCT(gl_geometry_program),
                                          target, id);
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      prog = _mesa_init_tess_ctrl_program(ctx,
+                                          CALLOC_STRUCT(gl_tess_ctrl_program),
+                                          target, id);
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      prog = _mesa_init_tess_eval_program(ctx,
+                                         CALLOC_STRUCT(gl_tess_eval_program),
+                                         target, id);
+      break;
    case GL_COMPUTE_PROGRAM_NV:
       prog = _mesa_init_compute_program(ctx,
                                         CALLOC_STRUCT(gl_compute_program),
@@ -586,6 +596,23 @@ _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog)
          gpc->UsesStreams = gp->UsesStreams;
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog);
+         struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone);
+         tcpc->VerticesOut = tcp->VerticesOut;
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog);
+         struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone);
+         tepc->PrimitiveMode = tep->PrimitiveMode;
+         tepc->Spacing = tep->Spacing;
+         tepc->VertexOrder = tep->VertexOrder;
+         tepc->PointMode = tep->PointMode;
+      }
+      break;
    default:
       _mesa_problem(NULL, "Unexpected target in _mesa_clone_program");
    }
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index eafb969d2a5..a894147cafd 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -157,6 +157,25 @@ _mesa_reference_compprog(struct gl_context *ctx,
                            (struct gl_program *) prog);
 }
 
+
+static inline void
+_mesa_reference_tesscprog(struct gl_context *ctx,
+                         struct gl_tess_ctrl_program **ptr,
+                         struct gl_tess_ctrl_program *prog)
+{
+   _mesa_reference_program(ctx, (struct gl_program **) ptr,
+                           (struct gl_program *) prog);
+}
+
+static inline void
+_mesa_reference_tesseprog(struct gl_context *ctx,
+                         struct gl_tess_eval_program **ptr,
+                         struct gl_tess_eval_program *prog)
+{
+   _mesa_reference_program(ctx, (struct gl_program **) ptr,
+                           (struct gl_program *) prog);
+}
+
 extern struct gl_program *
 _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog);
 
@@ -167,6 +186,20 @@ _mesa_clone_vertex_program(struct gl_context *ctx,
    return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base);
 }
 
+static inline struct gl_tess_ctrl_program *
+_mesa_clone_tess_ctrl_program(struct gl_context *ctx,
+                             const struct gl_tess_ctrl_program *prog)
+{
+   return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base);
+}
+
+static inline struct gl_tess_eval_program *
+_mesa_clone_tess_eval_program(struct gl_context *ctx,
+                             const struct gl_tess_eval_program *prog)
+{
+   return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base);
+}
+
 static inline struct gl_geometry_program *
 _mesa_clone_geometry_program(struct gl_context *ctx,
                              const struct gl_geometry_program *prog)
@@ -262,7 +295,9 @@ _mesa_shader_stage_to_program(unsigned stage)
 }
 
 
-/* Cast wrappers from gl_program to gl_vertex/geometry/fragment_program */
+/* Cast wrappers from gl_program to derived program types.
+ * (e.g. gl_vertex_program)
+ */
 
 static inline struct gl_fragment_program *
 gl_fragment_program(struct gl_program *prog)
@@ -315,6 +350,31 @@ gl_compute_program_const(const struct gl_program *prog)
    return (const struct gl_compute_program *) prog;
 }
 
+static inline struct gl_tess_ctrl_program *
+gl_tess_ctrl_program(struct gl_program *prog)
+{
+   return (struct gl_tess_ctrl_program *) prog;
+}
+
+static inline const struct gl_tess_ctrl_program *
+gl_tess_ctrl_program_const(const struct gl_program *prog)
+{
+   return (const struct gl_tess_ctrl_program *) prog;
+}
+
+
+static inline struct gl_tess_eval_program *
+gl_tess_eval_program(struct gl_program *prog)
+{
+   return (struct gl_tess_eval_program *) prog;
+}
+
+static inline const struct gl_tess_eval_program *
+gl_tess_eval_program_const(const struct gl_program *prog)
+{
+   return (const struct gl_tess_eval_program *) prog;
+}
+
 
 #ifdef __cplusplus
 } /* extern "C" */

From fa602c208815c3e4d757072cadc00e617e30b933 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 28 May 2015 19:11:07 +0200
Subject: [PATCH 0491/1208] mesa: add _mesa_has_tessellation

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/context.h  | 11 +++++++++++
 src/mesa/main/queryobj.c |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 6f3c941016f..7d256b18d47 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -344,6 +344,17 @@ _mesa_has_compute_shaders(const struct gl_context *ctx)
 }
 
 
+/**
+ * Checks if the context supports tessellation.
+ */
+static inline GLboolean
+_mesa_has_tessellation(const struct gl_context *ctx)
+{
+   return ctx->API == API_OPENGL_CORE &&
+          ctx->Extensions.ARB_tessellation_shader;
+}
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index e5c6f9ae31e..98366857f62 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -217,7 +217,7 @@ get_query_binding_point(struct gl_context *ctx, GLenum target, GLuint index)
 
    case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
    case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
-      if (ctx->Extensions.ARB_tessellation_shader)
+      if (_mesa_has_tessellation(ctx))
          return get_pipe_stats_binding_point(ctx, target);
       else
          return NULL;

From 8e758c3a74a35f8ee6c5969d5bb5f788b4ef4337 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 21 Sep 2014 12:37:47 +1200
Subject: [PATCH 0492/1208] mesa: allow drawing of patch primitives

Cosmetic changes and fixes by Marek.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/api_validate.c | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index 21ae07b9d50..13a7362ed17 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -127,6 +127,9 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode)
    if (mode <= GL_TRIANGLE_STRIP_ADJACENCY)
       return _mesa_has_geometry_shaders(ctx);
 
+   if (mode == GL_PATCHES)
+      return _mesa_has_tessellation(ctx);
+
    return false;
 }
 
@@ -136,6 +139,7 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode)
  * etc?  Also, do additional checking related to transformation feedback.
  * Note: this function cannot be called during glNewList(GL_COMPILE) because
  * this code depends on current transform feedback state.
+ * Also, do additional checking related to tessellation shaders.
  */
 GLboolean
 _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
@@ -215,6 +219,36 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
       }
    }
 
+   /* From the OpenGL 4.0 (Core Profile) spec (section 2.12):
+    *
+    *     "Tessellation operates only on patch primitives. If tessellation is
+    *      active, any command that transfers vertices to the GL will
+    *      generate an INVALID_OPERATION error if the primitive mode is not
+    *      PATCHES.
+    *      Patch primitives are not supported by pipeline stages below the
+    *      tessellation evaluation shader. If there is no active program
+    *      object or the active program object does not contain a tessellation
+    *      evaluation shader, the error INVALID_OPERATION is generated by any
+    *      command that transfers vertices to the GL if the primitive mode is
+    *      PATCHES."
+    *
+    */
+   if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL] ||
+       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]) {
+      if (mode != GL_PATCHES) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "only GL_PATCHES valid with tessellation");
+         return GL_FALSE;
+      }
+   }
+   else {
+      if (mode == GL_PATCHES) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "GL_PATCHES only valid with tessellation");
+         return GL_FALSE;
+      }
+   }
+
    /* From the GL_EXT_transform_feedback spec:
     *
     *     "The error INVALID_OPERATION is generated if Begin, or any command
@@ -247,6 +281,17 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
             pass = GL_FALSE;
          }
       }
+      else if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]) {
+         struct gl_shader_program *tes =
+            ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+         if (tes->TessEval.PointMode)
+            pass = ctx->TransformFeedback.Mode == GL_POINTS;
+         else if (tes->TessEval.PrimitiveMode == GL_ISOLINES)
+            pass = ctx->TransformFeedback.Mode == GL_LINES;
+         else
+            pass = ctx->TransformFeedback.Mode == GL_TRIANGLES;
+      }
       else {
          switch (mode) {
          case GL_POINTS:

From 5852b5d2fa02d7716c2fbf859d058a2881416e9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 13 Jun 2015 22:26:56 +0200
Subject: [PATCH 0493/1208] mesa: take tessellation into account when
 validating GS input primitive mode

I've reported the bug in the Khronos bugzilla.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/api_validate.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index 13a7362ed17..2e211c7f3a9 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -174,11 +174,29 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
     *   TRIANGLES_ADJACENCY_ARB and <mode> is not
     *   TRIANGLES_ADJACENCY_ARB or TRIANGLE_STRIP_ADJACENCY_ARB.
     *
+    * The GL spec doesn't mention any interaction with tessellation, which
+    * is clearly a spec bug. The same rule should apply, but instead of
+    * the draw primitive mode, the tessellation evaluation shader primitive
+    * mode should be used for the checking.
    */
    if (ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]) {
       const GLenum geom_mode =
          ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]->Geom.InputType;
-      switch (mode) {
+      struct gl_shader_program *tes =
+         ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+      GLenum mode_before_gs = mode;
+
+      if (tes) {
+         if (tes->TessEval.PointMode)
+            mode_before_gs = GL_POINTS;
+         else if (tes->TessEval.PrimitiveMode == GL_ISOLINES)
+            mode_before_gs = GL_LINES;
+         else
+            /* the GL_QUADS mode generates triangles too */
+            mode_before_gs = GL_TRIANGLES;
+      }
+
+      switch (mode_before_gs) {
       case GL_POINTS:
          valid_enum = (geom_mode == GL_POINTS);
          break;
@@ -213,7 +231,7 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(mode=%s vs geometry shader input %s)",
                      name,
-                     _mesa_lookup_prim_by_nr(mode),
+                     _mesa_lookup_prim_by_nr(mode_before_gs),
                      _mesa_lookup_prim_by_nr(geom_mode));
          return GL_FALSE;
       }

From 6435b2909e4f1b82268a1c5769c0c228cda768e0 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:33:54 +0100
Subject: [PATCH 0494/1208] mesa: support tess stages in glGetProgramPipelineiv

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/pipelineobj.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index bdf343a6763..8035c14b022 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -588,6 +588,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
    /* Are geometry shaders available in this context?
     */
    const bool has_gs = _mesa_has_geometry_shaders(ctx);
+   const bool has_tess = _mesa_has_tessellation(ctx);;
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -615,11 +616,17 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
          ? pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name : 0;
       return;
    case GL_TESS_EVALUATION_SHADER:
-      /* NOT YET SUPPORTED */
-      break;
+      if (!has_tess)
+         break;
+      *params = pipe->CurrentProgram[MESA_SHADER_TESS_EVAL]
+         ? pipe->CurrentProgram[MESA_SHADER_TESS_EVAL]->Name : 0;
+      return;
    case GL_TESS_CONTROL_SHADER:
-      /* NOT YET SUPPORTED */
-      break;
+      if (!has_tess)
+         break;
+      *params = pipe->CurrentProgram[MESA_SHADER_TESS_CTRL]
+         ? pipe->CurrentProgram[MESA_SHADER_TESS_CTRL]->Name : 0;
+      return;
    case GL_GEOMETRY_SHADER:
       if (!has_gs)
          break;

From a30cc2882934ef25f41e1e41eb56d0b768f00b26 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 21 Sep 2014 11:16:06 +1200
Subject: [PATCH 0495/1208] mesa: allow tess stages in glUseProgramStages

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/pipelineobj.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 8035c14b022..3f7acfbc763 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -244,14 +244,13 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
     *
     *     "If stages is not the special value ALL_SHADER_BITS, and has a bit
     *     set that is not recognized, the error INVALID_VALUE is generated."
-    *
-    * NOT YET SUPPORTED:
-    * GL_TESS_CONTROL_SHADER_BIT
-    * GL_TESS_EVALUATION_SHADER_BIT
     */
    any_valid_stages = GL_VERTEX_SHADER_BIT | GL_FRAGMENT_SHADER_BIT;
    if (_mesa_has_geometry_shaders(ctx))
       any_valid_stages |= GL_GEOMETRY_SHADER_BIT;
+   if (_mesa_has_tessellation(ctx))
+      any_valid_stages |= GL_TESS_CONTROL_SHADER_BIT |
+                          GL_TESS_EVALUATION_SHADER_BIT;
 
    if (stages != GL_ALL_SHADER_BITS && (stages & ~any_valid_stages) != 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glUseProgramStages(Stages)");
@@ -327,6 +326,12 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 
    if ((stages & GL_GEOMETRY_SHADER_BIT) != 0)
       _mesa_use_shader_program(ctx, GL_GEOMETRY_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_CONTROL_SHADER_BIT) != 0)
+      _mesa_use_shader_program(ctx, GL_TESS_CONTROL_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0)
+      _mesa_use_shader_program(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe);
 }
 
 /**

From e32e546c17932161e5417a952db3fb7a19cdc93c Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 21 Sep 2014 12:08:22 +1200
Subject: [PATCH 0496/1208] mesa: require VS if TCS or TES is present in
 pipeline

Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/pipelineobj.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 3f7acfbc763..07acbf10c1d 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -789,7 +789,9 @@ _mesa_validate_program_pipeline(struct gl_context* ctx,
     *           executable vertex shader."
     */
    if (!pipe->CurrentProgram[MESA_SHADER_VERTEX]
-       && pipe->CurrentProgram[MESA_SHADER_GEOMETRY]) {
+       && (pipe->CurrentProgram[MESA_SHADER_GEOMETRY] ||
+           pipe->CurrentProgram[MESA_SHADER_TESS_CTRL] ||
+           pipe->CurrentProgram[MESA_SHADER_TESS_EVAL])) {
       pipe->InfoLog = ralloc_strdup(pipe, "Program lacks a vertex shader");
       goto err;
    }

From cb0c12512cf83ac412ecc78d4d4c5318c46c9b22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 13 Jun 2015 23:06:06 +0200
Subject: [PATCH 0497/1208] mesa: allow setting of patch parameters.

Based on a patch from Fabian Bieler <fabianbieler@fastmail.fm>.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/mtypes.h    |  5 +++++
 src/mesa/main/shaderapi.c | 44 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index e4342e0308c..e30f923dd7b 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -4161,6 +4161,11 @@ struct gl_driver_flags
     * gl_context::ImageUnits
     */
    uint64_t NewImageUnits;
+
+   /**
+    * gl_context::TessCtrlProgram::patch_default_*
+    */
+   uint64_t NewDefaultTessLevels;
 };
 
 struct gl_uniform_buffer_binding
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a666fdf3d7e..2a40eea1d41 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1999,13 +1999,53 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count,
 extern void GLAPIENTRY
 _mesa_PatchParameteri(GLenum pname, GLint value)
 {
-   /* STUB */
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_tessellation(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameteri");
+      return;
+   }
+
+   if (pname != GL_PATCH_VERTICES) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameteri");
+      return;
+   }
+
+   if (value <= 0 || value > ctx->Const.MaxPatchVertices) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glPatchParameteri");
+      return;
+   }
+
+   ctx->TessCtrlProgram.patch_vertices = value;
 }
 
 
 extern void GLAPIENTRY
 _mesa_PatchParameterfv(GLenum pname, const GLfloat *values)
 {
-   /* STUB */
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_tessellation(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameterfv");
+      return;
+   }
+
+   switch(pname) {
+   case GL_PATCH_DEFAULT_OUTER_LEVEL:
+      FLUSH_VERTICES(ctx, 0);
+      memcpy(ctx->TessCtrlProgram.patch_default_outer_level, values,
+             4 * sizeof(GLfloat));
+      ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels;
+      return;
+   case GL_PATCH_DEFAULT_INNER_LEVEL:
+      FLUSH_VERTICES(ctx, 0);
+      memcpy(ctx->TessCtrlProgram.patch_default_inner_level, values,
+             2 * sizeof(GLfloat));
+      ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels;
+      return;
+   default:
+      _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameterfv");
+      return;
+   }
 }
 

From 6823d713c68dfb5679a7c96d06f72c31f755d686 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:39:18 +0100
Subject: [PATCH 0498/1208] mesa: add tessellation shader getters (v3)

Tessellation dependencies added by Marek.

v2: require tessellation in addition to atomics/images for some glGet queries

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/get.c              | 23 +++++++++
 src/mesa/main/get_hash_params.py | 28 +++++++++++
 src/mesa/main/shaderapi.c        | 84 ++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index ffafe51ba05..590fe284f4e 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -149,6 +149,8 @@ enum value_extra {
    EXTRA_EXT_UBO_GS4,
    EXTRA_EXT_ATOMICS_GS4,
    EXTRA_EXT_SHADER_IMAGE_GS4,
+   EXTRA_EXT_ATOMICS_TESS,
+   EXTRA_EXT_SHADER_IMAGE_TESS,
 };
 
 #define NO_EXTRA NULL
@@ -349,6 +351,16 @@ static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_shader_atomic_counters_and_tessellation[] = {
+   EXTRA_EXT_ATOMICS_TESS,
+   EXTRA_END
+};
+
+static const int extra_ARB_shader_image_load_store_and_tessellation[] = {
+   EXTRA_EXT_SHADER_IMAGE_TESS,
+   EXTRA_END
+};
+
 static const int extra_ARB_draw_indirect_es31[] = {
    EXT(ARB_draw_indirect),
    EXTRA_API_ES31,
@@ -401,6 +413,7 @@ EXTRA_EXT(ARB_explicit_uniform_location);
 EXTRA_EXT(ARB_clip_control);
 EXTRA_EXT(EXT_polygon_offset_clamp);
 EXTRA_EXT(ARB_framebuffer_no_attachments);
+EXTRA_EXT(ARB_tessellation_shader);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -1149,6 +1162,16 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
          api_found = (ctx->Extensions.ARB_shader_image_load_store &&
                       _mesa_has_geometry_shaders(ctx));
          break;
+      case EXTRA_EXT_ATOMICS_TESS:
+         api_check = GL_TRUE;
+         api_found = ctx->Extensions.ARB_shader_atomic_counters &&
+                     _mesa_has_tessellation(ctx);
+         break;
+      case EXTRA_EXT_SHADER_IMAGE_TESS:
+         api_check = GL_TRUE;
+         api_found = ctx->Extensions.ARB_shader_image_load_store &&
+                     _mesa_has_tessellation(ctx);
+         break;
       case EXTRA_END:
 	 break;
       default: /* *e is a offset into the extension struct */
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index c25e1b6555f..2cf06d64c9f 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -824,6 +824,34 @@ descriptor=[
   [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
   [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
   [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
+
+# GL_ARB_tessellation_shader
+  [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],
+  [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ],
+  [ "PATCH_DEFAULT_INNER_LEVEL", "CONTEXT_FLOAT2(TessCtrlProgram.patch_default_inner_level), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_GEN_LEVEL", "CONTEXT_INT(Const.MaxTessGenLevel), extra_ARB_tessellation_shader" ],
+  [ "MAX_PATCH_VERTICES", "CONTEXT_INT(Const.MaxPatchVertices), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_PATCH_COMPONENTS", "CONTEXT_INT(Const.MaxTessPatchComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.MaxTessControlTotalOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxInputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxInputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks), extra_ARB_tessellation_shader" ],
+  [ "MAX_COMBINED_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_COMBINED_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ],
+# Dependencies on GL_ARB_tessellation_shader
+  [ "MAX_TESS_CONTROL_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_CONTROL_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_EVALUATION_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_EVALUATION_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_CONTROL_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
+  [ "MAX_TESS_EVALUATION_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
 ]}
 
 ]
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 2a40eea1d41..a074a444d67 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -517,6 +517,57 @@ check_gs_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
 }
 
 
+/**
+ * Check if a tessellation control shader query is valid at this time.
+ * If not, report an error and return false.
+ *
+ * From GL 4.0 section 6.1.12 (Shader and Program Queries):
+ *
+ *     "If TESS_CONTROL_OUTPUT_VERTICES is queried for a program which has
+ *     not been linked successfully, or which does not contain objects to
+ *     form a tessellation control shader, then an INVALID_OPERATION error is
+ *     generated."
+ */
+static bool
+check_tcs_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
+{
+   if (shProg->LinkStatus &&
+       shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL] != NULL) {
+      return true;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glGetProgramv(linked tessellation control shader required)");
+   return false;
+}
+
+
+/**
+ * Check if a tessellation evaluation shader query is valid at this time.
+ * If not, report an error and return false.
+ *
+ * From GL 4.0 section 6.1.12 (Shader and Program Queries):
+ *
+ *     "If any of the pname values in this paragraph are queried for a program
+ *     which has not been linked successfully, or which does not contain
+ *     objects to form a tessellation evaluation shader, then an
+ *     INVALID_OPERATION error is generated."
+ *
+ */
+static bool
+check_tes_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
+{
+   if (shProg->LinkStatus &&
+       shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL] != NULL) {
+      return true;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramv(linked tessellation "
+               "evaluation shader required)");
+   return false;
+}
+
+
 /**
  * glGetProgramiv() - get shader program state.
  * Note that this is for GLSL shader programs, not ARB vertex/fragment
@@ -540,6 +591,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
     * and GL 3.2) are available in this context
     */
    const bool has_core_gs = _mesa_has_geometry_shaders(ctx);
+   const bool has_tess = _mesa_has_tessellation(ctx);
 
    /* Are uniform buffer objects available in this context?
     */
@@ -718,6 +770,38 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    case GL_PROGRAM_SEPARABLE:
       *params = shProg->SeparateShader;
       return;
+
+   /* ARB_tessellation_shader */
+   case GL_TESS_CONTROL_OUTPUT_VERTICES:
+      if (!has_tess)
+         break;
+      if (check_tcs_query(ctx, shProg))
+         *params = shProg->TessCtrl.VerticesOut;
+      return;
+   case GL_TESS_GEN_MODE:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.PrimitiveMode;
+      return;
+   case GL_TESS_GEN_SPACING:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.Spacing;
+      return;
+   case GL_TESS_GEN_VERTEX_ORDER:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.VertexOrder;
+      return;
+   case GL_TESS_GEN_POINT_MODE:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.PointMode;
+      return;
    default:
       break;
    }

From 550a570c5325cc64a547fe4d6e1e75af2d0e9587 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Fri, 7 Mar 2014 10:39:39 +0100
Subject: [PATCH 0499/1208] mesa: add misc tessellation shader stuff

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/context.c   |  2 ++
 src/mesa/main/mtypes.h    |  3 ++-
 src/mesa/main/shaderapi.c | 26 +++++++++++++++++++
 src/mesa/main/state.c     | 54 ++++++++++++++++++++++++++++++++++++---
 4 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index a08c0d84f6b..7451e5a2bdd 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -1331,6 +1331,8 @@ _mesa_free_context_data( struct gl_context *ctx )
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL);
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, NULL);
 
+   _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL);
+   _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL);
    _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
 
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index e30f923dd7b..12390c430e6 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2410,7 +2410,8 @@ struct gl_ati_fragment_shader_state
  */
 struct gl_shader
 {
-   /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB.
+   /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB ||
+    *  GL_TESS_CONTROL_SHADER || GL_TESS_EVALUATION_SHADER.
     * Must be the first field.
     */
    GLenum Type;
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a074a444d67..77b1430cee1 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -206,6 +206,9 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type)
       return ctx == NULL || ctx->Extensions.ARB_vertex_shader;
    case GL_GEOMETRY_SHADER_ARB:
       return ctx == NULL || _mesa_has_geometry_shaders(ctx);
+   case GL_TESS_CONTROL_SHADER:
+   case GL_TESS_EVALUATION_SHADER:
+      return ctx == NULL || _mesa_has_tessellation(ctx);
    case GL_COMPUTE_SHADER:
       return ctx == NULL || ctx->Extensions.ARB_compute_shader;
    default:
@@ -422,6 +425,8 @@ detach_shader(struct gl_context *ctx, GLuint program, GLuint shader)
          /* sanity check - make sure the new list's entries are sensible */
          for (j = 0; j < shProg->NumShaders; j++) {
             assert(shProg->Shaders[j]->Type == GL_VERTEX_SHADER ||
+                   shProg->Shaders[j]->Type == GL_TESS_CONTROL_SHADER ||
+                   shProg->Shaders[j]->Type == GL_TESS_EVALUATION_SHADER ||
                    shProg->Shaders[j]->Type == GL_GEOMETRY_SHADER ||
                    shProg->Shaders[j]->Type == GL_FRAGMENT_SHADER);
             assert(shProg->Shaders[j]->RefCount > 0);
@@ -1083,6 +1088,12 @@ print_shader_info(const struct gl_shader_program *shProg)
    if (shProg->_LinkedShaders[MESA_SHADER_GEOMETRY])
       printf("  geom prog %u\n",
 	     shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program->Id);
+   if (shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL])
+      printf("  tesc prog %u\n",
+	     shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program->Id);
+   if (shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL])
+      printf("  tese prog %u\n",
+	     shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program->Id);
 }
 
 
@@ -2035,6 +2046,21 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
    case MESA_SHADER_VERTEX:
       dst->UsesClipDistanceOut = src->Vert.UsesClipDistance;
       break;
+   case MESA_SHADER_TESS_CTRL: {
+      struct gl_tess_ctrl_program *dst_tcp =
+         (struct gl_tess_ctrl_program *) dst;
+      dst_tcp->VerticesOut = src->TessCtrl.VerticesOut;
+      break;
+   }
+   case MESA_SHADER_TESS_EVAL: {
+      struct gl_tess_eval_program *dst_tep =
+         (struct gl_tess_eval_program *) dst;
+      dst_tep->PrimitiveMode = src->TessEval.PrimitiveMode;
+      dst_tep->Spacing = src->TessEval.Spacing;
+      dst_tep->VertexOrder = src->TessEval.VertexOrder;
+      dst_tep->PointMode = src->TessEval.PointMode;
+      break;
+   }
    case MESA_SHADER_GEOMETRY: {
       struct gl_geometry_program *dst_gp = (struct gl_geometry_program *) dst;
       dst_gp->VerticesIn = src->Geom.VerticesIn;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index bede7fe1d0e..d3b1c72b08d 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -79,8 +79,8 @@ update_program_enables(struct gl_context *ctx)
 
 
 /**
- * Update the ctx->Vertex/Geometry/FragmentProgram._Current pointers to point
- * to the current/active programs.  Then call ctx->Driver.BindProgram() to
+ * Update the ctx->*Program._Current pointers to point to the
+ * current/active programs.  Then call ctx->Driver.BindProgram() to
  * tell the driver which programs to use.
  *
  * Programs may come from 3 sources: GLSL shaders, ARB/NV_vertex/fragment
@@ -97,6 +97,10 @@ update_program(struct gl_context *ctx)
 {
    const struct gl_shader_program *vsProg =
       ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+   const struct gl_shader_program *tcsProg =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+   const struct gl_shader_program *tesProg =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
    const struct gl_shader_program *gsProg =
       ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
    struct gl_shader_program *fsProg =
@@ -106,6 +110,8 @@ update_program(struct gl_context *ctx)
    const struct gl_vertex_program *prevVP = ctx->VertexProgram._Current;
    const struct gl_fragment_program *prevFP = ctx->FragmentProgram._Current;
    const struct gl_geometry_program *prevGP = ctx->GeometryProgram._Current;
+   const struct gl_tess_ctrl_program *prevTCP = ctx->TessCtrlProgram._Current;
+   const struct gl_tess_eval_program *prevTEP = ctx->TessEvalProgram._Current;
    const struct gl_compute_program *prevCP = ctx->ComputeProgram._Current;
    GLbitfield new_state = 0x0;
 
@@ -175,6 +181,30 @@ update_program(struct gl_context *ctx)
       _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
    }
 
+   if (tesProg && tesProg->LinkStatus
+       && tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]) {
+      /* Use GLSL tessellation evaluation shader */
+      _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current,
+         gl_tess_eval_program(
+            tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program));
+   }
+   else {
+      /* No tessellation evaluation program */
+      _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL);
+   }
+
+   if (tcsProg && tcsProg->LinkStatus
+       && tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
+      /* Use GLSL tessellation control shader */
+      _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current,
+         gl_tess_ctrl_program(
+            tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program));
+   }
+   else {
+      /* No tessellation control program */
+      _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL);
+   }
+
    /* Examine vertex program after fragment program as
     * _mesa_get_fixed_func_vertex_program() needs to know active
     * fragprog inputs.
@@ -230,6 +260,22 @@ update_program(struct gl_context *ctx)
       }
    }
 
+   if (ctx->TessEvalProgram._Current != prevTEP) {
+      new_state |= _NEW_PROGRAM;
+      if (ctx->Driver.BindProgram) {
+         ctx->Driver.BindProgram(ctx, GL_TESS_EVALUATION_PROGRAM_NV,
+                            (struct gl_program *) ctx->TessEvalProgram._Current);
+      }
+   }
+
+   if (ctx->TessCtrlProgram._Current != prevTCP) {
+      new_state |= _NEW_PROGRAM;
+      if (ctx->Driver.BindProgram) {
+         ctx->Driver.BindProgram(ctx, GL_TESS_CONTROL_PROGRAM_NV,
+                            (struct gl_program *) ctx->TessCtrlProgram._Current);
+      }
+   }
+
    if (ctx->VertexProgram._Current != prevVP) {
       new_state |= _NEW_PROGRAM;
       if (ctx->Driver.BindProgram) {
@@ -266,8 +312,8 @@ update_program_constants(struct gl_context *ctx)
       }
    }
 
-   /* Don't handle geometry shaders here. They don't use any state
-    * constants.
+   /* Don't handle tessellation and geometry shaders here. They don't use
+    * any state constants.
     */
 
    if (ctx->VertexProgram._Current) {

From 882413f1c5926503550a42554a83f57f85fec82d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 14 Jun 2015 01:21:02 +0200
Subject: [PATCH 0500/1208] mesa: add program interface queries for
 tessellation shaders

Based on a patch by Chris Forbes <chrisf@ijw.co.nz>.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/shader_query.cpp |  8 ++++++--
 src/mesa/main/uniforms.c       | 21 +++++++++++++++------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index c8d3687faa6..c58d3474df7 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -827,6 +827,10 @@ stage_from_enum(GLenum ref)
    switch (ref) {
    case GL_REFERENCED_BY_VERTEX_SHADER:
       return MESA_SHADER_VERTEX;
+   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
@@ -1012,6 +1016,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          goto invalid_enum;
       /* fallthrough */
    case GL_REFERENCED_BY_VERTEX_SHADER:
+   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
+   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
       switch (res->Type) {
@@ -1045,8 +1051,6 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
    /* GL_ARB_tessellation_shader */
    case GL_IS_PER_PATCH:
-   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
-   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
    default:
       goto invalid_enum;
    }
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index a577a2569f2..6ba746e2f58 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1092,6 +1092,21 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type,
                                   GL_REFERENCED_BY_VERTEX_SHADER, params,
                                   caller);
       return;
+
+   case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_CONTROL_SHADER:
+   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER:
+      _mesa_program_resource_prop(shProg, res, index,
+                                  GL_REFERENCED_BY_TESS_CONTROL_SHADER, params,
+                                  caller);
+      return;
+
+   case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_EVALUATION_SHADER:
+   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER:
+      _mesa_program_resource_prop(shProg, res, index,
+                                  GL_REFERENCED_BY_TESS_EVALUATION_SHADER, params,
+                                  caller);
+      return;
+
    case GL_UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_GEOMETRY_SHADER:
       _mesa_program_resource_prop(shProg, res, index,
@@ -1104,12 +1119,6 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type,
                                   GL_REFERENCED_BY_FRAGMENT_SHADER, params,
                                   caller);
       return;
-   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER:
-      params[0] = GL_FALSE;
-      return;
-   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER:
-      params[0] = GL_FALSE;
-      return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(pname 0x%x (%s))", caller, pname,

From 3d528e7c476f25f24bca35d09d1f4c2b00123234 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 22 Apr 2015 23:05:34 +0200
Subject: [PATCH 0501/1208] mesa: handle tessellation shaders in
 use_shader_program

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/shaderapi.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 77b1430cee1..40b4d417c22 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1139,11 +1139,9 @@ use_shader_program(struct gl_context *ctx, gl_shader_stage stage,
        */
       switch (stage) {
       case MESA_SHADER_VERTEX:
-	 /* Empty for now. */
-	 break;
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY:
-	 /* Empty for now. */
-	 break;
       case MESA_SHADER_COMPUTE:
          /* Empty for now. */
          break;

From 206af9d049cab6e794db5abf63e3d11281343423 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 28 May 2015 22:08:55 +0200
Subject: [PATCH 0502/1208] mesa: don't allow drawing with tess ctrl shader and
 without tess eval shader

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/api_validate.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index 2e211c7f3a9..53c8fb893b5 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -69,6 +69,25 @@ check_valid_to_render(struct gl_context *ctx, const char *function)
          return false;
       }
 
+      /* The spec argues that this is allowed because a tess ctrl shader
+       * without a tess eval shader can be used with transform feedback.
+       * However, glBeginTransformFeedback doesn't allow GL_PATCHES and
+       * therefore doesn't allow tessellation.
+       *
+       * Further investigation showed that this is indeed a spec bug and
+       * a tess ctrl shader without a tess eval shader shouldn't have been
+       * allowed, because there is no API in GL 4.0 that can make use this
+       * to produce something useful.
+       *
+       * Also, all vendors except one don't support a tess ctrl shader without
+       * a tess eval shader anyway.
+       */
+      if (ctx->TessCtrlProgram._Current && !ctx->TessEvalProgram._Current) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(tess eval shader is missing)", function);
+         return false;
+      }
+
       /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec
        * says:
        *

From 497eb295838baccde1420adfcc4ef7e8fdddd774 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Thu, 20 Mar 2014 22:44:43 +0100
Subject: [PATCH 0503/1208] glsl: add tessellation shader parsing support (v2)

v2: Fixed things that Ken suggested.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast.h                  |  54 ++++++++++-
 src/glsl/ast_to_hir.cpp         | 166 +++++++++++++++++++++++++-------
 src/glsl/ast_type.cpp           | 112 ++++++++++++++++++++-
 src/glsl/glsl_parser.yy         | 122 +++++++++++++++++++++--
 src/glsl/glsl_parser_extras.cpp |  39 +++++++-
 src/glsl/glsl_parser_extras.h   |  31 +++++-
 6 files changed, 476 insertions(+), 48 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index 49212298e16..e185ed11e75 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -515,6 +515,17 @@ struct ast_type_qualifier {
          unsigned stream:1; /**< Has stream value assigned  */
          unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */
          /** \} */
+
+	 /** \name Layout qualifiers for GL_ARB_tessellation_shader */
+	 /** \{ */
+	 /* tess eval input layout */
+	 /* gs prim_type reused for primitive mode */
+	 unsigned vertex_spacing:1;
+	 unsigned ordering:1;
+	 unsigned point_mode:1;
+	 /* tess control output layout */
+	 unsigned vertices:1;
+	 /** \} */
       }
       /** \brief Set of flags, accessed by name. */
       q;
@@ -550,7 +561,10 @@ struct ast_type_qualifier {
    /** Stream in GLSL 1.50 geometry shaders. */
    unsigned stream;
 
-   /** Input or output primitive type in GLSL 1.50 geometry shaders */
+   /**
+    * Input or output primitive type in GLSL 1.50 geometry shaders
+    * and tessellation shaders.
+    */
    GLenum prim_type;
 
    /**
@@ -577,6 +591,18 @@ struct ast_type_qualifier {
     */
    int local_size[3];
 
+   /** Tessellation evaluation shader: vertex spacing (equal, fractional even/odd) */
+   GLenum vertex_spacing;
+
+   /** Tessellation evaluation shader: vertex ordering (CW or CCW) */
+   GLenum ordering;
+
+   /** Tessellation evaluation shader: point mode */
+   bool point_mode;
+
+   /** Tessellation control shader: number of output vertices */
+   int vertices;
+
    /**
     * Image format specified with an ARB_shader_image_load_store
     * layout qualifier.
@@ -632,6 +658,11 @@ struct ast_type_qualifier {
 			_mesa_glsl_parse_state *state,
 			ast_type_qualifier q);
 
+   bool merge_out_qualifier(YYLTYPE *loc,
+                           _mesa_glsl_parse_state *state,
+                           ast_type_qualifier q,
+                           ast_node* &node);
+
    bool merge_in_qualifier(YYLTYPE *loc,
                            _mesa_glsl_parse_state *state,
                            ast_type_qualifier q,
@@ -1031,6 +1062,27 @@ public:
 };
 
 
+/**
+ * AST node representing a declaration of the output layout for tessellation
+ * control shaders.
+ */
+class ast_tcs_output_layout : public ast_node
+{
+public:
+   ast_tcs_output_layout(const struct YYLTYPE &locp, int vertices)
+      : vertices(vertices)
+   {
+      set_location(locp);
+   }
+
+   virtual ir_rvalue *hir(exec_list *instructions,
+                          struct _mesa_glsl_parse_state *state);
+
+private:
+   const int vertices;
+};
+
+
 /**
  * AST node representing a declaration of the input layout for geometry
  * shaders.
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index b5c4ed9c667..391bee0b53f 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -79,6 +79,7 @@ _mesa_ast_to_hir(exec_list *instructions, struct _mesa_glsl_parse_state *state)
    state->toplevel_ir = instructions;
 
    state->gs_input_prim_type_specified = false;
+   state->tcs_output_vertices_specified = false;
    state->cs_input_local_size_specified = false;
 
    /* Section 4.2 of the GLSL 1.20 specification states:
@@ -2231,6 +2232,8 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
     *                     input            output
     *                     -----            ------
     * vertex              explicit_loc     sso
+    * tess control        sso              sso
+    * tess eval           sso              sso
     * geometry            sso              sso
     * fragment            sso              explicit_loc
     */
@@ -2253,6 +2256,8 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
       fail = true;
       break;
 
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       if (var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out) {
          if (!state->check_separate_shader_objects_allowed(loc, var))
@@ -2312,6 +2317,8 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
                : (qual->location + VARYING_SLOT_VAR0);
             break;
 
+         case MESA_SHADER_TESS_CTRL:
+         case MESA_SHADER_TESS_EVAL:
          case MESA_SHADER_GEOMETRY:
             var->data.location = qual->location + VARYING_SLOT_VAR0;
             break;
@@ -2592,7 +2599,9 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       case MESA_SHADER_VERTEX:
          if (var->data.mode == ir_var_shader_out)
             var->data.invariant = true;
-	      break;
+         break;
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY:
          if ((var->data.mode == ir_var_shader_in)
              || (var->data.mode == ir_var_shader_out))
@@ -3132,30 +3141,13 @@ process_initializer(ir_variable *var, ast_declaration *decl,
    return result;
 }
 
-
-/**
- * Do additional processing necessary for geometry shader input declarations
- * (this covers both interface blocks arrays and bare input variables).
- */
 static void
-handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
-                                  YYLTYPE loc, ir_variable *var)
+validate_layout_qualifier_vertex_count(struct _mesa_glsl_parse_state *state,
+                                       YYLTYPE loc, ir_variable *var,
+                                       unsigned num_vertices,
+                                       unsigned *size,
+                                       const char *var_category)
 {
-   unsigned num_vertices = 0;
-   if (state->gs_input_prim_type_specified) {
-      num_vertices = vertices_per_prim(state->in_qualifier->prim_type);
-   }
-
-   /* Geometry shader input variables must be arrays.  Caller should have
-    * reported an error for this.
-    */
-   if (!var->type->is_array()) {
-      assert(state->error);
-
-      /* To avoid cascading failures, short circuit the checks below. */
-      return;
-   }
-
    if (var->type->is_unsized_array()) {
       /* Section 4.3.8.1 (Input Layout Qualifiers) of the GLSL 1.50 spec says:
        *
@@ -3165,6 +3157,8 @@ handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
        *
        * Followed by a table mapping each allowed input layout qualifier to
        * the corresponding input length.
+       *
+       * Similarly for tessellation control shader outputs.
        */
       if (num_vertices != 0)
          var->type = glsl_type::get_array_instance(var->type->fields.array,
@@ -3191,22 +3185,63 @@ handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
        */
       if (num_vertices != 0 && var->type->length != num_vertices) {
          _mesa_glsl_error(&loc, state,
-                          "geometry shader input size contradicts previously"
-                          " declared layout (size is %u, but layout requires a"
-                          " size of %u)", var->type->length, num_vertices);
-      } else if (state->gs_input_size != 0 &&
-                 var->type->length != state->gs_input_size) {
+                          "%s size contradicts previously declared layout "
+                          "(size is %u, but layout requires a size of %u)",
+                          var_category, var->type->length, num_vertices);
+      } else if (*size != 0 && var->type->length != *size) {
          _mesa_glsl_error(&loc, state,
-                          "geometry shader input sizes are "
-                          "inconsistent (size is %u, but a previous "
-                          "declaration has size %u)",
-                          var->type->length, state->gs_input_size);
+                          "%s sizes are inconsistent (size is %u, but a "
+                          "previous declaration has size %u)",
+                          var_category, var->type->length, *size);
       } else {
-         state->gs_input_size = var->type->length;
+         *size = var->type->length;
       }
    }
 }
 
+static void
+handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
+                                    YYLTYPE loc, ir_variable *var)
+{
+   unsigned num_vertices = 0;
+
+   if (state->tcs_output_vertices_specified) {
+      num_vertices = state->out_qualifier->vertices;
+   }
+
+   validate_layout_qualifier_vertex_count(state, loc, var, num_vertices,
+                                          &state->tcs_output_size,
+                                          "geometry shader input");
+}
+
+/**
+ * Do additional processing necessary for geometry shader input declarations
+ * (this covers both interface blocks arrays and bare input variables).
+ */
+static void
+handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
+                                  YYLTYPE loc, ir_variable *var)
+{
+   unsigned num_vertices = 0;
+
+   if (state->gs_input_prim_type_specified) {
+      num_vertices = vertices_per_prim(state->in_qualifier->prim_type);
+   }
+
+   /* Geometry shader input variables must be arrays.  Caller should have
+    * reported an error for this.
+    */
+   if (!var->type->is_array()) {
+      assert(state->error);
+
+      /* To avoid cascading failures, short circuit the checks below. */
+      return;
+   }
+
+   validate_layout_qualifier_vertex_count(state, loc, var, num_vertices,
+                                          &state->gs_input_size,
+                                          "geometry shader input");
+}
 
 void
 validate_identifier(const char *identifier, YYLTYPE loc,
@@ -3796,6 +3831,10 @@ ast_declarator_list::hir(exec_list *instructions,
                }
             }
          }
+
+         if (state->stage == MESA_SHADER_TESS_CTRL) {
+            handle_tess_ctrl_shader_output_decl(state, loc, var);
+         }
       }
 
       /* Integer fragment inputs must be qualified with 'flat'.  In GLSL ES,
@@ -6058,6 +6097,67 @@ ast_interface_block::hir(exec_list *instructions,
 }
 
 
+ir_rvalue *
+ast_tcs_output_layout::hir(exec_list *instructions,
+			  struct _mesa_glsl_parse_state *state)
+{
+   YYLTYPE loc = this->get_location();
+
+   /* If any tessellation control output layout declaration preceded this
+    * one, make sure it was consistent with this one.
+    */
+   if (state->tcs_output_vertices_specified &&
+       state->out_qualifier->vertices != this->vertices) {
+      _mesa_glsl_error(&loc, state,
+		       "tessellation control shader output layout does not "
+		       "match previous declaration");
+      return NULL;
+   }
+
+   /* If any shader outputs occurred before this declaration and specified an
+    * array size, make sure the size they specified is consistent with the
+    * primitive type.
+    */
+   unsigned num_vertices = this->vertices;
+   if (state->tcs_output_size != 0 && state->tcs_output_size != num_vertices) {
+      _mesa_glsl_error(&loc, state,
+		       "this tessellation control shader output layout "
+		       "specifies %u vertices, but a previous output "
+		       "is declared with size %u",
+		       num_vertices, state->tcs_output_size);
+      return NULL;
+   }
+
+   state->tcs_output_vertices_specified = true;
+
+   /* If any shader outputs occurred before this declaration and did not
+    * specify an array size, their size is determined now.
+    */
+   foreach_in_list (ir_instruction, node, instructions) {
+      ir_variable *var = node->as_variable();
+      if (var == NULL || var->data.mode != ir_var_shader_out)
+	 continue;
+
+      /* Note: Not all tessellation control shader output are arrays. */
+      if (!var->type->is_unsized_array())
+	 continue;
+
+      if (var->data.max_array_access >= num_vertices) {
+	 _mesa_glsl_error(&loc, state,
+			  "this tessellation control shader output layout "
+			  "specifies %u vertices, but an access to element "
+			  "%u of output `%s' already exists", num_vertices,
+			  var->data.max_array_access, var->name);
+      } else {
+	 var->type = glsl_type::get_array_instance(var->type->fields.array,
+						   num_vertices);
+      }
+   }
+
+   return NULL;
+}
+
+
 ir_rvalue *
 ast_gs_input_layout::hir(exec_list *instructions,
                          struct _mesa_glsl_parse_state *state)
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index fa4806af1c0..f5c08d81e5f 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -212,6 +212,44 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       }
    }
 
+   if (q.flags.q.vertices) {
+      if (this->flags.q.vertices && this->vertices != q.vertices) {
+	 _mesa_glsl_error(loc, state,
+			  "tessellation control shader set conflicting "
+			  "vertices (%d and %d)",
+			  this->vertices, q.vertices);
+	 return false;
+      }
+      this->vertices = q.vertices;
+   }
+
+   if (q.flags.q.vertex_spacing) {
+      if (this->flags.q.vertex_spacing && this->vertex_spacing != q.vertex_spacing) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting vertex spacing used");
+	 return false;
+      }
+      this->vertex_spacing = q.vertex_spacing;
+   }
+
+   if (q.flags.q.ordering) {
+      if (this->flags.q.ordering && this->ordering != q.ordering) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting ordering used");
+	 return false;
+      }
+      this->ordering = q.ordering;
+   }
+
+   if (q.flags.q.point_mode) {
+      if (this->flags.q.point_mode && this->point_mode != q.point_mode) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting point mode used");
+	 return false;
+      }
+      this->point_mode = q.point_mode;
+   }
+
    if ((q.flags.i & ubo_mat_mask.flags.i) != 0)
       this->flags.i &= ~ubo_mat_mask.flags.i;
    if ((q.flags.i & ubo_layout_mask.flags.i) != 0)
@@ -256,6 +294,22 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    return true;
 }
 
+bool
+ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
+                                        _mesa_glsl_parse_state *state,
+                                        ast_type_qualifier q,
+                                        ast_node* &node)
+{
+   void *mem_ctx = state;
+   const bool r = this->merge_qualifier(loc, state, q);
+
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      node = new(mem_ctx) ast_tcs_output_layout(*loc, q.vertices);
+   }
+
+   return r;
+}
+
 bool
 ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
                                        _mesa_glsl_parse_state *state,
@@ -269,6 +323,27 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
    valid_in_mask.flags.i = 0;
 
    switch (state->stage) {
+   case MESA_SHADER_TESS_EVAL:
+      if (q.flags.q.prim_type) {
+         /* Make sure this is a valid input primitive type. */
+         switch (q.prim_type) {
+         case GL_TRIANGLES:
+         case GL_QUADS:
+         case GL_ISOLINES:
+            break;
+         default:
+            _mesa_glsl_error(loc, state,
+                             "invalid tessellation evaluation "
+                             "shader input primitive type");
+            break;
+         }
+      }
+
+      valid_in_mask.flags.q.prim_type = 1;
+      valid_in_mask.flags.q.vertex_spacing = 1;
+      valid_in_mask.flags.q.ordering = 1;
+      valid_in_mask.flags.q.point_mode = 1;
+      break;
    case MESA_SHADER_GEOMETRY:
       if (q.flags.q.prim_type) {
          /* Make sure this is a valid input primitive type. */
@@ -324,7 +399,9 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
       if (q.flags.q.prim_type &&
           this->prim_type != q.prim_type) {
          _mesa_glsl_error(loc, state,
-                          "conflicting input primitive types specified");
+                          "conflicting input primitive %s specified",
+                          state->stage == MESA_SHADER_GEOMETRY ?
+                          "type" : "mode");
       }
    } else if (q.flags.q.prim_type) {
       state->in_qualifier->flags.q.prim_type = 1;
@@ -346,6 +423,39 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
       state->fs_early_fragment_tests = true;
    }
 
+   if (this->flags.q.vertex_spacing) {
+      if (q.flags.q.vertex_spacing &&
+          this->vertex_spacing != q.vertex_spacing) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting vertex spacing specified");
+      }
+   } else if (q.flags.q.vertex_spacing) {
+      this->flags.q.vertex_spacing = 1;
+      this->vertex_spacing = q.vertex_spacing;
+   }
+
+   if (this->flags.q.ordering) {
+      if (q.flags.q.ordering &&
+          this->ordering != q.ordering) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting ordering specified");
+      }
+   } else if (q.flags.q.ordering) {
+      this->flags.q.ordering = 1;
+      this->ordering = q.ordering;
+   }
+
+   if (this->flags.q.point_mode) {
+      if (q.flags.q.point_mode &&
+          this->point_mode != q.point_mode) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting point mode specified");
+      }
+   } else if (q.flags.q.point_mode) {
+      this->flags.q.point_mode = 1;
+      this->point_mode = q.point_mode;
+   }
+
    if (create_gs_ast) {
       node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
    } else if (create_cs_ast) {
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 37c44014ba7..494907703ba 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1385,6 +1385,89 @@ layout_qualifier_id:
          }
       }
 
+      /* Layout qualifiers for tessellation evaluation shaders. */
+      if (!$$.flags.i) {
+         struct {
+            const char *s;
+            GLenum e;
+         } map[] = {
+                 /* triangles already parsed by gs-specific code */
+                 { "quads", GL_QUADS },
+                 { "isolines", GL_ISOLINES },
+         };
+         for (unsigned i = 0; i < ARRAY_SIZE(map); i++) {
+            if (match_layout_qualifier($1, map[i].s, state) == 0) {
+               $$.flags.q.prim_type = 1;
+               $$.prim_type = map[i].e;
+               break;
+            }
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "primitive mode qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         struct {
+            const char *s;
+            GLenum e;
+         } map[] = {
+                 { "equal_spacing", GL_EQUAL },
+                 { "fractional_odd_spacing", GL_FRACTIONAL_ODD },
+                 { "fractional_even_spacing", GL_FRACTIONAL_EVEN },
+         };
+         for (unsigned i = 0; i < ARRAY_SIZE(map); i++) {
+            if (match_layout_qualifier($1, map[i].s, state) == 0) {
+               $$.flags.q.vertex_spacing = 1;
+               $$.vertex_spacing = map[i].e;
+               break;
+            }
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "vertex spacing qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         if (match_layout_qualifier($1, "cw", state) == 0) {
+            $$.flags.q.ordering = 1;
+            $$.ordering = GL_CW;
+         } else if (match_layout_qualifier($1, "ccw", state) == 0) {
+            $$.flags.q.ordering = 1;
+            $$.ordering = GL_CCW;
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "ordering qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         if (match_layout_qualifier($1, "point_mode", state) == 0) {
+            $$.flags.q.point_mode = 1;
+            $$.point_mode = true;
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "qualifier `point_mode' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader");
+         }
+      }
+
       if (!$$.flags.i) {
          _mesa_glsl_error(& @1, state, "unrecognized layout identifier "
                           "`%s'", $1);
@@ -1522,6 +1605,30 @@ layout_qualifier_id:
          }
       }
 
+      /* Layout qualifiers for tessellation control shaders. */
+      if (match_layout_qualifier("vertices", $1, state) == 0) {
+         $$.flags.q.vertices = 1;
+
+         if ($3 <= 0) {
+            _mesa_glsl_error(& @3, state,
+                             "invalid vertices (%d) specified", $3);
+            YYERROR;
+         } else if ($3 > (int)state->Const.MaxPatchVertices) {
+            _mesa_glsl_error(& @3, state,
+                             "vertices (%d) exceeds "
+                             "GL_MAX_PATCH_VERTICES", $3);
+            YYERROR;
+         } else {
+            $$.vertices = $3;
+            if (!state->ARB_tessellation_shader_enable &&
+                !state->is_version(400, 0)) {
+               _mesa_glsl_error(& @1, state,
+                                "vertices qualifier requires GLSL 4.00 or "
+                                "ARB_tessellation_shader");
+            }
+         }
+      }
+
       /* If the identifier didn't match any known layout identifiers,
        * emit an error.
        */
@@ -2743,11 +2850,8 @@ layout_defaults:
 
    | layout_qualifier OUT_TOK ';'
    {
-      if (state->stage != MESA_SHADER_GEOMETRY) {
-         _mesa_glsl_error(& @1, state,
-                          "out layout qualifiers only valid in "
-                          "geometry shaders");
-      } else {
+      $$ = NULL;
+      if (state->stage == MESA_SHADER_GEOMETRY) {
          if ($1.flags.q.prim_type) {
             /* Make sure this is a valid output primitive type. */
             switch ($1.prim_type) {
@@ -2766,6 +2870,12 @@ layout_defaults:
 
          /* Allow future assigments of global out's stream id value */
          state->out_qualifier->flags.q.explicit_stream = 0;
+      } else if (state->stage == MESA_SHADER_TESS_CTRL) {
+         if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
+            YYERROR;
+      } else {
+         _mesa_glsl_error(& @1, state,
+                          "out layout qualifiers only valid in "
+                          "tessellation control or geometry shaders");
       }
-      $$ = NULL;
    }
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 5412f0b7887..3bb5c4db2f7 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -145,6 +145,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    /* ARB_viewport_array */
    this->Const.MaxViewports = ctx->Const.MaxViewports;
 
+   /* tessellation shader constants */
+   this->Const.MaxPatchVertices = ctx->Const.MaxPatchVertices;
+
    this->current_function = NULL;
    this->toplevel_ir = NULL;
    this->found_return = false;
@@ -224,6 +227,7 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers = false;
 
    this->gs_input_prim_type_specified = false;
+   this->tcs_output_vertices_specified = false;
    this->gs_input_size = 0;
    this->in_qualifier = new(this) ast_type_qualifier();
    this->out_qualifier = new(this) ast_type_qualifier();
@@ -389,6 +393,8 @@ _mesa_shader_stage_to_string(unsigned stage)
    case MESA_SHADER_FRAGMENT: return "fragment";
    case MESA_SHADER_GEOMETRY: return "geometry";
    case MESA_SHADER_COMPUTE:  return "compute";
+   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
+   case MESA_SHADER_TESS_EVAL: return "tess eval";
    }
 
    unreachable("Unknown shader stage.");
@@ -406,6 +412,8 @@ _mesa_shader_stage_to_abbrev(unsigned stage)
    case MESA_SHADER_FRAGMENT: return "FS";
    case MESA_SHADER_GEOMETRY: return "GS";
    case MESA_SHADER_COMPUTE:  return "CS";
+   case MESA_SHADER_TESS_CTRL: return "TCS";
+   case MESA_SHADER_TESS_EVAL: return "TES";
    }
 
    unreachable("Unknown shader stage.");
@@ -574,6 +582,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_shader_texture_lod,           true,  false,     ARB_shader_texture_lod),
    EXT(ARB_shading_language_420pack,     true,  false,     ARB_shading_language_420pack),
    EXT(ARB_shading_language_packing,     true,  false,     ARB_shading_language_packing),
+   EXT(ARB_tessellation_shader,          true,  false,     ARB_tessellation_shader),
    EXT(ARB_texture_cube_map_array,       true,  false,     ARB_texture_cube_map_array),
    EXT(ARB_texture_gather,               true,  false,     ARB_texture_gather),
    EXT(ARB_texture_multisample,          true,  false,     ARB_texture_multisample),
@@ -1420,8 +1429,12 @@ static void
 set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
 {
-   if (shader->Stage != MESA_SHADER_GEOMETRY) {
-      /* Should have been prevented by the parser. */
+   /* Should have been prevented by the parser. */
+   if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+      assert(!state->in_qualifier->flags.i);
+   } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
+      assert(!state->out_qualifier->flags.i);
+   } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
       assert(!state->in_qualifier->flags.i);
       assert(!state->out_qualifier->flags.i);
    }
@@ -1441,6 +1454,28 @@ set_shader_inout_layout(struct gl_shader *shader,
    }
 
    switch (shader->Stage) {
+   case MESA_SHADER_TESS_CTRL:
+      shader->TessCtrl.VerticesOut = 0;
+      if (state->tcs_output_vertices_specified)
+         shader->TessCtrl.VerticesOut = state->out_qualifier->vertices;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      shader->TessEval.PrimitiveMode = PRIM_UNKNOWN;
+      if (state->in_qualifier->flags.q.prim_type)
+         shader->TessEval.PrimitiveMode = state->in_qualifier->prim_type;
+
+      shader->TessEval.Spacing = 0;
+      if (state->in_qualifier->flags.q.vertex_spacing)
+         shader->TessEval.Spacing = state->in_qualifier->vertex_spacing;
+
+      shader->TessEval.VertexOrder = 0;
+      if (state->in_qualifier->flags.q.ordering)
+         shader->TessEval.VertexOrder = state->in_qualifier->ordering;
+
+      shader->TessEval.PointMode = -1;
+      if (state->in_qualifier->flags.q.point_mode)
+         shader->TessEval.PointMode = state->in_qualifier->point_mode;
+      break;
    case MESA_SHADER_GEOMETRY:
       shader->Geom.VerticesOut = 0;
       if (state->out_qualifier->flags.q.max_vertices)
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 4996b845044..de8f8c417fc 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -277,15 +277,19 @@ struct _mesa_glsl_parse_state {
    bool fs_redeclares_gl_fragcoord_with_no_layout_qualifiers;
 
    /**
-    * True if a geometry shader input primitive type was specified using a
-    * layout directive.
+    * True if a geometry shader input primitive type or tessellation control
+    * output vertices were specified using a layout directive.
     *
-    * Note: this value is computed at ast_to_hir time rather than at parse
+    * Note: these values are computed at ast_to_hir time rather than at parse
     * time.
     */
    bool gs_input_prim_type_specified;
+   bool tcs_output_vertices_specified;
 
-   /** Input layout qualifiers from GLSL 1.50. (geometry shader controls)*/
+   /**
+    * Input layout qualifiers from GLSL 1.50 (geometry shader controls),
+    * and GLSL 4.00 (tessellation evaluation shader)
+    */
    struct ast_type_qualifier *in_qualifier;
 
    /**
@@ -303,7 +307,10 @@ struct _mesa_glsl_parse_state {
     */
    unsigned cs_input_local_size[3];
 
-   /** Output layout qualifiers from GLSL 1.50. (geometry shader controls)*/
+   /**
+    * Output layout qualifiers from GLSL 1.50 (geometry shader controls),
+    * and GLSL 4.00 (tessellation control shader).
+    */
    struct ast_type_qualifier *out_qualifier;
 
    /**
@@ -383,6 +390,9 @@ struct _mesa_glsl_parse_state {
 
       /* ARB_viewport_array */
       unsigned MaxViewports;
+
+      /* ARB_tessellation_shader */
+      unsigned MaxPatchVertices;
    } Const;
 
    /**
@@ -475,6 +485,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shading_language_420pack_warn;
    bool ARB_shading_language_packing_enable;
    bool ARB_shading_language_packing_warn;
+   bool ARB_tessellation_shader_enable;
+   bool ARB_tessellation_shader_warn;
    bool ARB_texture_cube_map_array_enable;
    bool ARB_texture_cube_map_array_warn;
    bool ARB_texture_gather_enable;
@@ -545,6 +557,15 @@ struct _mesa_glsl_parse_state {
 
    bool fs_early_fragment_tests;
 
+   /**
+    * For tessellation control shaders, size of the most recently seen output
+    * declaration that was a sized array, or 0 if no sized output array
+    * declarations have been seen.
+    *
+    * Unused for other shader types.
+    */
+   unsigned tcs_output_size;
+
    /** Atomic counter offsets by binding */
    unsigned atomic_counter_offsets[MAX_COMBINED_ATOMIC_BUFFERS];
 

From 1036b024d4c8ce2376ac41219dfda01d5a59b3ef Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Thu, 20 Mar 2014 22:41:40 +0100
Subject: [PATCH 0504/1208] glsl: add tessellation shader defines and built-in
 variables.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/builtin_variables.cpp | 72 +++++++++++++++++++++++++++++++++-
 src/glsl/glcpp/glcpp-parse.y   |  3 ++
 src/glsl/shader_enums.h        | 13 +++++-
 src/mesa/main/mtypes.h         |  6 +++
 src/mesa/program/prog_print.c  |  4 ++
 5 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index aba1750c09d..c094906ba69 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -343,6 +343,8 @@ public:
    void generate_constants();
    void generate_uniforms();
    void generate_vs_special_vars();
+   void generate_tcs_special_vars();
+   void generate_tes_special_vars();
    void generate_gs_special_vars();
    void generate_fs_special_vars();
    void generate_cs_special_vars();
@@ -871,6 +873,39 @@ builtin_variable_generator::generate_vs_special_vars()
 }
 
 
+/**
+ * Generate variables which only exist in tessellation control shaders.
+ */
+void
+builtin_variable_generator::generate_tcs_special_vars()
+{
+   add_system_value(SYSTEM_VALUE_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
+   add_system_value(SYSTEM_VALUE_VERTICES_IN, int_t, "gl_PatchVerticesIn");
+   add_system_value(SYSTEM_VALUE_INVOCATION_ID, int_t, "gl_InvocationID");
+
+   add_output(VARYING_SLOT_TESS_LEVEL_OUTER,
+                    array(float_t, 4), "gl_TessLevelOuter");
+   add_output(VARYING_SLOT_TESS_LEVEL_INNER,
+                    array(float_t, 2), "gl_TessLevelInner");
+}
+
+
+/**
+ * Generate variables which only exist in tessellation evaluation shaders.
+ */
+void
+builtin_variable_generator::generate_tes_special_vars()
+{
+   add_system_value(SYSTEM_VALUE_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
+   add_system_value(SYSTEM_VALUE_VERTICES_IN, int_t, "gl_PatchVerticesIn");
+   add_system_value(SYSTEM_VALUE_TESS_COORD, vec3_t, "gl_TessCoord");
+   add_system_value(SYSTEM_VALUE_TESS_LEVEL_OUTER, array(float_t, 4),
+                    "gl_TessLevelOuter");
+   add_system_value(SYSTEM_VALUE_TESS_LEVEL_INNER, array(float_t, 2),
+                    "gl_TessLevelInner");
+}
+
+
 /**
  * Generate variables which only exist in geometry shaders.
  */
@@ -994,6 +1029,8 @@ builtin_variable_generator::add_varying(int slot, const glsl_type *type,
                                         const char *name_as_gs_input)
 {
    switch (state->stage) {
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       this->per_vertex_in.add_field(slot, type, name);
       /* FALLTHROUGH */
@@ -1046,13 +1083,40 @@ builtin_variable_generator::generate_varyings()
       }
    }
 
+   /* Section 7.1 (Built-In Language Variables) of the GLSL 4.00 spec
+    * says:
+    *
+    *    "In the tessellation control language, built-in variables are
+    *    intrinsically declared as:
+    *
+    *        in gl_PerVertex {
+    *            vec4 gl_Position;
+    *            float gl_PointSize;
+    *            float gl_ClipDistance[];
+    *        } gl_in[gl_MaxPatchVertices];"
+    */
+   if (state->stage == MESA_SHADER_TESS_CTRL ||
+       state->stage == MESA_SHADER_TESS_EVAL) {
+      const glsl_type *per_vertex_in_type =
+         this->per_vertex_in.construct_interface_instance();
+      add_variable("gl_in", array(per_vertex_in_type, state->Const.MaxPatchVertices),
+                   ir_var_shader_in, -1);
+   }
    if (state->stage == MESA_SHADER_GEOMETRY) {
       const glsl_type *per_vertex_in_type =
          this->per_vertex_in.construct_interface_instance();
       add_variable("gl_in", array(per_vertex_in_type, 0),
                    ir_var_shader_in, -1);
    }
-   if (state->stage == MESA_SHADER_VERTEX || state->stage == MESA_SHADER_GEOMETRY) {
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      const glsl_type *per_vertex_out_type =
+         this->per_vertex_out.construct_interface_instance();
+      add_variable("gl_out", array(per_vertex_out_type, 0),
+                   ir_var_shader_out, -1);
+   }
+   if (state->stage == MESA_SHADER_VERTEX ||
+       state->stage == MESA_SHADER_TESS_EVAL ||
+       state->stage == MESA_SHADER_GEOMETRY) {
       const glsl_type *per_vertex_out_type =
          this->per_vertex_out.construct_interface_instance();
       const glsl_struct_field *fields = per_vertex_out_type->fields.structure;
@@ -1087,6 +1151,12 @@ _mesa_glsl_initialize_variables(exec_list *instructions,
    case MESA_SHADER_VERTEX:
       gen.generate_vs_special_vars();
       break;
+   case MESA_SHADER_TESS_CTRL:
+      gen.generate_tcs_special_vars();
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      gen.generate_tes_special_vars();
+      break;
    case MESA_SHADER_GEOMETRY:
       gen.generate_gs_special_vars();
       break;
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index ed1bffbde3c..4236fb319d3 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2486,6 +2486,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
 	      if (extensions->ARB_shader_storage_buffer_object)
 	         add_builtin_define(parser, "GL_ARB_shader_storage_buffer_object", 1);
+
+	      if (extensions->ARB_tessellation_shader)
+	         add_builtin_define(parser, "GL_ARB_tessellation_shader", 1);
 	   }
 	}
 
diff --git a/src/glsl/shader_enums.h b/src/glsl/shader_enums.h
index 42a30ae8fb9..3c3941645f0 100644
--- a/src/glsl/shader_enums.h
+++ b/src/glsl/shader_enums.h
@@ -152,7 +152,7 @@ typedef enum
     * \name Geometry shader system values
     */
    /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,
+   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
    /*@}*/
 
    /**
@@ -165,6 +165,17 @@ typedef enum
    SYSTEM_VALUE_SAMPLE_MASK_IN,
    /*@}*/
 
+   /**
+    * \name Tessellation Evaluation shader system values
+    */
+   /*@{*/
+   SYSTEM_VALUE_TESS_COORD,
+   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
+   SYSTEM_VALUE_PRIMITIVE_ID,   /**< (currently not used by GS) */
+   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
+   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
+   /*@}*/
+
    SYSTEM_VALUE_MAX             /**< Number of values */
 } gl_system_value;
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 12390c430e6..3244a95c32b 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -240,6 +240,8 @@ typedef enum
    VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
    VARYING_SLOT_FACE, /* FS only */
    VARYING_SLOT_PNTC, /* FS only */
+   VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
+   VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
    VARYING_SLOT_VAR0, /* First generic varying slot */
    VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING
 } gl_varying_slot;
@@ -276,6 +278,8 @@ typedef enum
 #define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
 #define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
 #define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
+#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
+#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
 #define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
 /*@}*/
 
@@ -292,6 +296,8 @@ _mesa_varying_slot_in_fs(gl_varying_slot slot)
    case VARYING_SLOT_EDGE:
    case VARYING_SLOT_CLIP_VERTEX:
    case VARYING_SLOT_LAYER:
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+   case VARYING_SLOT_TESS_LEVEL_INNER:
       return GL_FALSE;
    default:
       return GL_TRUE;
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index 336a5ef4448..bb7c2c6e527 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -147,6 +147,8 @@ arb_input_attrib_string(GLuint index, GLenum progType)
       "fragment.(twenty-one)", /* VARYING_SLOT_VIEWPORT */
       "fragment.(twenty-two)", /* VARYING_SLOT_FACE */
       "fragment.(twenty-three)", /* VARYING_SLOT_PNTC */
+      "fragment.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */
+      "fragment.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */
       "fragment.varying[0]",
       "fragment.varying[1]",
       "fragment.varying[2]",
@@ -272,6 +274,8 @@ arb_output_attrib_string(GLuint index, GLenum progType)
       "result.(twenty-one)", /* VARYING_SLOT_VIEWPORT */
       "result.(twenty-two)", /* VARYING_SLOT_FACE */
       "result.(twenty-three)", /* VARYING_SLOT_PNTC */
+      "result.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */
+      "result.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */
       "result.varying[0]",
       "result.varying[1]",
       "result.varying[2]",

From 1009b3311febe3909e82d4b5be38ceecad6afcc1 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Wed, 5 Mar 2014 13:43:17 +0100
Subject: [PATCH 0505/1208] glsl: add the patch in/out qualifier (v2)

v2: Dropped some unrelated reordering in glsl_parser.yy as Ken suggested.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast.h                            |  1 +
 src/glsl/ast_to_hir.cpp                   | 45 +++++++++++++++
 src/glsl/ast_type.cpp                     |  3 +-
 src/glsl/builtin_variables.cpp            |  8 +--
 src/glsl/glsl_lexer.ll                    |  2 +-
 src/glsl/glsl_parser.yy                   |  6 +-
 src/glsl/glsl_parser_extras.cpp           |  2 +
 src/glsl/glsl_types.cpp                   |  5 ++
 src/glsl/glsl_types.h                     |  6 ++
 src/glsl/ir.cpp                           |  2 +
 src/glsl/ir.h                             |  1 +
 src/glsl/ir_print_visitor.cpp             |  5 +-
 src/glsl/ir_reader.cpp                    |  2 +
 src/glsl/ir_set_program_inouts.cpp        | 69 ++++++++++++++++++++---
 src/glsl/link_varyings.cpp                | 15 ++++-
 src/glsl/lower_named_interface_blocks.cpp |  1 +
 src/glsl/lower_packed_varyings.cpp        |  1 +
 17 files changed, 156 insertions(+), 18 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index e185ed11e75..3d0e173c737 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -434,6 +434,7 @@ struct ast_type_qualifier {
 	 unsigned out:1;
 	 unsigned centroid:1;
          unsigned sample:1;
+	 unsigned patch:1;
 	 unsigned uniform:1;
 	 unsigned buffer:1;
 	 unsigned smooth:1;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 391bee0b53f..840299c3c48 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2487,6 +2487,9 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->data.stream = qual->stream;
    }
 
+   if (qual->flags.q.patch)
+      var->data.patch = 1;
+
    if (qual->flags.q.attribute && state->stage != MESA_SHADER_VERTEX) {
       var->type = glsl_type::error_type;
       _mesa_glsl_error(loc, state,
@@ -3209,6 +3212,17 @@ handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
       num_vertices = state->out_qualifier->vertices;
    }
 
+   if (!var->type->is_array() && !var->data.patch) {
+      _mesa_glsl_error(&loc, state,
+                       "tessellation control shader outputs must be arrays");
+
+      /* To avoid cascading failures, short circuit the checks below. */
+      return;
+   }
+
+   if (var->data.patch)
+      return;
+
    validate_layout_qualifier_vertex_count(state, loc, var, num_vertices,
                                           &state->tcs_output_size,
                                           "geometry shader input");
@@ -3958,6 +3972,33 @@ ast_declarator_list::hir(exec_list *instructions,
       }
 
 
+      /* From section 4.3.4 of the GLSL 4.00 spec:
+       *    "Input variables may not be declared using the patch in qualifier
+       *    in tessellation control or geometry shaders."
+       *
+       * From section 4.3.6 of the GLSL 4.00 spec:
+       *    "It is an error to use patch out in a vertex, tessellation
+       *    evaluation, or geometry shader."
+       *
+       * This doesn't explicitly forbid using them in a fragment shader, but
+       * that's probably just an oversight.
+       */
+      if (state->stage != MESA_SHADER_TESS_EVAL
+          && this->type->qualifier.flags.q.patch
+          && this->type->qualifier.flags.q.in) {
+
+         _mesa_glsl_error(&loc, state, "'patch in' can only be used in a "
+                          "tessellation evaluation shader");
+      }
+
+      if (state->stage != MESA_SHADER_TESS_CTRL
+          && this->type->qualifier.flags.q.patch
+          && this->type->qualifier.flags.q.out) {
+
+         _mesa_glsl_error(&loc, state, "'patch out' can only be used in a "
+                          "tessellation control shader");
+      }
+
       /* Precision qualifiers exists only in GLSL versions 1.00 and >= 1.30.
        */
       if (this->type->qualifier.precision != ast_precision_none) {
@@ -5481,6 +5522,7 @@ ast_process_structure_or_interface_block(exec_list *instructions,
             interpret_interpolation_qualifier(qual, var_mode, state, &loc);
          fields[i].centroid = qual->flags.q.centroid ? 1 : 0;
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
+         fields[i].patch = qual->flags.q.patch ? 1 : 0;
 
          /* Only save explicitly defined streams in block's field */
          fields[i].stream = qual->flags.q.explicit_stream ? qual->stream : -1;
@@ -5815,6 +5857,8 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].centroid;
             fields[i].sample =
                earlier_per_vertex->fields.structure[j].sample;
+            fields[i].patch =
+               earlier_per_vertex->fields.structure[j].patch;
          }
       }
 
@@ -5994,6 +6038,7 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.interpolation = fields[i].interpolation;
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
+         var->data.patch = fields[i].patch;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index f5c08d81e5f..d96e6a0e84d 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -86,7 +86,8 @@ bool
 ast_type_qualifier::has_auxiliary_storage() const
 {
    return this->flags.q.centroid
-          || this->flags.q.sample;
+          || this->flags.q.sample
+          || this->flags.q.patch;
 }
 
 const char*
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index c094906ba69..c52c8e04720 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -883,10 +883,10 @@ builtin_variable_generator::generate_tcs_special_vars()
    add_system_value(SYSTEM_VALUE_VERTICES_IN, int_t, "gl_PatchVerticesIn");
    add_system_value(SYSTEM_VALUE_INVOCATION_ID, int_t, "gl_InvocationID");
 
-   add_output(VARYING_SLOT_TESS_LEVEL_OUTER,
-                    array(float_t, 4), "gl_TessLevelOuter");
-   add_output(VARYING_SLOT_TESS_LEVEL_INNER,
-                    array(float_t, 2), "gl_TessLevelInner");
+   add_output(VARYING_SLOT_TESS_LEVEL_OUTER, array(float_t, 4),
+              "gl_TessLevelOuter")->data.patch = 1;
+   add_output(VARYING_SLOT_TESS_LEVEL_INNER, array(float_t, 2),
+              "gl_TessLevelInner")->data.patch = 1;
 }
 
 
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 845deebcd6f..ac4e2b70cc7 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -315,6 +315,7 @@ invariant	KEYWORD(120, 100, 120, 100, INVARIANT);
 flat		KEYWORD(130, 100, 130, 300, FLAT);
 smooth		KEYWORD(130, 300, 130, 300, SMOOTH);
 noperspective	KEYWORD(130, 300, 130, 0, NOPERSPECTIVE);
+patch		KEYWORD_WITH_ALT(0, 300, 400, 0, yyextra->ARB_tessellation_shader_enable, PATCH);
 
 sampler1D	DEPRECATED_ES_KEYWORD(SAMPLER1D);
 sampler2D	return SAMPLER2D;
@@ -576,7 +577,6 @@ usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
 
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
-patch		KEYWORD(0, 300, 0, 0, PATCH);
 sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
 subroutine	KEYWORD(0, 300, 0, 0, SUBROUTINE);
 
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 494907703ba..9d5212c200f 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1866,7 +1866,11 @@ auxiliary_storage_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.sample = 1;
    }
-   /* TODO: "patch" also goes here someday. */
+   | PATCH
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.patch = 1;
+   }
 
 storage_qualifier:
    CONST_TOK
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 3bb5c4db2f7..8979d3b49b6 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -861,6 +861,8 @@ _mesa_ast_type_qualifier_print(const struct ast_type_qualifier *q)
       printf("centroid ");
    if (q->flags.q.sample)
       printf("sample ");
+   if (q->flags.q.patch)
+      printf("patch ");
    if (q->flags.q.uniform)
       printf("uniform ");
    if (q->flags.q.buffer)
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index aaf7c7c48d6..cd6df5adad9 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -122,6 +122,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].centroid = fields[i].centroid;
       this->fields.structure[i].sample = fields[i].sample;
       this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
+      this->fields.structure[i].patch = fields[i].patch;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -154,6 +155,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].centroid = fields[i].centroid;
       this->fields.structure[i].sample = fields[i].sample;
       this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
+      this->fields.structure[i].patch = fields[i].patch;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -722,6 +724,9 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].sample
           != b->fields.structure[i].sample)
          return false;
+      if (this->fields.structure[i].patch
+          != b->fields.structure[i].patch)
+         return false;
    }
 
    return true;
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index 93a1d257ad8..dff6a9a9d05 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -751,6 +751,12 @@ struct glsl_struct_field {
     */
    unsigned matrix_layout:2;
 
+   /**
+    * For interface blocks, 1 if this variable is a per-patch input or output
+    * (as in ir_variable::patch). 0 otherwise.
+    */
+   unsigned patch:1;
+
    /**
     * For interface blocks, it has a value if this variable uses multiple vertex
     * streams (as in ir_variable::stream). -1 otherwise.
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 9a25bf413f4..8fb7ca40cf1 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1643,6 +1643,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->data.read_only = false;
    this->data.centroid = false;
    this->data.sample = false;
+   this->data.patch = false;
    this->data.invariant = false;
    this->data.how_declared = ir_var_declared_normally;
    this->data.mode = mode;
@@ -1785,6 +1786,7 @@ ir_function_signature::qualifiers_match(exec_list *params)
 	  a->data.interpolation != b->data.interpolation ||
 	  a->data.centroid != b->data.centroid ||
           a->data.sample != b->data.sample ||
+          a->data.patch != b->data.patch ||
           a->data.image_read_only != b->data.image_read_only ||
           a->data.image_write_only != b->data.image_write_only ||
           a->data.image_coherent != b->data.image_coherent ||
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 1c7829b0326..98951221b02 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -622,6 +622,7 @@ public:
       unsigned read_only:1;
       unsigned centroid:1;
       unsigned sample:1;
+      unsigned patch:1;
       unsigned invariant:1;
       unsigned precise:1;
 
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 922f98b6b8c..428fc310373 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -167,6 +167,7 @@ void ir_print_visitor::visit(ir_variable *ir)
 
    const char *const cent = (ir->data.centroid) ? "centroid " : "";
    const char *const samp = (ir->data.sample) ? "sample " : "";
+   const char *const patc = (ir->data.patch) ? "patch " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
    const char *const mode[] = { "", "uniform ", "shader_storage",
                                 "shader_in ", "shader_out ",
@@ -177,8 +178,8 @@ void ir_print_visitor::visit(ir_variable *ir)
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
    STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_QUALIFIER_COUNT);
 
-   fprintf(f, "(%s%s%s%s%s%s%s) ",
-           loc, cent, samp, inv, mode[ir->data.mode],
+   fprintf(f, "(%s%s%s%s%s%s%s%s) ",
+           loc, cent, samp, patc, inv, mode[ir->data.mode],
            stream[ir->data.stream],
            interp[ir->data.interpolation]);
 
diff --git a/src/glsl/ir_reader.cpp b/src/glsl/ir_reader.cpp
index d3ece19b22c..469837f5e4c 100644
--- a/src/glsl/ir_reader.cpp
+++ b/src/glsl/ir_reader.cpp
@@ -417,6 +417,8 @@ ir_reader::read_declaration(s_expression *expr)
 	 var->data.centroid = 1;
       } else if (strcmp(qualifier->value(), "sample") == 0) {
          var->data.sample = 1;
+      } else if (strcmp(qualifier->value(), "patch") == 0) {
+         var->data.patch = 1;
       } else if (strcmp(qualifier->value(), "invariant") == 0) {
 	 var->data.invariant = 1;
       } else if (strcmp(qualifier->value(), "uniform") == 0) {
diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp
index b968a1efd3e..fe4d6da45db 100644
--- a/src/glsl/ir_set_program_inouts.cpp
+++ b/src/glsl/ir_set_program_inouts.cpp
@@ -140,6 +140,24 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
       type = type->fields.array;
    }
 
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_in) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_out && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_EVAL &&
+       var->data.mode == ir_var_shader_in && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
    mark(this->prog, var, 0, type->count_attribute_slots(),
         this->shader_stage == MESA_SHADER_FRAGMENT);
 }
@@ -165,6 +183,9 @@ ir_set_program_inouts_visitor::visit(ir_dereference_variable *ir)
  *
  * *Except gl_PrimitiveIDIn, as noted below.
  *
+ * For tessellation control shaders all inputs and non-patch outputs are
+ * arrays. For tessellation evaluation shaders non-patch inputs are arrays.
+ *
  * If the index can't be interpreted as a constant, or some other problem
  * occurs, then nothing will be marked and false will be returned.
  */
@@ -184,6 +205,24 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
       type = type->fields.array;
    }
 
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_in) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_out && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_EVAL &&
+       var->data.mode == ir_var_shader_in && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
    /* The code below only handles:
     *
     * - Indexing into matrices
@@ -242,6 +281,22 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
    return true;
 }
 
+static bool
+is_multiple_vertices(gl_shader_stage stage, ir_variable *var)
+{
+   if (var->data.patch)
+      return false;
+
+   if (var->data.mode == ir_var_shader_in)
+      return stage == MESA_SHADER_GEOMETRY ||
+             stage == MESA_SHADER_TESS_CTRL ||
+             stage == MESA_SHADER_TESS_EVAL;
+   if (var->data.mode == ir_var_shader_out)
+      return stage == MESA_SHADER_TESS_CTRL;
+
+   return false;
+}
+
 ir_visitor_status
 ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
 {
@@ -256,10 +311,9 @@ ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
        */
       if (ir_dereference_variable * const deref_var =
           inner_array->array->as_dereference_variable()) {
-         if (this->shader_stage == MESA_SHADER_GEOMETRY &&
-             deref_var->var->data.mode == ir_var_shader_in) {
-            /* foo is a geometry shader input, so i is the vertex, and j the
-             * part of the input we're accessing.
+         if (is_multiple_vertices(this->shader_stage, deref_var->var)) {
+            /* foo is a geometry or tessellation shader input, so i is
+             * the vertex, and j the part of the input we're accessing.
              */
             if (try_mark_partial_variable(deref_var->var, ir->array_index))
             {
@@ -275,10 +329,9 @@ ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
    } else if (ir_dereference_variable * const deref_var =
               ir->array->as_dereference_variable()) {
       /* ir => foo[i], where foo is a variable. */
-      if (this->shader_stage == MESA_SHADER_GEOMETRY &&
-          deref_var->var->data.mode == ir_var_shader_in) {
-         /* foo is a geometry shader input, so i is the vertex, and we're
-          * accessing the entire input.
+      if (is_multiple_vertices(this->shader_stage, deref_var->var)) {
+         /* foo is a geometry or tessellation shader input, so i is
+          * the vertex, and we're accessing the entire input.
           */
          mark_whole_variable(deref_var->var);
          /* We've now taken care of foo, but i might contain a subexpression
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 020842a54a3..c1a8f1bb3f7 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -116,6 +116,18 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
       return;
    }
 
+   if (input->data.patch != output->data.patch) {
+      linker_error(prog,
+                   "%s shader output `%s' %s patch qualifier, "
+                   "but %s shader input %s patch qualifier\n",
+                   _mesa_shader_stage_to_string(producer_stage),
+                   output->name,
+                   (output->data.patch) ? "has" : "lacks",
+                   _mesa_shader_stage_to_string(consumer_stage),
+                   (input->data.patch) ? "has" : "lacks");
+      return;
+   }
+
    if (!prog->IsES && input->data.invariant != output->data.invariant) {
       linker_error(prog,
                    "%s shader output `%s' %s invariant qualifier, "
@@ -989,7 +1001,8 @@ varying_matches::compute_packing_class(const ir_variable *var)
     *
     * Therefore, the packing class depends only on the interpolation type.
     */
-   unsigned packing_class = var->data.centroid | (var->data.sample << 1);
+   unsigned packing_class = var->data.centroid | (var->data.sample << 1) |
+                            (var->data.patch << 2);
    packing_class *= 4;
    packing_class += var->data.interpolation;
    return packing_class;
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 28d7987d3c3..22c852dcd2a 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -159,6 +159,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                iface_t->fields.structure[i].interpolation;
             new_var->data.centroid = iface_t->fields.structure[i].centroid;
             new_var->data.sample = iface_t->fields.structure[i].sample;
+            new_var->data.patch = iface_t->fields.structure[i].patch;
 
             new_var->init_interface_type(iface_t);
             hash_table_insert(interface_namespace, new_var,
diff --git a/src/glsl/lower_packed_varyings.cpp b/src/glsl/lower_packed_varyings.cpp
index d8bebb52235..cfe414ae088 100644
--- a/src/glsl/lower_packed_varyings.cpp
+++ b/src/glsl/lower_packed_varyings.cpp
@@ -610,6 +610,7 @@ lower_packed_varyings_visitor::get_packed_varying_deref(
       }
       packed_var->data.centroid = unpacked_var->data.centroid;
       packed_var->data.sample = unpacked_var->data.sample;
+      packed_var->data.patch = unpacked_var->data.patch;
       packed_var->data.interpolation = unpacked_var->data.interpolation;
       packed_var->data.location = location;
       unpacked_var->insert_before(packed_var);

From 7c758c5a216b0a72a089c4fe9b4facde0e7b2726 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 21 Sep 2014 13:33:14 +1200
Subject: [PATCH 0506/1208] glsl: allow linking of tessellation shaders.

Marek: require a tess eval shader if a tess control shader is present

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/link_interface_blocks.cpp |  11 +-
 src/glsl/link_varyings.cpp         |  10 +-
 src/glsl/linker.cpp                | 280 ++++++++++++++++++++++++++++-
 3 files changed, 294 insertions(+), 7 deletions(-)

diff --git a/src/glsl/link_interface_blocks.cpp b/src/glsl/link_interface_blocks.cpp
index f9ddb13cd56..936e2e0ba21 100644
--- a/src/glsl/link_interface_blocks.cpp
+++ b/src/glsl/link_interface_blocks.cpp
@@ -134,9 +134,9 @@ intrastage_match(interface_block_definition *a,
  * Check if two interfaces match, according to interstage (in/out) interface
  * matching rules.
  *
- * If \c extra_array_level is true, then vertex-to-geometry shader matching
- * rules are enforced (i.e. a successful match requires the consumer interface
- * to be an array and the producer interface to be a non-array).
+ * If \c extra_array_level is true, the consumer interface is required to be
+ * an array and the producer interface is required to be a non-array.
+ * This is used for tessellation control and geometry shader consumers.
  */
 bool
 interstage_match(const interface_block_definition *producer,
@@ -318,7 +318,10 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
                                  const gl_shader *consumer)
 {
    interface_block_definitions definitions;
-   const bool extra_array_level = consumer->Stage == MESA_SHADER_GEOMETRY;
+   /* VS -> GS, VS -> TCS, VS -> TES, TES -> GS */
+   const bool extra_array_level = (producer->Stage == MESA_SHADER_VERTEX &&
+                                   consumer->Stage != MESA_SHADER_FRAGMENT) ||
+                                  consumer->Stage == MESA_SHADER_GEOMETRY;
 
    /* Add input interfaces from the consumer to the symbol table. */
    foreach_in_list(ir_instruction, node, consumer->ir) {
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index c1a8f1bb3f7..9174e9c2f2d 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -54,10 +54,16 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
    /* Check that the types match between stages.
     */
    const glsl_type *type_to_match = input->type;
-   if (consumer_stage == MESA_SHADER_GEOMETRY) {
-      assert(type_to_match->is_array()); /* Enforced by ast_to_hir */
+
+   /* VS -> GS, VS -> TCS, VS -> TES, TES -> GS */
+   const bool extra_array_level = (producer_stage == MESA_SHADER_VERTEX &&
+                                   consumer_stage != MESA_SHADER_FRAGMENT) ||
+                                  consumer_stage == MESA_SHADER_GEOMETRY;
+   if (extra_array_level) {
+      assert(type_to_match->is_array());
       type_to_match = type_to_match->fields.array;
    }
+
    if (type_to_match != output->type) {
       /* There is a bit of a special case for gl_TexCoord.  This
        * built-in is unsized by default.  Applications that variable
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 58dd9fbfdae..498e8d5bf7c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -250,6 +250,53 @@ public:
    }
 };
 
+class tess_eval_array_resize_visitor : public ir_hierarchical_visitor {
+public:
+   unsigned num_vertices;
+   gl_shader_program *prog;
+
+   tess_eval_array_resize_visitor(unsigned num_vertices, gl_shader_program *prog)
+   {
+      this->num_vertices = num_vertices;
+      this->prog = prog;
+   }
+
+   virtual ~tess_eval_array_resize_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit(ir_variable *var)
+   {
+      if (!var->type->is_array() || var->data.mode != ir_var_shader_in || var->data.patch)
+         return visit_continue;
+
+      var->type = glsl_type::get_array_instance(var->type->fields.array,
+                                                this->num_vertices);
+      var->data.max_array_access = this->num_vertices - 1;
+
+      return visit_continue;
+   }
+
+   /* Dereferences of input variables need to be updated so that their type
+    * matches the newly assigned type of the variable they are accessing. */
+   virtual ir_visitor_status visit(ir_dereference_variable *ir)
+   {
+      ir->type = ir->var->type;
+      return visit_continue;
+   }
+
+   /* Dereferences of 2D input arrays need to be updated so that their type
+    * matches the newly assigned type of the array they are accessing. */
+   virtual ir_visitor_status visit_leave(ir_dereference_array *ir)
+   {
+      const glsl_type *const vt = ir->array->type;
+      if (vt->is_array())
+         ir->type = vt->fields.array;
+      return visit_continue;
+   }
+};
+
 /**
  * Visitor that determines the highest stream id to which a (geometry) shader
  * emits vertices. It also checks whether End{Stream}Primitive is ever called.
@@ -1401,6 +1448,167 @@ private:
    hash_table *unnamed_interfaces;
 };
 
+
+/**
+ * Performs the cross-validation of tessellation control shader vertices and
+ * layout qualifiers for the attached tessellation control shaders,
+ * and propagates them to the linked TCS and linked shader program.
+ */
+static void
+link_tcs_out_layout_qualifiers(struct gl_shader_program *prog,
+			      struct gl_shader *linked_shader,
+			      struct gl_shader **shader_list,
+			      unsigned num_shaders)
+{
+   linked_shader->TessCtrl.VerticesOut = 0;
+
+   if (linked_shader->Stage != MESA_SHADER_TESS_CTRL)
+      return;
+
+   /* From the GLSL 4.0 spec (chapter 4.3.8.2):
+    *
+    *     "All tessellation control shader layout declarations in a program
+    *      must specify the same output patch vertex count.  There must be at
+    *      least one layout qualifier specifying an output patch vertex count
+    *      in any program containing tessellation control shaders; however,
+    *      such a declaration is not required in all tessellation control
+    *      shaders."
+    */
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      if (shader->TessCtrl.VerticesOut != 0) {
+	 if (linked_shader->TessCtrl.VerticesOut != 0 &&
+	     linked_shader->TessCtrl.VerticesOut != shader->TessCtrl.VerticesOut) {
+	    linker_error(prog, "tessellation control shader defined with "
+			 "conflicting output vertex count (%d and %d)\n",
+			 linked_shader->TessCtrl.VerticesOut,
+			 shader->TessCtrl.VerticesOut);
+	    return;
+	 }
+	 linked_shader->TessCtrl.VerticesOut = shader->TessCtrl.VerticesOut;
+      }
+   }
+
+   /* Just do the intrastage -> interstage propagation right now,
+    * since we already know we're in the right type of shader program
+    * for doing it.
+    */
+   if (linked_shader->TessCtrl.VerticesOut == 0) {
+      linker_error(prog, "tessellation control shader didn't declare "
+		   "vertices out layout qualifier\n");
+      return;
+   }
+   prog->TessCtrl.VerticesOut = linked_shader->TessCtrl.VerticesOut;
+}
+
+
+/**
+ * Performs the cross-validation of tessellation evaluation shader
+ * primitive type, vertex spacing, ordering and point_mode layout qualifiers
+ * for the attached tessellation evaluation shaders, and propagates them
+ * to the linked TES and linked shader program.
+ */
+static void
+link_tes_in_layout_qualifiers(struct gl_shader_program *prog,
+				struct gl_shader *linked_shader,
+				struct gl_shader **shader_list,
+				unsigned num_shaders)
+{
+   linked_shader->TessEval.PrimitiveMode = PRIM_UNKNOWN;
+   linked_shader->TessEval.Spacing = 0;
+   linked_shader->TessEval.VertexOrder = 0;
+   linked_shader->TessEval.PointMode = -1;
+
+   if (linked_shader->Stage != MESA_SHADER_TESS_EVAL)
+      return;
+
+   /* From the GLSL 4.0 spec (chapter 4.3.8.1):
+    *
+    *     "At least one tessellation evaluation shader (compilation unit) in
+    *      a program must declare a primitive mode in its input layout.
+    *      Declaration vertex spacing, ordering, and point mode identifiers is
+    *      optional.  It is not required that all tessellation evaluation
+    *      shaders in a program declare a primitive mode.  If spacing or
+    *      vertex ordering declarations are omitted, the tessellation
+    *      primitive generator will use equal spacing or counter-clockwise
+    *      vertex ordering, respectively.  If a point mode declaration is
+    *      omitted, the tessellation primitive generator will produce lines or
+    *      triangles according to the primitive mode."
+    */
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      if (shader->TessEval.PrimitiveMode != PRIM_UNKNOWN) {
+	 if (linked_shader->TessEval.PrimitiveMode != PRIM_UNKNOWN &&
+	     linked_shader->TessEval.PrimitiveMode != shader->TessEval.PrimitiveMode) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting input primitive modes.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.PrimitiveMode = shader->TessEval.PrimitiveMode;
+      }
+
+      if (shader->TessEval.Spacing != 0) {
+	 if (linked_shader->TessEval.Spacing != 0 &&
+	     linked_shader->TessEval.Spacing != shader->TessEval.Spacing) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting vertex spacing.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.Spacing = shader->TessEval.Spacing;
+      }
+
+      if (shader->TessEval.VertexOrder != 0) {
+	 if (linked_shader->TessEval.VertexOrder != 0 &&
+	     linked_shader->TessEval.VertexOrder != shader->TessEval.VertexOrder) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting ordering.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.VertexOrder = shader->TessEval.VertexOrder;
+      }
+
+      if (shader->TessEval.PointMode != -1) {
+	 if (linked_shader->TessEval.PointMode != -1 &&
+	     linked_shader->TessEval.PointMode != shader->TessEval.PointMode) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting point modes.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.PointMode = shader->TessEval.PointMode;
+      }
+
+   }
+
+   /* Just do the intrastage -> interstage propagation right now,
+    * since we already know we're in the right type of shader program
+    * for doing it.
+    */
+   if (linked_shader->TessEval.PrimitiveMode == PRIM_UNKNOWN) {
+      linker_error(prog,
+		   "tessellation evaluation shader didn't declare input "
+		   "primitive modes.\n");
+      return;
+   }
+   prog->TessEval.PrimitiveMode = linked_shader->TessEval.PrimitiveMode;
+
+   if (linked_shader->TessEval.Spacing == 0)
+      linked_shader->TessEval.Spacing = GL_EQUAL;
+   prog->TessEval.Spacing = linked_shader->TessEval.Spacing;
+
+   if (linked_shader->TessEval.VertexOrder == 0)
+      linked_shader->TessEval.VertexOrder = GL_CCW;
+   prog->TessEval.VertexOrder = linked_shader->TessEval.VertexOrder;
+
+   if (linked_shader->TessEval.PointMode == -1)
+      linked_shader->TessEval.PointMode = GL_FALSE;
+   prog->TessEval.PointMode = linked_shader->TessEval.PointMode;
+}
+
+
 /**
  * Performs the cross-validation of layout qualifiers specified in
  * redeclaration of gl_FragCoord for the attached fragment shaders,
@@ -1747,6 +1955,8 @@ link_intrastage_shaders(void *mem_ctx,
    ralloc_steal(linked, linked->UniformBlocks);
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
 
@@ -1922,6 +2132,34 @@ update_array_sizes(struct gl_shader_program *prog)
    }
 }
 
+/**
+ * Resize tessellation evaluation per-vertex inputs to the size of
+ * tessellation control per-vertex outputs.
+ */
+static void
+resize_tes_inputs(struct gl_context *ctx,
+                  struct gl_shader_program *prog)
+{
+   if (prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] == NULL)
+      return;
+
+   gl_shader *const tcs = prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+   gl_shader *const tes = prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+
+   /* If no control shader is present, then the TES inputs are statically
+    * sized to MaxPatchVertices; the actual size of the arrays won't be
+    * known until draw time.
+    */
+   const int num_vertices = tcs
+      ? tcs->TessCtrl.VerticesOut
+      : ctx->Const.MaxPatchVertices;
+
+   tess_eval_array_resize_visitor input_resize_visitor(num_vertices, prog);
+   foreach_in_list(ir_instruction, ir, tes->ir) {
+      ir->accept(&input_resize_visitor);
+   }
+}
+
 /**
  * Find a contiguous set of available bits in a bitmask.
  *
@@ -2897,7 +3135,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    prog->Version = max_version;
    prog->IsES = is_es_prog;
 
-   /* Geometry shaders have to be linked with vertex shaders.
+   /* Some shaders have to be linked with some other shaders present.
     */
    if (num_shaders[MESA_SHADER_GEOMETRY] > 0 &&
        num_shaders[MESA_SHADER_VERTEX] == 0 &&
@@ -2906,6 +3144,44 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 		   "vertex shader\n");
       goto done;
    }
+   if (num_shaders[MESA_SHADER_TESS_EVAL] > 0 &&
+       num_shaders[MESA_SHADER_VERTEX] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation evaluation shader must be linked with "
+		   "vertex shader\n");
+      goto done;
+   }
+   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+       num_shaders[MESA_SHADER_VERTEX] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation control shader must be linked with "
+		   "vertex shader\n");
+      goto done;
+   }
+
+   /* The spec is self-contradictory here. It allows linking without a tess
+    * eval shader, but that can only be used with transform feedback and
+    * rasterization disabled. However, transform feedback isn't allowed
+    * with GL_PATCHES, so it can't be used.
+    *
+    * More investigation showed that the idea of transform feedback after
+    * a tess control shader was dropped, because some hw vendors couldn't
+    * support tessellation without a tess eval shader, but the linker section
+    * wasn't updated to reflect that.
+    *
+    * All specifications (ARB_tessellation_shader, GL 4.0-4.5) have this
+    * spec bug.
+    *
+    * Do what's reasonable and always require a tess eval shader if a tess
+    * control shader is present.
+    */
+   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+       num_shaders[MESA_SHADER_TESS_EVAL] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation control shader must be linked with "
+		   "tessellation evaluation shader\n");
+      goto done;
+   }
 
    /* Compute shaders have additional restrictions. */
    if (num_shaders[MESA_SHADER_COMPUTE] > 0 &&
@@ -2982,6 +3258,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (!prog->LinkStatus)
       goto done;
 
+   resize_tes_inputs(ctx, prog);
+
    /* Validate the inputs of each stage with the output of the preceding
     * stage.
     */

From 54f29502972cdd33302e69e029c8d07fb31b7bdf Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Thu, 20 Mar 2014 22:33:05 +0100
Subject: [PATCH 0507/1208] glsl: make lower_clip_distance work with
 tessellation shaders.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/lower_clip_distance.cpp | 187 ++++++++++++++++++-------------
 1 file changed, 106 insertions(+), 81 deletions(-)

diff --git a/src/glsl/lower_clip_distance.cpp b/src/glsl/lower_clip_distance.cpp
index 01f028b1f37..1ada215796c 100644
--- a/src/glsl/lower_clip_distance.cpp
+++ b/src/glsl/lower_clip_distance.cpp
@@ -55,9 +55,9 @@ namespace {
 class lower_clip_distance_visitor : public ir_rvalue_visitor {
 public:
    explicit lower_clip_distance_visitor(gl_shader_stage shader_stage)
-      : progress(false), old_clip_distance_1d_var(NULL),
-        old_clip_distance_2d_var(NULL), new_clip_distance_1d_var(NULL),
-        new_clip_distance_2d_var(NULL), shader_stage(shader_stage)
+      : progress(false), old_clip_distance_out_var(NULL),
+        old_clip_distance_in_var(NULL), new_clip_distance_out_var(NULL),
+        new_clip_distance_in_var(NULL), shader_stage(shader_stage)
    {
    }
 
@@ -80,20 +80,21 @@ public:
     *
     * Note:
     *
-    * - the 2d_var is for geometry shader input only.
+    * - the in_var is for geometry and both tessellation shader inputs only.
     *
-    * - since gl_ClipDistance is available in geometry shaders as both an
-    *   input and an output, it's possible for both old_clip_distance_1d_var
-    *   and old_clip_distance_2d_var to be non-null.
+    * - since gl_ClipDistance is available in tessellation control,
+    *   tessellation evaluation and geometry shaders as both an input
+    *   and an output, it's possible for both old_clip_distance_out_var
+    *   and old_clip_distance_in_var to be non-null.
     */
-   ir_variable *old_clip_distance_1d_var;
-   ir_variable *old_clip_distance_2d_var;
+   ir_variable *old_clip_distance_out_var;
+   ir_variable *old_clip_distance_in_var;
 
    /**
     * Pointer to the newly-created gl_ClipDistanceMESA variable.
     */
-   ir_variable *new_clip_distance_1d_var;
-   ir_variable *new_clip_distance_2d_var;
+   ir_variable *new_clip_distance_out_var;
+   ir_variable *new_clip_distance_in_var;
 
    /**
     * Type of shader we are compiling (e.g. MESA_SHADER_VERTEX)
@@ -110,62 +111,81 @@ public:
 ir_visitor_status
 lower_clip_distance_visitor::visit(ir_variable *ir)
 {
+   ir_variable **old_var;
+   ir_variable **new_var;
+
    if (!ir->name || strcmp(ir->name, "gl_ClipDistance") != 0)
       return visit_continue;
    assert (ir->type->is_array());
 
-   if (!ir->type->fields.array->is_array()) {
-      /* 1D gl_ClipDistance (used for vertex and geometry output, and fragment
-       * input).
-       */
-      if (this->old_clip_distance_1d_var)
+   if (ir->data.mode == ir_var_shader_out) {
+      if (this->old_clip_distance_out_var)
          return visit_continue;
+      old_var = &old_clip_distance_out_var;
+      new_var = &new_clip_distance_out_var;
+   } else if (ir->data.mode == ir_var_shader_in) {
+      if (this->old_clip_distance_in_var)
+         return visit_continue;
+      old_var = &old_clip_distance_in_var;
+      new_var = &new_clip_distance_in_var;
+   } else {
+      unreachable("not reached");
+   }
 
-      this->progress = true;
-      this->old_clip_distance_1d_var = ir;
+   this->progress = true;
+
+   if (!ir->type->fields.array->is_array()) {
+      /* gl_ClipDistance (used for vertex, tessellation evaluation and
+       * geometry output, and fragment input).
+       */
+      assert((ir->data.mode == ir_var_shader_in &&
+              this->shader_stage == MESA_SHADER_FRAGMENT) ||
+             (ir->data.mode == ir_var_shader_out &&
+              (this->shader_stage == MESA_SHADER_VERTEX ||
+               this->shader_stage == MESA_SHADER_TESS_EVAL ||
+               this->shader_stage == MESA_SHADER_GEOMETRY)));
+
+      *old_var = ir;
       assert (ir->type->fields.array == glsl_type::float_type);
       unsigned new_size = (ir->type->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
-      this->new_clip_distance_1d_var = ir->clone(ralloc_parent(ir), NULL);
+      *new_var = ir->clone(ralloc_parent(ir), NULL);
 
       /* And change the properties that we need to change */
-      this->new_clip_distance_1d_var->name
-         = ralloc_strdup(this->new_clip_distance_1d_var,
-                         "gl_ClipDistanceMESA");
-      this->new_clip_distance_1d_var->type
-         = glsl_type::get_array_instance(glsl_type::vec4_type, new_size);
-      this->new_clip_distance_1d_var->data.max_array_access
-         = ir->data.max_array_access / 4;
+      (*new_var)->name = ralloc_strdup(*new_var, "gl_ClipDistanceMESA");
+      (*new_var)->type = glsl_type::get_array_instance(glsl_type::vec4_type,
+                                                       new_size);
+      (*new_var)->data.max_array_access = ir->data.max_array_access / 4;
 
-      ir->replace_with(this->new_clip_distance_1d_var);
+      ir->replace_with(*new_var);
    } else {
-      /* 2D gl_ClipDistance (used for geometry input). */
-      assert(ir->data.mode == ir_var_shader_in &&
-             this->shader_stage == MESA_SHADER_GEOMETRY);
-      if (this->old_clip_distance_2d_var)
-         return visit_continue;
+      /* 2D gl_ClipDistance (used for tessellation control, tessellation
+       * evaluation and geometry input, and tessellation control output).
+       */
+      assert((ir->data.mode == ir_var_shader_in &&
+              (this->shader_stage == MESA_SHADER_GEOMETRY ||
+               this->shader_stage == MESA_SHADER_TESS_EVAL)) ||
+             this->shader_stage == MESA_SHADER_TESS_CTRL);
 
-      this->progress = true;
-      this->old_clip_distance_2d_var = ir;
+      *old_var = ir;
       assert (ir->type->fields.array->fields.array == glsl_type::float_type);
       unsigned new_size = (ir->type->fields.array->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
-      this->new_clip_distance_2d_var = ir->clone(ralloc_parent(ir), NULL);
+      *new_var = ir->clone(ralloc_parent(ir), NULL);
 
       /* And change the properties that we need to change */
-      this->new_clip_distance_2d_var->name
-         = ralloc_strdup(this->new_clip_distance_2d_var, "gl_ClipDistanceMESA");
-      this->new_clip_distance_2d_var->type = glsl_type::get_array_instance(
+      (*new_var)->name = ralloc_strdup(*new_var, "gl_ClipDistanceMESA");
+      (*new_var)->type = glsl_type::get_array_instance(
          glsl_type::get_array_instance(glsl_type::vec4_type,
             new_size),
          ir->type->array_size());
-      this->new_clip_distance_2d_var->data.max_array_access
-         = ir->data.max_array_access / 4;
+      (*new_var)->data.max_array_access = ir->data.max_array_access / 4;
 
-      ir->replace_with(this->new_clip_distance_2d_var);
+      ir->replace_with(*new_var);
    }
+
    return visit_continue;
 }
 
@@ -242,26 +262,27 @@ lower_clip_distance_visitor::is_clip_distance_vec8(ir_rvalue *ir)
 {
    /* Note that geometry shaders contain gl_ClipDistance both as an input
     * (which is a 2D array) and an output (which is a 1D array), so it's
-    * possible for both this->old_clip_distance_1d_var and
-    * this->old_clip_distance_2d_var to be non-NULL in the same shader.
+    * possible for both this->old_clip_distance_out_var and
+    * this->old_clip_distance_in_var to be non-NULL in the same shader.
     */
 
-   if (this->old_clip_distance_1d_var) {
-      ir_dereference_variable *var_ref = ir->as_dereference_variable();
-      if (var_ref && var_ref->var == this->old_clip_distance_1d_var)
+   if (!ir->type->is_array())
+      return false;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return false;
+
+   if (this->old_clip_distance_out_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_out_var)
          return true;
    }
-   if (this->old_clip_distance_2d_var) {
-      /* 2D clip distance is only possible as a geometry input */
-      assert(this->shader_stage == MESA_SHADER_GEOMETRY);
+   if (this->old_clip_distance_in_var) {
+      assert(this->shader_stage == MESA_SHADER_TESS_CTRL ||
+             this->shader_stage == MESA_SHADER_TESS_EVAL ||
+             this->shader_stage == MESA_SHADER_GEOMETRY ||
+             this->shader_stage == MESA_SHADER_FRAGMENT);
 
-      ir_dereference_array *array_ref = ir->as_dereference_array();
-      if (array_ref) {
-         ir_dereference_variable *var_ref =
-            array_ref->array->as_dereference_variable();
-         if (var_ref && var_ref->var == this->old_clip_distance_2d_var)
-            return true;
-      }
+      if (ir->variable_referenced() == this->old_clip_distance_in_var)
+         return true;
    }
    return false;
 }
@@ -279,29 +300,33 @@ lower_clip_distance_visitor::is_clip_distance_vec8(ir_rvalue *ir)
 ir_rvalue *
 lower_clip_distance_visitor::lower_clip_distance_vec8(ir_rvalue *ir)
 {
-   if (this->old_clip_distance_1d_var) {
-      ir_dereference_variable *var_ref = ir->as_dereference_variable();
-      if (var_ref && var_ref->var == this->old_clip_distance_1d_var) {
-         return new(ralloc_parent(ir))
-            ir_dereference_variable(this->new_clip_distance_1d_var);
-      }
-   }
-   if (this->old_clip_distance_2d_var) {
-      /* 2D clip distance is only possible as a geometry input */
-      assert(this->shader_stage == MESA_SHADER_GEOMETRY);
+   if (!ir->type->is_array())
+      return NULL;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return NULL;
 
-      ir_dereference_array *array_ref = ir->as_dereference_array();
-      if (array_ref) {
-         ir_dereference_variable *var_ref =
-            array_ref->array->as_dereference_variable();
-         if (var_ref && var_ref->var == this->old_clip_distance_2d_var) {
-            return new(ralloc_parent(ir))
-               ir_dereference_array(this->new_clip_distance_2d_var,
-                                    array_ref->array_index);
-         }
-      }
+   ir_variable **new_var = NULL;
+   if (this->old_clip_distance_out_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_out_var)
+         new_var = &this->new_clip_distance_out_var;
+   }
+   if (this->old_clip_distance_in_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_in_var)
+         new_var = &this->new_clip_distance_in_var;
+   }
+   if (new_var == NULL)
+      return NULL;
+
+   if (ir->as_dereference_variable()) {
+      return new(ralloc_parent(ir)) ir_dereference_variable(*new_var);
+   } else {
+      ir_dereference_array *array_ref = ir->as_dereference_array();
+      assert(array_ref);
+      assert(array_ref->array->as_dereference_variable());
+
+      return new(ralloc_parent(ir))
+         ir_dereference_array(*new_var, array_ref->array_index);
    }
-   return NULL;
 }
 
 
@@ -540,10 +565,10 @@ lower_clip_distance(gl_shader *shader)
 
    visit_list_elements(&v, shader->ir);
 
-   if (v.new_clip_distance_1d_var)
-      shader->symbols->add_variable(v.new_clip_distance_1d_var);
-   if (v.new_clip_distance_2d_var)
-      shader->symbols->add_variable(v.new_clip_distance_2d_var);
+   if (v.new_clip_distance_out_var)
+      shader->symbols->add_variable(v.new_clip_distance_out_var);
+   if (v.new_clip_distance_in_var)
+      shader->symbols->add_variable(v.new_clip_distance_in_var);
 
    return v.progress;
 }

From 73a9a1539a85ae8fe22e11b4064105d588597736 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Mon, 10 Mar 2014 17:55:36 +0100
Subject: [PATCH 0508/1208] glsl: lower gl_TessLevel* from float[n] to vecn.

Similar to gl_ClipDistance -> gl_ClipDistanceMESA

v2: - renamed is_mesa_var to lowered_builtin_array_variable
    - moved LowerTessLevel into gl_constants
    - cosmetic changes in lower_tess_level.cpp

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/Makefile.sources     |   1 +
 src/glsl/ir_optimization.h    |   1 +
 src/glsl/link_varyings.cpp    |  51 +++-
 src/glsl/link_varyings.h      |  13 +-
 src/glsl/linker.cpp           |   4 +
 src/glsl/lower_tess_level.cpp | 459 ++++++++++++++++++++++++++++++++++
 src/mesa/main/mtypes.h        |   1 +
 7 files changed, 517 insertions(+), 13 deletions(-)
 create mode 100644 src/glsl/lower_tess_level.cpp

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index d784a810723..b3b84d69dcd 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -154,6 +154,7 @@ LIBGLSL_FILES = \
 	lower_packed_varyings.cpp \
 	lower_named_interface_blocks.cpp \
 	lower_packing_builtins.cpp \
+	lower_tess_level.cpp \
 	lower_texture_projection.cpp \
 	lower_variable_index_to_cond_assign.cpp \
 	lower_vec_index_to_cond_assign.cpp \
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index e6939f3fe1f..688a5e18ea3 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -132,6 +132,7 @@ bool optimize_split_arrays(exec_list *instructions, bool linked);
 bool lower_offset_arrays(exec_list *instructions);
 void optimize_dead_builtin_variables(exec_list *instructions,
                                      enum ir_variable_mode other);
+bool lower_tess_level(gl_shader *shader);
 
 bool lower_vertex_id(gl_shader *shader);
 
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 9174e9c2f2d..f7c219ece05 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -328,7 +328,7 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
 
    this->location = -1;
    this->orig_name = input;
-   this->is_clip_distance_mesa = false;
+   this->lowered_builtin_array_variable = none;
    this->skip_components = 0;
    this->next_buffer_separator = false;
    this->matched_candidate = NULL;
@@ -377,8 +377,15 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
     */
    if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].LowerClipDistance &&
        strcmp(this->var_name, "gl_ClipDistance") == 0) {
-      this->is_clip_distance_mesa = true;
+      this->lowered_builtin_array_variable = clip_distance;
    }
+
+   if (ctx->Const.LowerTessLevel &&
+       (strcmp(this->var_name, "gl_TessLevelOuter") == 0))
+      this->lowered_builtin_array_variable = tess_level_outer;
+   if (ctx->Const.LowerTessLevel &&
+       (strcmp(this->var_name, "gl_TessLevelInner") == 0))
+      this->lowered_builtin_array_variable = tess_level_inner;
 }
 
 
@@ -425,9 +432,22 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
-      unsigned actual_array_size = this->is_clip_distance_mesa ?
-         prog->LastClipDistanceArraySize :
-         this->matched_candidate->type->array_size();
+      unsigned actual_array_size;
+      switch (this->lowered_builtin_array_variable) {
+      case clip_distance:
+         actual_array_size = prog->LastClipDistanceArraySize;
+         break;
+      case tess_level_outer:
+         actual_array_size = 4;
+         break;
+      case tess_level_inner:
+         actual_array_size = 2;
+         break;
+      case none:
+      default:
+         actual_array_size = this->matched_candidate->type->array_size();
+         break;
+      }
 
       if (this->is_subscripted) {
          /* Check array bounds. */
@@ -438,7 +458,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
                          actual_array_size);
             return false;
          }
-         unsigned array_elem_size = this->is_clip_distance_mesa ?
+         unsigned array_elem_size = this->lowered_builtin_array_variable ?
             1 : vector_elements * matrix_cols;
          fine_location += array_elem_size * this->array_subscript;
          this->size = 1;
@@ -447,7 +467,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
       }
       this->vector_elements = vector_elements;
       this->matrix_columns = matrix_cols;
-      if (this->is_clip_distance_mesa)
+      if (this->lowered_builtin_array_variable)
          this->type = GL_FLOAT;
       else
          this->type = this->matched_candidate->type->fields.array->gl_type;
@@ -570,8 +590,21 @@ const tfeedback_candidate *
 tfeedback_decl::find_candidate(gl_shader_program *prog,
                                hash_table *tfeedback_candidates)
 {
-   const char *name = this->is_clip_distance_mesa
-      ? "gl_ClipDistanceMESA" : this->var_name;
+   const char *name = this->var_name;
+   switch (this->lowered_builtin_array_variable) {
+   case none:
+      name = this->var_name;
+      break;
+   case clip_distance:
+      name = "gl_ClipDistanceMESA";
+      break;
+   case tess_level_outer:
+      name = "gl_TessLevelOuterMESA";
+      break;
+   case tess_level_inner:
+      name = "gl_TessLevelInnerMESA";
+      break;
+   }
    this->matched_candidate = (const tfeedback_candidate *)
       hash_table_find(tfeedback_candidates, name);
    if (!this->matched_candidate) {
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index afc16a8baa7..c0ad53bf1da 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -128,7 +128,7 @@ public:
     */
    unsigned num_components() const
    {
-      if (this->is_clip_distance_mesa)
+      if (this->lowered_builtin_array_variable)
          return this->size;
       else
          return this->vector_elements * this->matrix_columns * this->size;
@@ -161,10 +161,15 @@ private:
    unsigned array_subscript;
 
    /**
-    * True if the variable is gl_ClipDistance and the driver lowers
-    * gl_ClipDistance to gl_ClipDistanceMESA.
+    * Non-zero if the variable is gl_ClipDistance, glTessLevelOuter or
+    * gl_TessLevelInner and the driver lowers it to gl_*MESA.
     */
-   bool is_clip_distance_mesa;
+   enum {
+      none,
+      clip_distance,
+      tess_level_outer,
+      tess_level_inner,
+   } lowered_builtin_array_variable;
 
    /**
     * The vertex shader output location that the linker assigned for this
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 498e8d5bf7c..b9e443acb1d 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3324,6 +3324,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          lower_clip_distance(prog->_LinkedShaders[i]);
       }
 
+      if (ctx->Const.LowerTessLevel) {
+         lower_tess_level(prog->_LinkedShaders[i]);
+      }
+
       while (do_common_optimization(prog->_LinkedShaders[i]->ir, true, false,
                                     &ctx->Const.ShaderCompilerOptions[i],
                                     ctx->Const.NativeIntegers))
diff --git a/src/glsl/lower_tess_level.cpp b/src/glsl/lower_tess_level.cpp
new file mode 100644
index 00000000000..bed2553222f
--- /dev/null
+++ b/src/glsl/lower_tess_level.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file lower_tess_level.cpp
+ *
+ * This pass accounts for the difference between the way gl_TessLevelOuter
+ * and gl_TessLevelInner is declared in standard GLSL (as an array of
+ * floats), and the way it is frequently implemented in hardware (as a vec4
+ * and vec2).
+ *
+ * The declaration of gl_TessLevel* is replaced with a declaration
+ * of gl_TessLevel*MESA, and any references to gl_TessLevel* are
+ * translated to refer to gl_TessLevel*MESA with the appropriate
+ * swizzling of array indices.  For instance:
+ *
+ *   gl_TessLevelOuter[i]
+ *
+ * is translated into:
+ *
+ *   gl_TessLevelOuterMESA[i]
+ *
+ * Since some hardware may not internally represent gl_TessLevel* as a pair
+ * of vec4's, this lowering pass is optional.  To enable it, set the
+ * LowerTessLevel flag in gl_shader_compiler_options to true.
+ */
+
+#include "glsl_symbol_table.h"
+#include "ir_rvalue_visitor.h"
+#include "ir.h"
+#include "program/prog_instruction.h" /* For WRITEMASK_* */
+
+namespace {
+
+class lower_tess_level_visitor : public ir_rvalue_visitor {
+public:
+   explicit lower_tess_level_visitor(gl_shader_stage shader_stage)
+      : progress(false), old_tess_level_outer_var(NULL),
+        old_tess_level_inner_var(NULL), new_tess_level_outer_var(NULL),
+        new_tess_level_inner_var(NULL), shader_stage(shader_stage)
+   {
+   }
+
+   virtual ir_visitor_status visit(ir_variable *);
+   bool is_tess_level_array(ir_rvalue *ir);
+   ir_rvalue *lower_tess_level_array(ir_rvalue *ir);
+   virtual ir_visitor_status visit_leave(ir_assignment *);
+   void visit_new_assignment(ir_assignment *ir);
+   virtual ir_visitor_status visit_leave(ir_call *);
+
+   virtual void handle_rvalue(ir_rvalue **rvalue);
+
+   void fix_lhs(ir_assignment *);
+
+   bool progress;
+
+   /**
+    * Pointer to the declaration of gl_TessLevel*, if found.
+    */
+   ir_variable *old_tess_level_outer_var;
+   ir_variable *old_tess_level_inner_var;
+
+   /**
+    * Pointer to the newly-created gl_TessLevel*MESA variables.
+    */
+   ir_variable *new_tess_level_outer_var;
+   ir_variable *new_tess_level_inner_var;
+
+   /**
+    * Type of shader we are compiling (e.g. MESA_SHADER_TESS_CTRL)
+    */
+   const gl_shader_stage shader_stage;
+};
+
+} /* anonymous namespace */
+
+/**
+ * Replace any declaration of gl_TessLevel* as an array of floats with a
+ * declaration of gl_TessLevel*MESA as a vec4.
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit(ir_variable *ir)
+{
+   if ((!ir->name) ||
+       ((strcmp(ir->name, "gl_TessLevelInner") != 0) &&
+        (strcmp(ir->name, "gl_TessLevelOuter") != 0)))
+      return visit_continue;
+
+   assert (ir->type->is_array());
+
+   if (strcmp(ir->name, "gl_TessLevelOuter") == 0) {
+      if (this->old_tess_level_outer_var)
+         return visit_continue;
+
+      old_tess_level_outer_var = ir;
+      assert(ir->type->fields.array == glsl_type::float_type);
+
+      /* Clone the old var so that we inherit all of its properties */
+      new_tess_level_outer_var = ir->clone(ralloc_parent(ir), NULL);
+
+      /* And change the properties that we need to change */
+      new_tess_level_outer_var->name = ralloc_strdup(new_tess_level_outer_var,
+                                                "gl_TessLevelOuterMESA");
+      new_tess_level_outer_var->type = glsl_type::vec4_type;
+      new_tess_level_outer_var->data.max_array_access = 0;
+
+      ir->replace_with(new_tess_level_outer_var);
+   } else if (strcmp(ir->name, "gl_TessLevelInner") == 0) {
+      if (this->old_tess_level_inner_var)
+         return visit_continue;
+
+      old_tess_level_inner_var = ir;
+      assert(ir->type->fields.array == glsl_type::float_type);
+
+      /* Clone the old var so that we inherit all of its properties */
+      new_tess_level_inner_var = ir->clone(ralloc_parent(ir), NULL);
+
+      /* And change the properties that we need to change */
+      new_tess_level_inner_var->name = ralloc_strdup(new_tess_level_inner_var,
+                                                "gl_TessLevelInnerMESA");
+      new_tess_level_inner_var->type = glsl_type::vec2_type;
+      new_tess_level_inner_var->data.max_array_access = 0;
+
+      ir->replace_with(new_tess_level_inner_var);
+   } else {
+      assert(0);
+   }
+
+   this->progress = true;
+
+   return visit_continue;
+}
+
+
+/**
+ * Determine whether the given rvalue describes an array of floats that
+ * needs to be lowered to a vec4; that is, determine whether it
+ * matches one of the following patterns:
+ *
+ * - gl_TessLevelOuter
+ * - gl_TessLevelInner
+ */
+bool
+lower_tess_level_visitor::is_tess_level_array(ir_rvalue *ir)
+{
+   if (!ir->type->is_array())
+      return false;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return false;
+
+   if (this->old_tess_level_outer_var) {
+      if (ir->variable_referenced() == this->old_tess_level_outer_var)
+         return true;
+   }
+   if (this->old_tess_level_inner_var) {
+      if (ir->variable_referenced() == this->old_tess_level_inner_var)
+         return true;
+   }
+   return false;
+}
+
+
+/**
+ * If the given ir satisfies is_tess_level_array(), return new ir
+ * representing its lowered equivalent.  That is, map:
+ *
+ * - gl_TessLevelOuter => gl_TessLevelOuterMESA
+ * - gl_TessLevelInner => gl_TessLevelInnerMESA
+ *
+ * Otherwise return NULL.
+ */
+ir_rvalue *
+lower_tess_level_visitor::lower_tess_level_array(ir_rvalue *ir)
+{
+   if (!ir->type->is_array())
+      return NULL;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return NULL;
+
+   ir_variable **new_var = NULL;
+
+   if (this->old_tess_level_outer_var) {
+      if (ir->variable_referenced() == this->old_tess_level_outer_var)
+         new_var = &this->new_tess_level_outer_var;
+   }
+   if (this->old_tess_level_inner_var) {
+      if (ir->variable_referenced() == this->old_tess_level_inner_var)
+         new_var = &this->new_tess_level_inner_var;
+   }
+
+   if (new_var == NULL)
+      return NULL;
+
+   assert(ir->as_dereference_variable());
+   return new(ralloc_parent(ir)) ir_dereference_variable(*new_var);
+}
+
+
+void
+lower_tess_level_visitor::handle_rvalue(ir_rvalue **rv)
+{
+   if (*rv == NULL)
+      return;
+
+   ir_dereference_array *const array_deref = (*rv)->as_dereference_array();
+   if (array_deref == NULL)
+      return;
+
+   /* Replace any expression that indexes one of the floats in gl_TessLevel*
+    * with an expression that indexes into one of the vec4's
+    * gl_TessLevel*MESA and accesses the appropriate component.
+    */
+   ir_rvalue *lowered_vec4 =
+      this->lower_tess_level_array(array_deref->array);
+   if (lowered_vec4 != NULL) {
+      this->progress = true;
+      void *mem_ctx = ralloc_parent(array_deref);
+
+      ir_expression *const expr =
+         new(mem_ctx) ir_expression(ir_binop_vector_extract,
+                                    lowered_vec4,
+                                    array_deref->array_index);
+
+      *rv = expr;
+   }
+}
+
+void
+lower_tess_level_visitor::fix_lhs(ir_assignment *ir)
+{
+   if (ir->lhs->ir_type != ir_type_expression)
+      return;
+   void *mem_ctx = ralloc_parent(ir);
+   ir_expression *const expr = (ir_expression *) ir->lhs;
+
+   /* The expression must be of the form:
+    *
+    *     (vector_extract gl_TessLevel*MESA, j).
+    */
+   assert(expr->operation == ir_binop_vector_extract);
+   assert(expr->operands[0]->ir_type == ir_type_dereference_variable);
+   assert((expr->operands[0]->type == glsl_type::vec4_type) ||
+          (expr->operands[0]->type == glsl_type::vec2_type));
+
+   ir_dereference *const new_lhs = (ir_dereference *) expr->operands[0];
+
+   ir_constant *old_index_constant = expr->operands[1]->constant_expression_value();
+   if (!old_index_constant) {
+      ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
+                                           expr->operands[0]->type,
+                                           new_lhs->clone(mem_ctx, NULL),
+                                           ir->rhs,
+                                           expr->operands[1]);
+   }
+   ir->set_lhs(new_lhs);
+
+   if (old_index_constant) {
+      /* gl_TessLevel* is being accessed via a constant index.  Don't bother
+       * creating a vector insert op. Just use a write mask.
+       */
+      ir->write_mask = 1 << old_index_constant->get_int_component(0);
+   } else {
+      ir->write_mask = (1 << expr->operands[0]->type->vector_elements) - 1;
+   }
+}
+
+/**
+ * Replace any assignment having a gl_TessLevel* (undereferenced) as
+ * its LHS or RHS with a sequence of assignments, one for each component of
+ * the array.  Each of these assignments is lowered to refer to
+ * gl_TessLevel*MESA as appropriate.
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit_leave(ir_assignment *ir)
+{
+   /* First invoke the base class visitor.  This causes handle_rvalue() to be
+    * called on ir->rhs and ir->condition.
+    */
+   ir_rvalue_visitor::visit_leave(ir);
+
+   if (this->is_tess_level_array(ir->lhs) ||
+       this->is_tess_level_array(ir->rhs)) {
+      /* LHS or RHS of the assignment is the entire gl_TessLevel* array.
+       * Since we are
+       * reshaping gl_TessLevel* from an array of floats to a
+       * vec4, this isn't going to work as a bulk assignment anymore, so
+       * unroll it to element-by-element assignments and lower each of them.
+       *
+       * Note: to unroll into element-by-element assignments, we need to make
+       * clones of the LHS and RHS.  This is safe because expressions and
+       * l-values are side-effect free.
+       */
+      void *ctx = ralloc_parent(ir);
+      int array_size = ir->lhs->type->array_size();
+      for (int i = 0; i < array_size; ++i) {
+         ir_dereference_array *new_lhs = new(ctx) ir_dereference_array(
+            ir->lhs->clone(ctx, NULL), new(ctx) ir_constant(i));
+         ir_dereference_array *new_rhs = new(ctx) ir_dereference_array(
+            ir->rhs->clone(ctx, NULL), new(ctx) ir_constant(i));
+         this->handle_rvalue((ir_rvalue **) &new_rhs);
+
+         /* Handle the LHS after creating the new assignment.  This must
+          * happen in this order because handle_rvalue may replace the old LHS
+          * with an ir_expression of ir_binop_vector_extract.  Since this is
+          * not a valide l-value, this will cause an assertion in the
+          * ir_assignment constructor to fail.
+          *
+          * If this occurs, replace the mangled LHS with a dereference of the
+          * vector, and replace the RHS with an ir_triop_vector_insert.
+          */
+         ir_assignment *const assign = new(ctx) ir_assignment(new_lhs, new_rhs);
+         this->handle_rvalue((ir_rvalue **) &assign->lhs);
+         this->fix_lhs(assign);
+
+         this->base_ir->insert_before(assign);
+      }
+      ir->remove();
+
+      return visit_continue;
+   }
+
+   /* Handle the LHS as if it were an r-value.  Normally
+    * rvalue_visit(ir_assignment *) only visits the RHS, but we need to lower
+    * expressions in the LHS as well.
+    *
+    * This may cause the LHS to get replaced with an ir_expression of
+    * ir_binop_vector_extract.  If this occurs, replace it with a dereference
+    * of the vector, and replace the RHS with an ir_triop_vector_insert.
+    */
+   handle_rvalue((ir_rvalue **)&ir->lhs);
+   this->fix_lhs(ir);
+
+   return rvalue_visit(ir);
+}
+
+
+/**
+ * Set up base_ir properly and call visit_leave() on a newly created
+ * ir_assignment node.  This is used in cases where we have to insert an
+ * ir_assignment in a place where we know the hierarchical visitor won't see
+ * it.
+ */
+void
+lower_tess_level_visitor::visit_new_assignment(ir_assignment *ir)
+{
+   ir_instruction *old_base_ir = this->base_ir;
+   this->base_ir = ir;
+   ir->accept(this);
+   this->base_ir = old_base_ir;
+}
+
+
+/**
+ * If a gl_TessLevel* variable appears as an argument in an ir_call
+ * expression, replace it with a temporary variable, and make sure the ir_call
+ * is preceded and/or followed by assignments that copy the contents of the
+ * temporary variable to and/or from gl_TessLevel*.  Each of these
+ * assignments is then lowered to refer to gl_TessLevel*MESA.
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit_leave(ir_call *ir)
+{
+   void *ctx = ralloc_parent(ir);
+
+   const exec_node *formal_param_node = ir->callee->parameters.head;
+   const exec_node *actual_param_node = ir->actual_parameters.head;
+   while (!actual_param_node->is_tail_sentinel()) {
+      ir_variable *formal_param = (ir_variable *) formal_param_node;
+      ir_rvalue *actual_param = (ir_rvalue *) actual_param_node;
+
+      /* Advance formal_param_node and actual_param_node now so that we can
+       * safely replace actual_param with another node, if necessary, below.
+       */
+      formal_param_node = formal_param_node->next;
+      actual_param_node = actual_param_node->next;
+
+      if (!this->is_tess_level_array(actual_param))
+         continue;
+
+      /* User is trying to pass a whole gl_TessLevel* array to a function
+       * call.  Since we are reshaping gl_TessLevel* from an array of floats
+       * to a vec4, this isn't going to work anymore, so use a temporary
+       * array instead.
+       */
+      ir_variable *temp = new(ctx) ir_variable(
+         actual_param->type, "temp_tess_level", ir_var_temporary);
+      this->base_ir->insert_before(temp);
+      actual_param->replace_with(
+         new(ctx) ir_dereference_variable(temp));
+      if (formal_param->data.mode == ir_var_function_in
+          || formal_param->data.mode == ir_var_function_inout) {
+         /* Copy from gl_TessLevel* to the temporary before the call.
+          * Since we are going to insert this copy before the current
+          * instruction, we need to visit it afterwards to make sure it
+          * gets lowered.
+          */
+         ir_assignment *new_assignment = new(ctx) ir_assignment(
+            new(ctx) ir_dereference_variable(temp),
+            actual_param->clone(ctx, NULL));
+         this->base_ir->insert_before(new_assignment);
+         this->visit_new_assignment(new_assignment);
+      }
+      if (formal_param->data.mode == ir_var_function_out
+          || formal_param->data.mode == ir_var_function_inout) {
+         /* Copy from the temporary to gl_TessLevel* after the call.
+          * Since visit_list_elements() has already decided which
+          * instruction it's going to visit next, we need to visit
+          * afterwards to make sure it gets lowered.
+          */
+         ir_assignment *new_assignment = new(ctx) ir_assignment(
+            actual_param->clone(ctx, NULL),
+            new(ctx) ir_dereference_variable(temp));
+         this->base_ir->insert_after(new_assignment);
+         this->visit_new_assignment(new_assignment);
+      }
+   }
+
+   return rvalue_visit(ir);
+}
+
+
+bool
+lower_tess_level(gl_shader *shader)
+{
+   if ((shader->Stage != MESA_SHADER_TESS_CTRL) &&
+       (shader->Stage != MESA_SHADER_TESS_EVAL))
+      return false;
+
+   lower_tess_level_visitor v(shader->Stage);
+
+   visit_list_elements(&v, shader->ir);
+
+   if (v.new_tess_level_outer_var)
+      shader->symbols->add_variable(v.new_tess_level_outer_var);
+   if (v.new_tess_level_inner_var)
+      shader->symbols->add_variable(v.new_tess_level_inner_var);
+
+   return v.progress;
+}
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 3244a95c32b..f97468d2db7 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3758,6 +3758,7 @@ struct gl_constants
    GLuint MaxTessGenLevel;
    GLuint MaxTessPatchComponents;
    GLuint MaxTessControlTotalOutputComponents;
+   bool LowerTessLevel; /**< Lower gl_TessLevel* from float[n] to vecn? */
 };
 
 

From c53aa26379ccee9d53fe1d1ea9bfa26d4d469618 Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Thu, 20 Mar 2014 22:37:37 +0100
Subject: [PATCH 0509/1208] glsl: add "in" or "out" prefix to name when
 flattening interface blocks

This is to prevent a name conflict in tessellation shaders built-in interface
blocks.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/lower_named_interface_blocks.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 22c852dcd2a..01bbdd0587e 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -126,7 +126,8 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
       for (unsigned i = 0; i < iface_t->length; i++) {
          const char * field_name = iface_t->fields.structure[i].name;
          char *iface_field_name =
-            ralloc_asprintf(mem_ctx, "%s.%s.%s",
+            ralloc_asprintf(mem_ctx, "%s %s.%s.%s",
+                            var->data.mode == ir_var_shader_in ? "in" : "out",
                             iface_t->name, var->name, field_name);
 
          ir_variable *found_var =
@@ -219,7 +220,9 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue)
 
    if (var->get_interface_type() != NULL) {
       char *iface_field_name =
-         ralloc_asprintf(mem_ctx, "%s.%s.%s", var->get_interface_type()->name,
+         ralloc_asprintf(mem_ctx, "%s %s.%s.%s",
+                         var->data.mode == ir_var_shader_in ? "in" : "out",
+                         var->get_interface_type()->name,
                          var->name, ir->field);
       /* Find the variable in the set of flattened interface blocks */
       ir_variable *found_var =

From 0cfac917554aeb46bd78ba5b5f5ee1c8ed1d68bc Mon Sep 17 00:00:00 2001
From: Fabian Bieler <fabianbieler@fastmail.fm>
Date: Thu, 20 Mar 2014 22:34:42 +0100
Subject: [PATCH 0510/1208] glsl: make stand-alone compiler work with
 tessellation shaders.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/main.cpp                   | 8 +++++++-
 src/glsl/standalone_scaffolding.cpp | 1 +
 src/glsl/standalone_scaffolding.h   | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/glsl/main.cpp b/src/glsl/main.cpp
index 23412980dce..df93a013ede 100644
--- a/src/glsl/main.cpp
+++ b/src/glsl/main.cpp
@@ -204,6 +204,8 @@ initialize_context(struct gl_context *ctx, gl_api api)
    }
 
    ctx->Const.GenerateTemporaryNames = true;
+   ctx->Const.MaxPatchVertices = 32;
+
    ctx->Driver.NewShader = _mesa_new_shader;
 }
 
@@ -273,7 +275,7 @@ usage_fail(const char *name)
 {
 
    const char *header =
-      "usage: %s [options] <file.vert | file.geom | file.frag>\n"
+      "usage: %s [options] <file.vert | file.tesc | file.tese | file.geom | file.frag | file.comp>\n"
       "\n"
       "Possible options are:\n";
    printf(header, name);
@@ -373,6 +375,10 @@ main(int argc, char **argv)
       const char *const ext = & argv[optind][len - 5];
       if (strncmp(".vert", ext, 5) == 0 || strncmp(".glsl", ext, 5) == 0)
 	 shader->Type = GL_VERTEX_SHADER;
+      else if (strncmp(".tesc", ext, 5) == 0)
+	 shader->Type = GL_TESS_CONTROL_SHADER;
+      else if (strncmp(".tese", ext, 5) == 0)
+	 shader->Type = GL_TESS_EVALUATION_SHADER;
       else if (strncmp(".geom", ext, 5) == 0)
 	 shader->Type = GL_GEOMETRY_SHADER;
       else if (strncmp(".frag", ext, 5) == 0)
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 172c6f41570..057ca6ee8a7 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -136,6 +136,7 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shading_language_420pack = true;
    ctx->Extensions.ARB_shading_language_packing = true;
+   ctx->Extensions.ARB_tessellation_shader = true;
    ctx->Extensions.ARB_texture_cube_map_array = true;
    ctx->Extensions.ARB_texture_gather = true;
    ctx->Extensions.ARB_texture_multisample = true;
diff --git a/src/glsl/standalone_scaffolding.h b/src/glsl/standalone_scaffolding.h
index 895dd2782fb..dc6fb640f15 100644
--- a/src/glsl/standalone_scaffolding.h
+++ b/src/glsl/standalone_scaffolding.h
@@ -61,6 +61,10 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_SHADER:
       return MESA_SHADER_COMPUTE;
    default:

From fb800b3dcd32ddb6f57143b46105d677eb01da80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 28 May 2015 23:24:08 +0200
Subject: [PATCH 0511/1208] glsl: don't lower variable indexing on non-patch
 tessellation inputs/outputs

There is no way to lower them, because the array sizes are unknown
at compile time.

Based on a patch from: Fabian Bieler <fabianbieler@fastmail.fm>

v2: add comments

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ir_optimization.h                    |  5 +-
 .../lower_variable_index_to_cond_assign.cpp   | 58 ++++++++++++++-----
 src/glsl/test_optpass.cpp                     |  3 +-
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  8 ++-
 src/mesa/program/ir_to_mesa.cpp               |  2 +-
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp    |  2 +-
 6 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 688a5e18ea3..a174c9683a1 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -114,8 +114,9 @@ bool lower_discard(exec_list *instructions);
 void lower_discard_flow(exec_list *instructions);
 bool lower_instructions(exec_list *instructions, unsigned what_to_lower);
 bool lower_noise(exec_list *instructions);
-bool lower_variable_index_to_cond_assign(exec_list *instructions,
-    bool lower_input, bool lower_output, bool lower_temp, bool lower_uniform);
+bool lower_variable_index_to_cond_assign(gl_shader_stage stage,
+    exec_list *instructions, bool lower_input, bool lower_output,
+    bool lower_temp, bool lower_uniform);
 bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
 bool lower_const_arrays_to_uniforms(exec_list *instructions);
 bool lower_clip_distance(gl_shader *shader);
diff --git a/src/glsl/lower_variable_index_to_cond_assign.cpp b/src/glsl/lower_variable_index_to_cond_assign.cpp
index 4a6a76c4eba..1ab3afecc7e 100644
--- a/src/glsl/lower_variable_index_to_cond_assign.cpp
+++ b/src/glsl/lower_variable_index_to_cond_assign.cpp
@@ -335,12 +335,14 @@ struct switch_generator
 
 class variable_index_to_cond_assign_visitor : public ir_rvalue_visitor {
 public:
-   variable_index_to_cond_assign_visitor(bool lower_input,
-					 bool lower_output,
-					 bool lower_temp,
-					 bool lower_uniform)
+   variable_index_to_cond_assign_visitor(gl_shader_stage stage,
+                                         bool lower_input,
+                                         bool lower_output,
+                                         bool lower_temp,
+                                         bool lower_uniform)
    {
       this->progress = false;
+      this->stage = stage;
       this->lower_inputs = lower_input;
       this->lower_outputs = lower_output;
       this->lower_temps = lower_temp;
@@ -348,6 +350,8 @@ public:
    }
 
    bool progress;
+
+   gl_shader_stage stage;
    bool lower_inputs;
    bool lower_outputs;
    bool lower_temps;
@@ -369,18 +373,44 @@ public:
       case ir_var_auto:
       case ir_var_temporary:
 	 return this->lower_temps;
+
       case ir_var_uniform:
       case ir_var_shader_storage:
 	 return this->lower_uniforms;
+
       case ir_var_function_in:
       case ir_var_const_in:
          return this->lower_temps;
+
       case ir_var_shader_in:
+         /* The input array size is unknown at compiler time for non-patch
+          * inputs in TCS and TES. The arrays are sized to
+          * the implementation-dependent limit "gl_MaxPatchVertices", but
+          * the real size is stored in the "gl_PatchVerticesIn" built-in
+          * uniform.
+          *
+          * The TCS input array size is specified by
+          * glPatchParameteri(GL_PATCH_VERTICES).
+          *
+          * The TES input array size is specified by the "vertices" output
+          * layout qualifier in TCS.
+          */
+         if ((stage == MESA_SHADER_TESS_CTRL ||
+              stage == MESA_SHADER_TESS_EVAL) && !var->data.patch)
+            return false;
          return this->lower_inputs;
+
       case ir_var_function_out:
+         /* TCS non-patch outputs can only be indexed with "gl_InvocationID".
+          * Other expressions are not allowed.
+          */
+         if (stage == MESA_SHADER_TESS_CTRL && !var->data.patch)
+            return false;
          return this->lower_temps;
+
       case ir_var_shader_out:
          return this->lower_outputs;
+
       case ir_var_function_inout:
 	 return this->lower_temps;
       }
@@ -523,16 +553,18 @@ public:
 } /* anonymous namespace */
 
 bool
-lower_variable_index_to_cond_assign(exec_list *instructions,
-				    bool lower_input,
-				    bool lower_output,
-				    bool lower_temp,
-				    bool lower_uniform)
+lower_variable_index_to_cond_assign(gl_shader_stage stage,
+                                    exec_list *instructions,
+                                    bool lower_input,
+                                    bool lower_output,
+                                    bool lower_temp,
+                                    bool lower_uniform)
 {
-   variable_index_to_cond_assign_visitor v(lower_input,
-					   lower_output,
-					   lower_temp,
-					   lower_uniform);
+   variable_index_to_cond_assign_visitor v(stage,
+                                           lower_input,
+                                           lower_output,
+                                           lower_temp,
+                                           lower_uniform);
 
    /* Continue lowering until no progress is made.  If there are multiple
     * levels of indirection (e.g., non-constant indexing of array elements and
diff --git a/src/glsl/test_optpass.cpp b/src/glsl/test_optpass.cpp
index ac3e3f48c51..fed1fabf301 100644
--- a/src/glsl/test_optpass.cpp
+++ b/src/glsl/test_optpass.cpp
@@ -124,7 +124,8 @@ do_optimization(struct exec_list *ir, const char *optimization,
    } else if (sscanf(optimization, "lower_variable_index_to_cond_assign "
                      "( %d , %d , %d , %d ) ", &int_0, &int_1, &int_2,
                      &int_3) == 4) {
-      return lower_variable_index_to_cond_assign(ir, int_0 != 0, int_1 != 0,
+      return lower_variable_index_to_cond_assign(MESA_SHADER_VERTEX, ir,
+                                                 int_0 != 0, int_1 != 0,
                                                  int_2 != 0, int_3 != 0);
    } else if (sscanf(optimization, "lower_quadop_vector ( %d ) ",
                      &int_0) == 1) {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d66baf34b38..379ff4a3a1a 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -233,7 +233,8 @@ brw_lower_packing_builtins(struct brw_context *brw,
 }
 
 static void
-process_glsl_ir(struct brw_context *brw,
+process_glsl_ir(gl_shader_stage stage,
+                struct brw_context *brw,
                 struct gl_shader_program *shader_prog,
                 struct gl_shader *shader)
 {
@@ -281,7 +282,8 @@ process_glsl_ir(struct brw_context *brw,
    lower_quadop_vector(shader->ir, false);
 
    bool lowered_variable_indexing =
-      lower_variable_index_to_cond_assign(shader->ir,
+      lower_variable_index_to_cond_assign((gl_shader_stage)stage,
+                                          shader->ir,
                                           options->EmitNoIndirectInput,
                                           options->EmitNoIndirectOutput,
                                           options->EmitNoIndirectTemp,
@@ -358,7 +360,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       _mesa_copy_linked_program_data((gl_shader_stage) stage, shProg, prog);
 
-      process_glsl_ir(brw, shProg, shader);
+      process_glsl_ir((gl_shader_stage) stage, brw, shProg, shader);
 
       /* Make a pass over the IR to add state references for any built-in
        * uniforms that are used.  This has to be done now (during linking).
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 2bd212efaa1..cadbf49ca65 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2910,7 +2910,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput
 	     || options->EmitNoIndirectTemp || options->EmitNoIndirectUniform)
 	   progress =
-	     lower_variable_index_to_cond_assign(ir,
+	     lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
 						 options->EmitNoIndirectInput,
 						 options->EmitNoIndirectOutput,
 						 options->EmitNoIndirectTemp,
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index c9d40c578ef..b727c5e9c93 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5833,7 +5833,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
        */
       if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
           options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
-         lower_variable_index_to_cond_assign(ir,
+         lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
                                              options->EmitNoIndirectInput,
                                              options->EmitNoIndirectOutput,
                                              options->EmitNoIndirectTemp,

From 2abbe941e1bfaf494eb739b9fb81503736298f14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 16 Jun 2015 01:32:28 +0200
Subject: [PATCH 0512/1208] glsl: add the tessellation extension to the list
 for the "layout" qualifier

This is technically not needed, but it makes the compiler return a better
error message if tessellation is used with GLSL < 1.50.

Instead of:
    error: syntax error, unexpected NEW_IDENTIFIER, expecting $end
It returns:
    error: #version 150 layout qualifier `triangles' used

And the tessellation spec says:
    OpenGL 3.2 and GLSL 1.50 are required.
So it makes perfect sense.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/glsl_lexer.ll | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index ac4e2b70cc7..fe0083fe745 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -426,7 +426,8 @@ layout		{
 		      || yyextra->ARB_uniform_buffer_object_enable
 		      || yyextra->ARB_fragment_coord_conventions_enable
                       || yyextra->ARB_shading_language_420pack_enable
-                      || yyextra->ARB_compute_shader_enable) {
+                      || yyextra->ARB_compute_shader_enable
+                      || yyextra->ARB_tessellation_shader_enable) {
 		      return LAYOUT_TOK;
 		   } else {
 		      void *mem_ctx = yyextra;

From 64a0ae88b971e549852348b169de48d1d0b0869d Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Mon, 18 Aug 2014 21:51:46 +1200
Subject: [PATCH 0513/1208] glsl: relax unsized input/output block arrays for
 TCS/TES

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ast_to_hir.cpp | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 840299c3c48..d6a8c77e6c2 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -5959,16 +5959,39 @@ ast_interface_block::hir(exec_list *instructions,
           *     geometry shader inputs. All other input and output block
           *     arrays must specify an array size.
           *
+          * The same applies to tessellation shaders.
+          *
           * The upshot of this is that the only circumstance where an
           * interface array size *doesn't* need to be specified is on a
-          * geometry shader input.
+          * geometry shader input, tessellation control shader input,
+          * tessellation control shader output, and tessellation evaluation
+          * shader input.
           */
-         if (this->array_specifier->is_unsized_array &&
-             (state->stage != MESA_SHADER_GEOMETRY || !this->layout.flags.q.in)) {
-            _mesa_glsl_error(&loc, state,
-                             "only geometry shader inputs may be unsized "
-                             "instance block arrays");
+         if (this->array_specifier->is_unsized_array) {
+            bool allow_inputs = state->stage == MESA_SHADER_GEOMETRY ||
+                                state->stage == MESA_SHADER_TESS_CTRL ||
+                                state->stage == MESA_SHADER_TESS_EVAL;
+            bool allow_outputs = state->stage == MESA_SHADER_TESS_CTRL;
 
+            if (this->layout.flags.q.in) {
+               if (!allow_inputs)
+                  _mesa_glsl_error(&loc, state,
+                                   "unsized input block arrays not allowed in "
+                                   "%s shader",
+                                   _mesa_shader_stage_to_string(state->stage));
+            } else if (this->layout.flags.q.out) {
+               if (!allow_outputs)
+                  _mesa_glsl_error(&loc, state,
+                                   "unsized output block arrays not allowed in "
+                                   "%s shader",
+                                   _mesa_shader_stage_to_string(state->stage));
+            } else {
+               /* by elimination, this is a uniform block array */
+               _mesa_glsl_error(&loc, state,
+                                "unsized uniform block arrays not allowed in "
+                                "%s shader",
+                                _mesa_shader_stage_to_string(state->stage));
+            }
          }
 
          const glsl_type *block_array_type =

From b7f98f9f094090c6e8a24407dab67e4873c68694 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 24 Aug 2014 16:46:40 +1200
Subject: [PATCH 0514/1208] glsl: allow nonconst indexing of arrays where we
 can work out an implicit size

Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_array_index.cpp | 37 +++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 2c79002ebcb..50971bb22bb 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -107,6 +107,33 @@ update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc,
 }
 
 
+static int
+get_implicit_array_size(struct _mesa_glsl_parse_state *state,
+                        ir_rvalue *array)
+{
+   ir_variable *var = array->variable_referenced();
+
+   /* Inputs in control shader are implicitly sized
+    * to the maximum patch size.
+    */
+   if (state->stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_in) {
+      return state->Const.MaxPatchVertices;
+   }
+
+   /* Non-patch inputs in evaluation shader are implicitly sized
+    * to the maximum patch size.
+    */
+   if (state->stage == MESA_SHADER_TESS_EVAL &&
+       var->data.mode == ir_var_shader_in &&
+       !var->data.patch) {
+      return state->Const.MaxPatchVertices;
+   }
+
+   return 0;
+}
+
+
 ir_rvalue *
 _mesa_ast_array_index_to_hir(void *mem_ctx,
 			     struct _mesa_glsl_parse_state *state,
@@ -183,7 +210,15 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
          update_max_array_access(array, idx, &loc, state);
    } else if (const_index == NULL && array->type->is_array()) {
       if (array->type->is_unsized_array()) {
-	 _mesa_glsl_error(&loc, state, "unsized array index must be constant");
+         int implicit_size = get_implicit_array_size(state, array);
+         if (implicit_size) {
+            ir_variable *v = array->whole_variable_referenced();
+            if (v != NULL)
+               v->data.max_array_access = implicit_size - 1;
+         }
+         else {
+            _mesa_glsl_error(&loc, state, "unsized array index must be constant");
+         }
       } else if (array->type->fields.array->is_interface()
                  && array->variable_referenced()->data.mode == ir_var_uniform
                  && !state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {

From da7adb99e85fc6efa7f0e570ab93bd7b625975ae Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 17 Aug 2014 22:37:16 +1200
Subject: [PATCH 0515/1208] glsl: add builtin constants for
 ARB_tessellation_shader

Limits from other extensions added by Marek.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/builtin_variables.cpp  | 40 +++++++++++++++++++++++++++++----
 src/glsl/glsl_parser_extras.cpp | 19 ++++++++++++++++
 src/glsl/glsl_parser_extras.h   | 17 ++++++++++++++
 3 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index c52c8e04720..0ff3a3f7062 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -672,8 +672,14 @@ builtin_variable_generator::generate_constants()
       if (!state->es_shader) {
          add_const("gl_MaxGeometryAtomicCounters",
                    state->Const.MaxGeometryAtomicCounters);
-         add_const("gl_MaxTessControlAtomicCounters", 0);
-         add_const("gl_MaxTessEvaluationAtomicCounters", 0);
+
+	 if (state->is_version(400, 0) ||
+             state->ARB_tessellation_shader_enable) {
+		 add_const("gl_MaxTessControlAtomicCounters",
+                           state->Const.MaxTessControlAtomicCounters);
+		 add_const("gl_MaxTessEvaluationAtomicCounters",
+                           state->Const.MaxTessEvaluationAtomicCounters);
+	 }
       }
    }
 
@@ -693,8 +699,10 @@ builtin_variable_generator::generate_constants()
       if (!state->es_shader) {
          add_const("gl_MaxGeometryAtomicCounterBuffers",
                    state->Const.MaxGeometryAtomicCounterBuffers);
-         add_const("gl_MaxTessControlAtomicCounterBuffers", 0);
-         add_const("gl_MaxTessEvaluationAtomicCounterBuffers", 0);
+         add_const("gl_MaxTessControlAtomicCounterBuffers",
+                   state->Const.MaxTessControlAtomicCounterBuffers);
+         add_const("gl_MaxTessEvaluationAtomicCounterBuffers",
+                   state->Const.MaxTessEvaluationAtomicCounterBuffers);
       }
    }
 
@@ -753,11 +761,35 @@ builtin_variable_generator::generate_constants()
                 state->Const.MaxFragmentImageUniforms);
       add_const("gl_MaxCombinedImageUniforms",
                 state->Const.MaxCombinedImageUniforms);
+
+      if (state->is_version(400, 0) ||
+          state->ARB_tessellation_shader_enable) {
+         add_const("gl_MaxTessControlImageUniforms",
+                   state->Const.MaxTessControlImageUniforms);
+         add_const("gl_MaxTessEvaluationImageUniforms",
+                   state->Const.MaxTessEvaluationImageUniforms);
+      }
    }
 
    if (state->is_version(410, 0) ||
        state->ARB_viewport_array_enable)
       add_const("gl_MaxViewports", state->Const.MaxViewports);
+
+   if (state->is_version(400, 0) ||
+       state->ARB_tessellation_shader_enable) {
+      add_const("gl_MaxPatchVertices", state->Const.MaxPatchVertices);
+      add_const("gl_MaxTessGenLevel", state->Const.MaxTessGenLevel);
+      add_const("gl_MaxTessControlInputComponents", state->Const.MaxTessControlInputComponents);
+      add_const("gl_MaxTessControlOutputComponents", state->Const.MaxTessControlOutputComponents);
+      add_const("gl_MaxTessControlTextureImageUnits", state->Const.MaxTessControlTextureImageUnits);
+      add_const("gl_MaxTessEvaluationInputComponents", state->Const.MaxTessEvaluationInputComponents);
+      add_const("gl_MaxTessEvaluationOutputComponents", state->Const.MaxTessEvaluationOutputComponents);
+      add_const("gl_MaxTessEvaluationTextureImageUnits", state->Const.MaxTessEvaluationTextureImageUnits);
+      add_const("gl_MaxTessPatchComponents", state->Const.MaxTessPatchComponents);
+      add_const("gl_MaxTessControlTotalOutputComponents", state->Const.MaxTessControlTotalOutputComponents);
+      add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
+      add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
+   }
 }
 
 
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 8979d3b49b6..43a3cd13f26 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -113,12 +113,18 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxGeometryUniformComponents = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxUniformComponents;
 
    this->Const.MaxVertexAtomicCounters = ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters;
+   this->Const.MaxTessControlAtomicCounters = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicCounters;
+   this->Const.MaxTessEvaluationAtomicCounters = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters;
    this->Const.MaxGeometryAtomicCounters = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters;
    this->Const.MaxFragmentAtomicCounters = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters;
    this->Const.MaxCombinedAtomicCounters = ctx->Const.MaxCombinedAtomicCounters;
    this->Const.MaxAtomicBufferBindings = ctx->Const.MaxAtomicBufferBindings;
    this->Const.MaxVertexAtomicCounterBuffers =
       ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers;
+   this->Const.MaxTessControlAtomicCounterBuffers =
+      ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers;
+   this->Const.MaxTessEvaluationAtomicCounterBuffers =
+      ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers;
    this->Const.MaxGeometryAtomicCounterBuffers =
       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers;
    this->Const.MaxFragmentAtomicCounterBuffers =
@@ -138,6 +144,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxCombinedImageUnitsAndFragmentOutputs = ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs;
    this->Const.MaxImageSamples = ctx->Const.MaxImageSamples;
    this->Const.MaxVertexImageUniforms = ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms;
+   this->Const.MaxTessControlImageUniforms = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms;
+   this->Const.MaxTessEvaluationImageUniforms = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms;
    this->Const.MaxGeometryImageUniforms = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms;
    this->Const.MaxFragmentImageUniforms = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms;
    this->Const.MaxCombinedImageUniforms = ctx->Const.MaxCombinedImageUniforms;
@@ -147,6 +155,17 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
 
    /* tessellation shader constants */
    this->Const.MaxPatchVertices = ctx->Const.MaxPatchVertices;
+   this->Const.MaxTessGenLevel = ctx->Const.MaxTessGenLevel;
+   this->Const.MaxTessControlInputComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxInputComponents;
+   this->Const.MaxTessControlOutputComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxOutputComponents;
+   this->Const.MaxTessControlTextureImageUnits = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits;
+   this->Const.MaxTessEvaluationInputComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxInputComponents;
+   this->Const.MaxTessEvaluationOutputComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxOutputComponents;
+   this->Const.MaxTessEvaluationTextureImageUnits = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits;
+   this->Const.MaxTessPatchComponents = ctx->Const.MaxTessPatchComponents;
+   this->Const.MaxTessControlTotalOutputComponents = ctx->Const.MaxTessControlTotalOutputComponents;
+   this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
+   this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;
 
    this->current_function = NULL;
    this->toplevel_ir = NULL;
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index de8f8c417fc..f4b60afcb37 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -360,6 +360,8 @@ struct _mesa_glsl_parse_state {
 
       /* ARB_shader_atomic_counters */
       unsigned MaxVertexAtomicCounters;
+      unsigned MaxTessControlAtomicCounters;
+      unsigned MaxTessEvaluationAtomicCounters;
       unsigned MaxGeometryAtomicCounters;
       unsigned MaxFragmentAtomicCounters;
       unsigned MaxCombinedAtomicCounters;
@@ -370,6 +372,8 @@ struct _mesa_glsl_parse_state {
        * 3.10.
        */
       unsigned MaxVertexAtomicCounterBuffers;
+      unsigned MaxTessControlAtomicCounterBuffers;
+      unsigned MaxTessEvaluationAtomicCounterBuffers;
       unsigned MaxGeometryAtomicCounterBuffers;
       unsigned MaxFragmentAtomicCounterBuffers;
       unsigned MaxCombinedAtomicCounterBuffers;
@@ -384,6 +388,8 @@ struct _mesa_glsl_parse_state {
       unsigned MaxCombinedImageUnitsAndFragmentOutputs;
       unsigned MaxImageSamples;
       unsigned MaxVertexImageUniforms;
+      unsigned MaxTessControlImageUniforms;
+      unsigned MaxTessEvaluationImageUniforms;
       unsigned MaxGeometryImageUniforms;
       unsigned MaxFragmentImageUniforms;
       unsigned MaxCombinedImageUniforms;
@@ -393,6 +399,17 @@ struct _mesa_glsl_parse_state {
 
       /* ARB_tessellation_shader */
       unsigned MaxPatchVertices;
+      unsigned MaxTessGenLevel;
+      unsigned MaxTessControlInputComponents;
+      unsigned MaxTessControlOutputComponents;
+      unsigned MaxTessControlTextureImageUnits;
+      unsigned MaxTessEvaluationInputComponents;
+      unsigned MaxTessEvaluationOutputComponents;
+      unsigned MaxTessEvaluationTextureImageUnits;
+      unsigned MaxTessPatchComponents;
+      unsigned MaxTessControlTotalOutputComponents;
+      unsigned MaxTessControlUniformComponents;
+      unsigned MaxTessEvaluationUniformComponents;
    } Const;
 
    /**

From d563946a4064d50f6fa7ce5e9e8ccb1479d1205e Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 31 Aug 2014 19:35:46 +1200
Subject: [PATCH 0516/1208] glsl: restrict indexing for writes to TCS outputs
 to gl_InvocationID

Marek: handle ir_swizzle

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_to_hir.cpp | 69 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index d6a8c77e6c2..8e503cfbfa3 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -639,6 +639,34 @@ shift_result_type(const struct glsl_type *type_a,
    return type_a;
 }
 
+/**
+ * Returns the innermost array index expression in an rvalue tree.
+ * This is the largest indexing level -- if an array of blocks, then
+ * it is the block index rather than an indexing expression for an
+ * array-typed member of an array of blocks.
+ */
+static ir_rvalue *
+find_innermost_array_index(ir_rvalue *rv)
+{
+   ir_dereference_array *last = NULL;
+   while (rv) {
+      if (rv->as_dereference_array()) {
+         last = rv->as_dereference_array();
+         rv = last->array;
+      } else if (rv->as_dereference_record())
+         rv = rv->as_dereference_record()->record;
+      else if (rv->as_swizzle())
+         rv = rv->as_swizzle()->val;
+      else
+         rv = NULL;
+   }
+
+   if (last)
+      return last->array_index;
+
+   return NULL;
+}
+
 /**
  * Validates that a value can be assigned to a location with a specified type
  *
@@ -655,9 +683,9 @@ shift_result_type(const struct glsl_type *type_a,
  * In addition to being used for assignments, this function is used to
  * type-check return values.
  */
-ir_rvalue *
+static ir_rvalue *
 validate_assignment(struct _mesa_glsl_parse_state *state,
-                    YYLTYPE loc, const glsl_type *lhs_type,
+                    YYLTYPE loc, ir_rvalue *lhs,
                     ir_rvalue *rhs, bool is_initializer)
 {
    /* If there is already some error in the RHS, just return it.  Anything
@@ -666,9 +694,28 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
    if (rhs->type->is_error())
       return rhs;
 
+   /* In the Tessellation Control Shader:
+    * If a per-vertex output variable is used as an l-value, it is an error
+    * if the expression indicating the vertex number is not the identifier
+    * `gl_InvocationID`.
+    */
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      ir_variable *var = lhs->variable_referenced();
+      if (var->data.mode == ir_var_shader_out && !var->data.patch) {
+         ir_rvalue *index = find_innermost_array_index(lhs);
+         ir_variable *index_var = index ? index->variable_referenced() : NULL;
+         if (!index_var || strcmp(index_var->name, "gl_InvocationID") != 0) {
+            _mesa_glsl_error(&loc, state,
+                             "Tessellation control shader outputs can only "
+                             "be indexed by gl_InvocationID");
+            return NULL;
+         }
+      }
+   }
+
    /* If the types are identical, the assignment can trivially proceed.
     */
-   if (rhs->type == lhs_type)
+   if (rhs->type == lhs->type)
       return rhs;
 
    /* If the array element types are the same and the LHS is unsized,
@@ -678,8 +725,8 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * Note: Whole-array assignments are not permitted in GLSL 1.10, but this
     * is handled by ir_dereference::is_lvalue.
     */
-   if (lhs_type->is_unsized_array() && rhs->type->is_array()
-       && (lhs_type->fields.array == rhs->type->fields.array)) {
+   if (lhs->type->is_unsized_array() && rhs->type->is_array()
+       && (lhs->type->fields.array == rhs->type->fields.array)) {
       if (is_initializer) {
          return rhs;
       } else {
@@ -690,8 +737,8 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
    }
 
    /* Check for implicit conversion in GLSL 1.20 */
-   if (apply_implicit_conversion(lhs_type, rhs, state)) {
-      if (rhs->type == lhs_type)
+   if (apply_implicit_conversion(lhs->type, rhs, state)) {
+      if (rhs->type == lhs->type)
 	 return rhs;
    }
 
@@ -699,7 +746,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
                     "%s of type %s cannot be assigned to "
                     "variable of type %s",
                     is_initializer ? "initializer" : "value",
-                    rhs->type->name, lhs_type->name);
+                    rhs->type->name, lhs->type->name);
 
    return NULL;
 }
@@ -734,7 +781,7 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
 
       if (unlikely(lhs_expr->operation == ir_binop_vector_extract)) {
          ir_rvalue *new_rhs =
-            validate_assignment(state, lhs_loc, lhs->type,
+            validate_assignment(state, lhs_loc, lhs,
                                 rhs, is_initializer);
 
          if (new_rhs == NULL) {
@@ -796,7 +843,7 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
    }
 
    ir_rvalue *new_rhs =
-      validate_assignment(state, lhs_loc, lhs->type, rhs, is_initializer);
+      validate_assignment(state, lhs_loc, lhs, rhs, is_initializer);
    if (new_rhs != NULL) {
       rhs = new_rhs;
 
@@ -3058,7 +3105,7 @@ process_initializer(ir_variable *var, ast_declaration *decl,
    if (type->qualifier.flags.q.constant
        || type->qualifier.flags.q.uniform) {
       ir_rvalue *new_rhs = validate_assignment(state, initializer_loc,
-                                               var->type, rhs, true);
+                                               lhs, rhs, true);
       if (new_rhs != NULL) {
          rhs = new_rhs;
 

From 61846f222fffeba846f9f7277aba9cc7d48323ed Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Mon, 1 Sep 2014 20:48:09 +1200
Subject: [PATCH 0517/1208] glsl: properly size unsized arrays in tess stages

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_to_hir.cpp | 49 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 8e503cfbfa3..dad8bceb41a 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3275,6 +3275,33 @@ handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
                                           "geometry shader input");
 }
 
+/**
+ * Do additional processing necessary for tessellation control/evaluation shader
+ * input declarations. This covers both interface block arrays and bare input
+ * variables.
+ */
+static void
+handle_tess_shader_input_decl(struct _mesa_glsl_parse_state *state,
+                              YYLTYPE loc, ir_variable *var)
+{
+   if (!var->type->is_array() && !var->data.patch) {
+      _mesa_glsl_error(&loc, state,
+                       "per-vertex tessellation shader inputs must be arrays");
+      /* Avoid cascading failures. */
+      return;
+   }
+
+   if (var->data.patch)
+      return;
+
+   /* Unsized arrays are implicitly sized to gl_MaxPatchVertices. */
+   if (var->type->is_unsized_array()) {
+      var->type = glsl_type::get_array_instance(var->type->fields.array,
+            state->Const.MaxPatchVertices);
+   }
+}
+
+
 /**
  * Do additional processing necessary for geometry shader input declarations
  * (this covers both interface blocks arrays and bare input variables).
@@ -3797,6 +3824,9 @@ ast_declarator_list::hir(exec_list *instructions,
                   }
                }
             }
+         } else if (state->stage == MESA_SHADER_TESS_CTRL ||
+                    state->stage == MESA_SHADER_TESS_EVAL) {
+            handle_tess_shader_input_decl(state, loc, var);
          }
       } else if (var->data.mode == ir_var_shader_out) {
          const glsl_type *check_type = var->type->without_array();
@@ -5959,8 +5989,18 @@ ast_interface_block::hir(exec_list *instructions,
    if (state->stage == MESA_SHADER_GEOMETRY && this->array_specifier == NULL &&
        var_mode == ir_var_shader_in) {
       _mesa_glsl_error(&loc, state, "geometry shader inputs must be arrays");
+   } else if ((state->stage == MESA_SHADER_TESS_CTRL ||
+               state->stage == MESA_SHADER_TESS_EVAL) &&
+              this->array_specifier == NULL &&
+              var_mode == ir_var_shader_in) {
+      _mesa_glsl_error(&loc, state, "per-vertex tessellation shader inputs must be arrays");
+   } else if (state->stage == MESA_SHADER_TESS_CTRL &&
+              this->array_specifier == NULL &&
+              var_mode == ir_var_shader_out) {
+      _mesa_glsl_error(&loc, state, "tessellation control shader outputs must be arrays");
    }
 
+
    /* Page 39 (page 45 of the PDF) of section 4.3.7 in the GLSL ES 3.00 spec
     * says:
     *
@@ -6072,6 +6112,11 @@ ast_interface_block::hir(exec_list *instructions,
 
       if (state->stage == MESA_SHADER_GEOMETRY && var_mode == ir_var_shader_in)
          handle_geometry_shader_input_decl(state, loc, var);
+      else if ((state->stage == MESA_SHADER_TESS_CTRL ||
+           state->stage == MESA_SHADER_TESS_EVAL) && var_mode == ir_var_shader_in)
+         handle_tess_shader_input_decl(state, loc, var);
+      else if (state->stage == MESA_SHADER_TESS_CTRL && var_mode == ir_var_shader_out)
+         handle_tess_ctrl_shader_output_decl(state, loc, var);
 
       if (ir_variable *earlier =
           state->symbols->get_variable(this->instance_name)) {
@@ -6254,8 +6299,8 @@ ast_tcs_output_layout::hir(exec_list *instructions,
 	 continue;
 
       /* Note: Not all tessellation control shader output are arrays. */
-      if (!var->type->is_unsized_array())
-	 continue;
+      if (!var->type->is_unsized_array() || var->data.patch)
+         continue;
 
       if (var->data.max_array_access >= num_vertices) {
 	 _mesa_glsl_error(&loc, state,

From 567f1b2ee89bf05f0600e9e79847140555f0a035 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 21 Jul 2014 21:59:37 -0400
Subject: [PATCH 0518/1208] glsl: pass shader stage to lower_output_reads and
 handle tess control

Tessellation control outputs can be read in directly without first
having been written. Accessing these will require some special logic
anyways, so just let them through.

V2: Never lower tess control output reads, whether patch or not -- both
can be read back by other threads.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ir_optimization.h                 |  2 +-
 src/glsl/lower_output_reads.cpp            | 13 +++++++++----
 src/mesa/drivers/dri/i965/brw_shader.cpp   |  2 +-
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  2 +-
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index a174c9683a1..766b723df0b 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -120,7 +120,7 @@ bool lower_variable_index_to_cond_assign(gl_shader_stage stage,
 bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
 bool lower_const_arrays_to_uniforms(exec_list *instructions);
 bool lower_clip_distance(gl_shader *shader);
-void lower_output_reads(exec_list *instructions);
+void lower_output_reads(unsigned stage, exec_list *instructions);
 bool lower_packing_builtins(exec_list *instructions, int op_mask);
 void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
 void lower_packed_varyings(void *mem_ctx,
diff --git a/src/glsl/lower_output_reads.cpp b/src/glsl/lower_output_reads.cpp
index 1ee815d5ece..79488df2932 100644
--- a/src/glsl/lower_output_reads.cpp
+++ b/src/glsl/lower_output_reads.cpp
@@ -48,8 +48,10 @@ protected:
    hash_table *replacements;
 
    void *mem_ctx;
+
+   unsigned stage;
 public:
-   output_read_remover();
+   output_read_remover(unsigned stage);
    ~output_read_remover();
    virtual ir_visitor_status visit(class ir_dereference_variable *);
    virtual ir_visitor_status visit_leave(class ir_emit_vertex *);
@@ -75,8 +77,9 @@ hash_table_var_hash(const void *key)
    return hash_table_string_hash(var->name);
 }
 
-output_read_remover::output_read_remover()
+output_read_remover::output_read_remover(unsigned stage)
 {
+   this->stage = stage;
    mem_ctx = ralloc_context(NULL);
    replacements =
       hash_table_ctor(0, hash_table_var_hash, hash_table_pointer_compare);
@@ -93,6 +96,8 @@ output_read_remover::visit(ir_dereference_variable *ir)
 {
    if (ir->var->data.mode != ir_var_shader_out)
       return visit_continue;
+   if (stage == MESA_SHADER_TESS_CTRL)
+      return visit_continue;
 
    ir_variable *temp = (ir_variable *) hash_table_find(replacements, ir->var);
 
@@ -166,8 +171,8 @@ output_read_remover::visit_leave(ir_function_signature *sig)
 }
 
 void
-lower_output_reads(exec_list *instructions)
+lower_output_reads(unsigned stage, exec_list *instructions)
 {
-   output_read_remover v;
+   output_read_remover v(stage);
    visit_list_elements(&v, instructions);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 379ff4a3a1a..9d60543c167 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -316,7 +316,7 @@ process_glsl_ir(gl_shader_stage stage,
    } while (progress);
 
    if (options->NirOptions != NULL)
-      lower_output_reads(shader->ir);
+      lower_output_reads(stage, shader->ir);
 
    validate_ir_tree(shader->ir);
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b727c5e9c93..0ed16aeab87 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5674,7 +5674,7 @@ get_mesa_program(struct gl_context *ctx,
                                                prog->Parameters);
 
    /* Remove reads from output registers. */
-   lower_output_reads(shader->ir);
+   lower_output_reads(shader->Stage, shader->ir);
 
    /* Emit intermediate IR for main(). */
    visit_exec_list(shader->ir, v);

From 0e94f350eeecd84cd5f15b10837b285bc9120684 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 7 Sep 2014 18:19:15 +1200
Subject: [PATCH 0519/1208] glsl: push vertex count determination down one
 level

We have the prog here, so we don't need the caller to work this out for
us.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/link_varyings.cpp | 12 ++++++------
 src/glsl/link_varyings.h   |  3 +--
 src/glsl/linker.cpp        | 13 ++++---------
 3 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index f7c219ece05..532204e44e9 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1356,9 +1356,6 @@ canonicalize_shader_io(exec_list *ir, enum ir_variable_mode io_mode)
  *        each of these objects that matches one of the outputs of the
  *        producer.
  *
- * \param gs_input_vertices: if \c consumer is a geometry shader, this is the
- *        number of input vertices it accepts.  Otherwise zero.
- *
  * When num_tfeedback_decls is nonzero, it is permissible for the consumer to
  * be NULL.  In this case, varying locations are assigned solely based on the
  * requirements of transform feedback.
@@ -1369,8 +1366,7 @@ assign_varying_locations(struct gl_context *ctx,
 			 struct gl_shader_program *prog,
 			 gl_shader *producer, gl_shader *consumer,
                          unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls,
-                         unsigned gs_input_vertices)
+                         tfeedback_decl *tfeedback_decls)
 {
    varying_matches matches(ctx->Const.DisableVaryingPacking,
                            consumer && consumer->Stage == MESA_SHADER_FRAGMENT);
@@ -1384,6 +1380,10 @@ assign_varying_locations(struct gl_context *ctx,
       NULL,
    };
 
+   unsigned consumer_vertices = 0;
+   if (consumer && consumer->Stage == MESA_SHADER_GEOMETRY)
+      consumer_vertices = prog->Geom.VerticesIn;
+
    /* Operate in a total of four passes.
     *
     * 1. Sort inputs / outputs into a canonical order.  This is necessary so
@@ -1523,7 +1523,7 @@ assign_varying_locations(struct gl_context *ctx,
       }
       if (consumer) {
          lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               gs_input_vertices, consumer);
+                               consumer_vertices, consumer);
       }
    }
 
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index c0ad53bf1da..2ce72d43d84 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -255,8 +255,7 @@ assign_varying_locations(struct gl_context *ctx,
 			 struct gl_shader_program *prog,
 			 gl_shader *producer, gl_shader *consumer,
                          unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls,
-                         unsigned gs_input_vertices);
+                         tfeedback_decl *tfeedback_decls);
 
 bool
 check_against_output_limit(struct gl_context *ctx,
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index b9e443acb1d..c74ec56f3a2 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3420,8 +3420,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
           */
          if (!assign_varying_locations(ctx, mem_ctx, prog,
                                        NULL, prog->_LinkedShaders[first],
-                                       num_tfeedback_decls, tfeedback_decls,
-                                       prog->Geom.VerticesIn))
+                                       num_tfeedback_decls, tfeedback_decls))
             goto done;
       }
 
@@ -3432,8 +3431,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
           */
          if (!assign_varying_locations(ctx, mem_ctx, prog,
                                        sh, NULL,
-                                       num_tfeedback_decls, tfeedback_decls,
-                                       0))
+                                       num_tfeedback_decls, tfeedback_decls))
             goto done;
       }
 
@@ -3461,8 +3459,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                        NULL /* producer */,
                                        sh /* consumer */,
                                        0 /* num_tfeedback_decls */,
-                                       NULL /* tfeedback_decls */,
-                                       0 /* gs_input_vertices */))
+                                       NULL /* tfeedback_decls */))
             goto done;
       } else
          demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
@@ -3478,12 +3475,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
       gl_shader *const sh_i = prog->_LinkedShaders[i];
       gl_shader *const sh_next = prog->_LinkedShaders[next];
-      unsigned gs_input_vertices =
-         next == MESA_SHADER_GEOMETRY ? prog->Geom.VerticesIn : 0;
 
       if (!assign_varying_locations(ctx, mem_ctx, prog, sh_i, sh_next,
                 next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
-                tfeedback_decls, gs_input_vertices))
+                tfeedback_decls))
          goto done;
 
       do_dead_builtin_varyings(ctx, sh_i, sh_next,

From df16e0dd63dfeb7d5086339113ff7d7197010847 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Tue, 9 Sep 2014 19:25:02 +1200
Subject: [PATCH 0520/1208] glsl: analyze TES usage of gl_ClipDistance

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/linker.cpp       | 19 +++++++++++++++++++
 src/mesa/main/mtypes.h    |  7 +++++++
 src/mesa/main/shaderapi.c |  1 +
 3 files changed, 27 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c74ec56f3a2..eeab3a3ca91 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -666,6 +666,17 @@ validate_vertex_shader_executable(struct gl_shader_program *prog,
                       &prog->Vert.ClipDistanceArraySize);
 }
 
+void
+validate_tess_eval_shader_executable(struct gl_shader_program *prog,
+                                     struct gl_shader *shader)
+{
+   if (shader == NULL)
+      return;
+
+   analyze_clip_usage(prog, shader, &prog->TessEval.UsesClipDistance,
+                      &prog->TessEval.ClipDistanceArraySize);
+}
+
 
 /**
  * Verify that a fragment shader executable meets all semantic requirements
@@ -3215,6 +3226,12 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          case MESA_SHADER_VERTEX:
             validate_vertex_shader_executable(prog, sh);
             break;
+         case MESA_SHADER_TESS_CTRL:
+            /* nothing to be done */
+            break;
+         case MESA_SHADER_TESS_EVAL:
+            validate_tess_eval_shader_executable(prog, sh);
+            break;
          case MESA_SHADER_GEOMETRY:
             validate_geometry_shader_executable(prog, sh);
             break;
@@ -3234,6 +3251,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
    if (num_shaders[MESA_SHADER_GEOMETRY] > 0)
       prog->LastClipDistanceArraySize = prog->Geom.ClipDistanceArraySize;
+   else if (num_shaders[MESA_SHADER_TESS_EVAL] > 0)
+      prog->LastClipDistanceArraySize = prog->TessEval.ClipDistanceArraySize;
    else if (num_shaders[MESA_SHADER_VERTEX] > 0)
       prog->LastClipDistanceArraySize = prog->Vert.ClipDistanceArraySize;
    else
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index f97468d2db7..85e04b5f18b 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2781,6 +2781,13 @@ struct gl_shader_program
       /** GL_CW or GL_CCW */
       GLenum VertexOrder;
       bool PointMode;
+      /**
+       * True if gl_ClipDistance is written to.  Copied into
+       * gl_tess_eval_program by _mesa_copy_linked_program_data().
+       */
+      GLboolean UsesClipDistance;
+      GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
+                                         0 if not present. */
    } TessEval;
 
    /**
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 40b4d417c22..38365908a86 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2057,6 +2057,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst_tep->Spacing = src->TessEval.Spacing;
       dst_tep->VertexOrder = src->TessEval.VertexOrder;
       dst_tep->PointMode = src->TessEval.PointMode;
+      dst->UsesClipDistanceOut = src->TessEval.UsesClipDistance;
       break;
    }
    case MESA_SHADER_GEOMETRY: {

From 799afadf51ad1ff0775a1bf7b4f3954a8d368b09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 14 Jun 2015 20:14:22 +0200
Subject: [PATCH 0521/1208] glsl: allow barrier() in tessellation control
 shaders

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/builtin_functions.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index efab2991993..2175c66cbd7 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -413,8 +413,8 @@ fp64(const _mesa_glsl_parse_state *state)
 static bool
 barrier_supported(const _mesa_glsl_parse_state *state)
 {
-   return state->stage == MESA_SHADER_COMPUTE;
-   /* TODO: || stage->state == MESA_SHADER_TESS_CTRL; */
+   return state->stage == MESA_SHADER_COMPUTE ||
+          state->stage == MESA_SHADER_TESS_CTRL;
 }
 
 /** @} */

From 8cf72972ce2fc7df83d0572745968bbcb41a8c92 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 7 Sep 2014 21:42:50 +1200
Subject: [PATCH 0522/1208] glsl: validate restrictions on use of barrier()

With the exception of always-taken switch cases (which are
indistinguishable from straight line code in our IR), this
disallows use of the builtin barrier() function in all the
places it may not appear.

Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/linker.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index eeab3a3ca91..d0445f16032 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -297,6 +297,97 @@ public:
    }
 };
 
+class barrier_use_visitor : public ir_hierarchical_visitor {
+public:
+   barrier_use_visitor(gl_shader_program *prog)
+      : prog(prog), in_main(false), after_return(false), control_flow(0)
+   {
+   }
+
+   virtual ~barrier_use_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_function *ir)
+   {
+      if (strcmp(ir->name, "main") == 0)
+         in_main = true;
+
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_function *ir)
+   {
+      in_main = false;
+      after_return = false;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_return *ir)
+   {
+      after_return = true;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_enter(ir_if *ir)
+   {
+      ++control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_if *ir)
+   {
+      --control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_enter(ir_loop *ir)
+   {
+      ++control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_loop *ir)
+   {
+      --control_flow;
+      return visit_continue;
+   }
+
+   /* FINISHME: `switch` is not expressed at the IR level -- it's already
+    * been lowered to a mess of `if`s. We'll correctly disallow any use of
+    * barrier() in a conditional path within the switch, but not in a path
+    * which is always hit.
+    */
+
+   virtual ir_visitor_status visit_enter(ir_call *ir)
+   {
+      if (ir->use_builtin && strcmp(ir->callee_name(), "barrier") == 0) {
+         /* Use of barrier(); determine if it is legal: */
+         if (!in_main) {
+            linker_error(prog, "Builtin barrier() may only be used in main");
+            return visit_stop;
+         }
+
+         if (after_return) {
+            linker_error(prog, "Builtin barrier() may not be used after return");
+            return visit_stop;
+         }
+
+         if (control_flow != 0) {
+            linker_error(prog, "Builtin barrier() may not be used inside control flow");
+            return visit_stop;
+         }
+      }
+      return visit_continue;
+   }
+
+private:
+   gl_shader_program *prog;
+   bool in_main, after_return;
+   int control_flow;
+};
+
 /**
  * Visitor that determines the highest stream id to which a (geometry) shader
  * emits vertices. It also checks whether End{Stream}Primitive is ever called.
@@ -2050,6 +2141,14 @@ link_intrastage_shaders(void *mem_ctx,
    if (ctx->Const.VertexID_is_zero_based)
       lower_vertex_id(linked);
 
+   /* Validate correct usage of barrier() in the tess control shader */
+   if (linked->Stage == MESA_SHADER_TESS_CTRL) {
+      barrier_use_visitor visitor(prog);
+      foreach_in_list(ir_instruction, ir, linked->ir) {
+         ir->accept(&visitor);
+      }
+   }
+
    /* Make a pass over all variable declarations to ensure that arrays with
     * unspecified sizes have a size specified.  The size is inferred from the
     * max_array_access field.

From 19f46d0540d9557a4d458ceb72f27ece28fa935e Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Fri, 12 Sep 2014 21:27:26 +1200
Subject: [PATCH 0523/1208] glsl: allow redeclaration of TCS gl_out[]

Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_to_hir.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index dad8bceb41a..1cc0731e932 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -5892,16 +5892,28 @@ ast_interface_block::hir(exec_list *instructions,
          if (ir_variable *earlier_gl_Position =
              state->symbols->get_variable("gl_Position")) {
             earlier_per_vertex = earlier_gl_Position->get_interface_type();
+         } else if (ir_variable *earlier_gl_out =
+               state->symbols->get_variable("gl_out")) {
+            earlier_per_vertex = earlier_gl_out->get_interface_type();
          } else {
             _mesa_glsl_error(&loc, state,
                              "redeclaration of gl_PerVertex output not "
                              "allowed in the %s shader",
                              _mesa_shader_stage_to_string(state->stage));
          }
-         if (this->instance_name != NULL) {
-            _mesa_glsl_error(&loc, state,
-                             "gl_PerVertex output may not be redeclared with "
-                             "an instance name");
+         if (state->stage == MESA_SHADER_TESS_CTRL) {
+            if (this->instance_name == NULL ||
+                strcmp(this->instance_name, "gl_out") != 0 || this->array_specifier == NULL) {
+               _mesa_glsl_error(&loc, state,
+                                "gl_PerVertex output must be redeclared as "
+                                "gl_out[]");
+            }
+         } else {
+            if (this->instance_name != NULL) {
+               _mesa_glsl_error(&loc, state,
+                                "gl_PerVertex output may not be redeclared with "
+                                "an instance name");
+            }
          }
          break;
       default:

From d5787e7eef7c42e4a90cbd89dee81efbf1491487 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 16 Jun 2015 02:43:55 +0200
Subject: [PATCH 0524/1208] glsl: allow indexing of gl_out with a non-const if
 length isn't known

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_array_index.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 50971bb22bb..27e84d101ec 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -216,6 +216,16 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
             if (v != NULL)
                v->data.max_array_access = implicit_size - 1;
          }
+         else if (state->stage == MESA_SHADER_TESS_CTRL &&
+                  array->variable_referenced()->data.mode == ir_var_shader_out &&
+                  !array->variable_referenced()->data.patch) {
+            /* Tessellation control shader output non-patch arrays are
+             * initially unsized. Despite that, they are allowed to be
+             * indexed with a non-constant expression (typically
+             * "gl_InvocationID"). The array size will be determined
+             * by the linker.
+             */
+         }
          else {
             _mesa_glsl_error(&loc, state, "unsized array index must be constant");
          }

From 3a4b87f26d6c8c12eb119d72bf46461a7a384ab9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 4 Oct 2014 00:15:33 +0200
Subject: [PATCH 0525/1208] glsl: disable varying packing between tessellation
 shaders

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/link_varyings.cpp | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 532204e44e9..30fce746e16 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1368,7 +1368,26 @@ assign_varying_locations(struct gl_context *ctx,
                          unsigned num_tfeedback_decls,
                          tfeedback_decl *tfeedback_decls)
 {
-   varying_matches matches(ctx->Const.DisableVaryingPacking,
+   if (ctx->Const.DisableVaryingPacking) {
+      /* Transform feedback code assumes varyings are packed, so if the driver
+       * has disabled varying packing, make sure it does not support transform
+       * feedback.
+       */
+      assert(!ctx->Extensions.EXT_transform_feedback);
+   }
+
+   /* Tessellation shaders treat inputs and outputs as shared memory and can
+    * access inputs and outputs of other invocations.
+    * Therefore, they can't be lowered to temps easily (and definitely not
+    * efficiently).
+    */
+   bool disable_varying_packing =
+      ctx->Const.DisableVaryingPacking ||
+      (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
+      (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
+      (producer && producer->Stage == MESA_SHADER_TESS_CTRL);
+
+   varying_matches matches(disable_varying_packing,
                            consumer && consumer->Stage == MESA_SHADER_FRAGMENT);
    hash_table *tfeedback_candidates
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
@@ -1510,13 +1529,7 @@ assign_varying_locations(struct gl_context *ctx,
    hash_table_dtor(consumer_inputs);
    hash_table_dtor(consumer_interface_inputs);
 
-   if (ctx->Const.DisableVaryingPacking) {
-      /* Transform feedback code assumes varyings are packed, so if the driver
-       * has disabled varying packing, make sure it does not support transform
-       * feedback.
-       */
-      assert(!ctx->Extensions.EXT_transform_feedback);
-   } else {
+   if (!disable_varying_packing) {
       if (producer) {
          lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
                                0, producer);

From 41acdae2e9eedb697a0f91815e201daf92d74ab4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 4 Oct 2014 00:13:42 +0200
Subject: [PATCH 0526/1208] glsl: don't demote tess control shader outputs

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/link_varyings.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 30fce746e16..43245b84a86 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1461,8 +1461,12 @@ assign_varying_locations(struct gl_context *ctx,
          /* If a matching input variable was found, add this ouptut (and the
           * input) to the set.  If this is a separable program and there is no
           * consumer stage, add the output.
+          *
+          * Always add TCS outputs. They are shared by all invocations
+          * within a patch and can be used as shared memory.
           */
-         if (input_var || (prog->SeparateShader && consumer == NULL)) {
+         if (input_var || (prog->SeparateShader && consumer == NULL) ||
+             producer->Type == GL_TESS_CONTROL_SHADER) {
             matches.record(output_var, input_var);
          }
 

From d07023894434325de850faabf005224f7b8ef4b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 16 May 2015 21:12:54 +0200
Subject: [PATCH 0527/1208] glsl: fix locations of 2-dimensional varyings
 without varying packing (v2)

v2: renamed producer/consumer_type -> producer/consumer_stage

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/link_varyings.cpp | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 43245b84a86..1104872ca3c 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -760,7 +760,9 @@ namespace {
 class varying_matches
 {
 public:
-   varying_matches(bool disable_varying_packing, bool consumer_is_fs);
+   varying_matches(bool disable_varying_packing,
+                   gl_shader_stage producer_stage,
+                   gl_shader_stage consumer_stage);
    ~varying_matches();
    void record(ir_variable *producer_var, ir_variable *consumer_var);
    unsigned assign_locations();
@@ -841,15 +843,18 @@ private:
     */
    unsigned matches_capacity;
 
-   const bool consumer_is_fs;
+   gl_shader_stage producer_stage;
+   gl_shader_stage consumer_stage;
 };
 
 } /* anonymous namespace */
 
 varying_matches::varying_matches(bool disable_varying_packing,
-                                 bool consumer_is_fs)
+                                 gl_shader_stage producer_stage,
+                                 gl_shader_stage consumer_stage)
    : disable_varying_packing(disable_varying_packing),
-     consumer_is_fs(consumer_is_fs)
+     producer_stage(producer_stage),
+     consumer_stage(consumer_stage)
 {
    /* Note: this initial capacity is rather arbitrarily chosen to be large
     * enough for many cases without wasting an unreasonable amount of space.
@@ -900,7 +905,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    }
 
    if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
-       !consumer_is_fs) {
+       consumer_stage != MESA_SHADER_FRAGMENT) {
       /* Since this varying is not being consumed by the fragment shader, its
        * interpolation type varying cannot possibly affect rendering.  Also,
        * this variable is non-flat and is (or contains) an integer.
@@ -937,9 +942,22 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
    if (this->disable_varying_packing) {
-      unsigned slots = var->type->is_array()
-         ? (var->type->length * var->type->fields.array->matrix_columns)
-         : var->type->matrix_columns;
+      const struct glsl_type *type = var->type;
+      unsigned slots;
+
+      /* Some shader stages have 2-dimensional varyings. Use the inner type. */
+      if (!var->data.patch &&
+          ((var == producer_var && producer_stage == MESA_SHADER_TESS_CTRL) ||
+           (var == consumer_var && (consumer_stage == MESA_SHADER_TESS_CTRL ||
+                                    consumer_stage == MESA_SHADER_TESS_EVAL ||
+                                    consumer_stage == MESA_SHADER_GEOMETRY)))) {
+         assert(type->is_array());
+         type = type->fields.array;
+      }
+
+      slots = (type->is_array()
+            ? (type->length * type->fields.array->matrix_columns)
+            : type->matrix_columns);
       this->matches[this->num_matches].num_components = 4 * slots;
    } else {
       this->matches[this->num_matches].num_components
@@ -1388,7 +1406,8 @@ assign_varying_locations(struct gl_context *ctx,
       (producer && producer->Stage == MESA_SHADER_TESS_CTRL);
 
    varying_matches matches(disable_varying_packing,
-                           consumer && consumer->Stage == MESA_SHADER_FRAGMENT);
+                           producer ? producer->Stage : (gl_shader_stage)-1,
+                           consumer ? consumer->Stage : (gl_shader_stage)-1);
    hash_table *tfeedback_candidates
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
    hash_table *consumer_inputs

From 0af240e9401c12f4237f4a36a2474fe2cc590404 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 13 Jun 2015 13:50:12 +0200
Subject: [PATCH 0528/1208] glsl: use separate varying slots for patch varyings

The idea is to allow 32 normal varyings and 32 patch varyings,
a total of 64. Previously, only a total of 32 was allowed.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_to_hir.cpp            |  5 ++++-
 src/glsl/ir_set_program_inouts.cpp | 27 ++++++++++++++++++++++++---
 src/glsl/link_varyings.cpp         | 23 ++++++++++++++++-------
 src/mesa/main/mtypes.h             |  6 +++++-
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 1cc0731e932..a45b6e8f026 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2367,7 +2367,10 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
          case MESA_SHADER_TESS_CTRL:
          case MESA_SHADER_TESS_EVAL:
          case MESA_SHADER_GEOMETRY:
-            var->data.location = qual->location + VARYING_SLOT_VAR0;
+            if (var->data.patch)
+               var->data.location = qual->location + VARYING_SLOT_PATCH0;
+            else
+               var->data.location = qual->location + VARYING_SLOT_VAR0;
             break;
 
          case MESA_SHADER_FRAGMENT:
diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp
index fe4d6da45db..b7a0f6e95ba 100644
--- a/src/glsl/ir_set_program_inouts.cpp
+++ b/src/glsl/ir_set_program_inouts.cpp
@@ -103,10 +103,26 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
    for (int i = 0; i < len; i++) {
       bool dual_slot = is_dual_slot(var);
       int idx = var->data.location + var->data.index + offset + i;
-      GLbitfield64 bitfield = BITFIELD64_BIT(idx);
+      bool is_patch_generic = var->data.patch &&
+                              idx != VARYING_SLOT_TESS_LEVEL_INNER &&
+                              idx != VARYING_SLOT_TESS_LEVEL_OUTER;
+      GLbitfield64 bitfield;
+
+      if (is_patch_generic) {
+         assert(idx >= VARYING_SLOT_PATCH0 && idx < VARYING_SLOT_TESS_MAX);
+         bitfield = BITFIELD64_BIT(idx - VARYING_SLOT_PATCH0);
+      }
+      else {
+         assert(idx < VARYING_SLOT_MAX);
+         bitfield = BITFIELD64_BIT(idx);
+      }
 
       if (var->data.mode == ir_var_shader_in) {
-         prog->InputsRead |= bitfield;
+         if (is_patch_generic)
+            prog->PatchInputsRead |= bitfield;
+         else
+            prog->InputsRead |= bitfield;
+
          if (dual_slot)
             prog->DoubleInputsRead |= bitfield;
          if (is_fragment_shader) {
@@ -122,7 +138,10 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
          prog->SystemValuesRead |= bitfield;
       } else {
          assert(var->data.mode == ir_var_shader_out);
-	 prog->OutputsWritten |= bitfield;
+         if (is_patch_generic)
+            prog->PatchOutputsWritten |= bitfield;
+         else
+            prog->OutputsWritten |= bitfield;
       }
    }
 }
@@ -406,6 +425,8 @@ do_set_program_inouts(exec_list *instructions, struct gl_program *prog,
 
    prog->InputsRead = 0;
    prog->OutputsWritten = 0;
+   prog->PatchInputsRead = 0;
+   prog->PatchOutputsWritten = 0;
    prog->SystemValuesRead = 0;
    if (shader_stage == MESA_SHADER_FRAGMENT) {
       gl_fragment_program *fprog = (gl_fragment_program *) prog;
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 1104872ca3c..1c52ff35f44 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -985,8 +985,17 @@ varying_matches::assign_locations()
          &varying_matches::match_comparator);
 
    unsigned generic_location = 0;
+   unsigned generic_patch_location = MAX_VARYING*4;
 
    for (unsigned i = 0; i < this->num_matches; i++) {
+      unsigned *location = &generic_location;
+
+      if ((this->matches[i].consumer_var &&
+           this->matches[i].consumer_var->data.patch) ||
+          (this->matches[i].producer_var &&
+           this->matches[i].producer_var->data.patch))
+         location = &generic_patch_location;
+
       /* Advance to the next slot if this varying has a different packing
        * class than the previous one, and we're not already on a slot
        * boundary.
@@ -994,12 +1003,12 @@ varying_matches::assign_locations()
       if (i > 0 &&
           this->matches[i - 1].packing_class
           != this->matches[i].packing_class) {
-         generic_location = ALIGN(generic_location, 4);
+         *location = ALIGN(*location, 4);
       }
 
-      this->matches[i].generic_location = generic_location;
+      this->matches[i].generic_location = *location;
 
-      generic_location += this->matches[i].num_components;
+      *location += this->matches[i].num_components;
    }
 
    return (generic_location + 3) / 4;
@@ -1213,11 +1222,11 @@ bool
 populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
                              hash_table *consumer_inputs,
                              hash_table *consumer_interface_inputs,
-                             ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX])
+                             ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX])
 {
    memset(consumer_inputs_with_locations,
           0,
-          sizeof(consumer_inputs_with_locations[0]) * VARYING_SLOT_MAX);
+          sizeof(consumer_inputs_with_locations[0]) * VARYING_SLOT_TESS_MAX);
 
    foreach_in_list(ir_instruction, node, ir) {
       ir_variable *const input_var = node->as_variable();
@@ -1273,7 +1282,7 @@ get_matching_input(void *mem_ctx,
                    const ir_variable *output_var,
                    hash_table *consumer_inputs,
                    hash_table *consumer_interface_inputs,
-                   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX])
+                   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX])
 {
    ir_variable *input_var;
 
@@ -1414,7 +1423,7 @@ assign_varying_locations(struct gl_context *ctx,
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
    hash_table *consumer_interface_inputs
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
-   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX] = {
+   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX] = {
       NULL,
    };
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 85e04b5f18b..b13271bbd06 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -243,7 +243,9 @@ typedef enum
    VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
    VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
    VARYING_SLOT_VAR0, /* First generic varying slot */
-   VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING
+   VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING,
+   VARYING_SLOT_PATCH0 = VARYING_SLOT_MAX,
+   VARYING_SLOT_TESS_MAX = VARYING_SLOT_PATCH0 + MAX_VARYING
 } gl_varying_slot;
 
 
@@ -2103,6 +2105,8 @@ struct gl_program
    GLbitfield64 InputsRead;     /**< Bitmask of which input regs are read */
    GLbitfield64 DoubleInputsRead;     /**< Bitmask of which input regs are read  and are doubles */
    GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */
+   GLbitfield PatchInputsRead;  /**< VAR[0..31] usage for patch inputs (user-defined only) */
+   GLbitfield PatchOutputsWritten; /**< VAR[0..31] usage for patch outputs (user-defined only) */
    GLbitfield SystemValuesRead;   /**< Bitmask of SYSTEM_VALUE_x inputs used */
    GLbitfield TexturesUsed[MAX_COMBINED_TEXTURE_IMAGE_UNITS];  /**< TEXTURE_x_BIT bitmask */
    GLbitfield SamplersUsed;   /**< Bitfield of which samplers are used */

From df4ee8ef366c60ad41502d4e45e0347c1ef1e348 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 16 Jun 2015 13:07:04 +0200
Subject: [PATCH 0529/1208] mesa: implement GL_IS_PER_PATCH

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/shader_query.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index c58d3474df7..59b9396f19b 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1048,9 +1048,15 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          goto invalid_operation;
       *val = RESOURCE_VAR(res)->data.index;
       return 1;
-
-   /* GL_ARB_tessellation_shader */
    case GL_IS_PER_PATCH:
+      switch (res->Type) {
+      case GL_PROGRAM_INPUT:
+      case GL_PROGRAM_OUTPUT:
+         *val = RESOURCE_VAR(res)->data.patch;
+         return 1;
+      default:
+         goto invalid_operation;
+      }
    default:
       goto invalid_enum;
    }

From ba9fb96f86344f1631b82114bb0ce6f926d3853a Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 11 Jul 2014 22:11:21 -0400
Subject: [PATCH 0530/1208] st/mesa: add tessellation shader states

additional fixes by Marek
---
 src/mesa/state_tracker/st_atom.c           |   2 +
 src/mesa/state_tracker/st_atom.h           |   2 +
 src/mesa/state_tracker/st_atom_shader.c    |  72 ++++++++
 src/mesa/state_tracker/st_cb_program.c     |  58 +++++++
 src/mesa/state_tracker/st_context.c        |   2 +
 src/mesa/state_tracker/st_context.h        |   6 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |   4 +
 src/mesa/state_tracker/st_program.c        | 192 +++++++++++++++++++++
 src/mesa/state_tracker/st_program.h        | 122 ++++++++++++-
 9 files changed, 459 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 428f2d9d7d7..1c3eb7ec45a 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -49,6 +49,8 @@ static const struct st_tracked_state *atoms[] =
    &st_finalize_textures,
    &st_update_fp,
    &st_update_gp,
+   &st_update_tep,
+   &st_update_tcp,
    &st_update_vp,
 
    &st_update_rasterizer,
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index c50111d501f..bec4a810406 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -52,6 +52,8 @@ extern const struct st_tracked_state st_update_clip;
 extern const struct st_tracked_state st_update_depth_stencil_alpha;
 extern const struct st_tracked_state st_update_fp;
 extern const struct st_tracked_state st_update_gp;
+extern const struct st_tracked_state st_update_tep;
+extern const struct st_tracked_state st_update_tcp;
 extern const struct st_tracked_state st_update_vp;
 extern const struct st_tracked_state st_update_rasterizer;
 extern const struct st_tracked_state st_update_polygon_stipple;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index ad8d2624fc9..d27882def7b 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -210,3 +210,75 @@ const struct st_tracked_state st_update_gp = {
    },
    update_gp  				/* update */
 };
+
+
+
+static void
+update_tcp( struct st_context *st )
+{
+   struct st_tessctrl_program *sttcp;
+   struct st_tcp_variant_key key;
+
+   if (!st->ctx->TessCtrlProgram._Current) {
+      cso_set_tessctrl_shader_handle(st->cso_context, NULL);
+      return;
+   }
+
+   sttcp = st_tessctrl_program(st->ctx->TessCtrlProgram._Current);
+   assert(sttcp->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
+
+   memset(&key, 0, sizeof(key));
+   key.st = st;
+
+   st->tcp_variant = st_get_tcp_variant(st, sttcp, &key);
+
+   st_reference_tesscprog(st, &st->tcp, sttcp);
+
+   cso_set_tessctrl_shader_handle(st->cso_context,
+                                  st->tcp_variant->driver_shader);
+}
+
+const struct st_tracked_state st_update_tcp = {
+   "st_update_tcp",			/* name */
+   {					/* dirty */
+      0,				/* mesa */
+      ST_NEW_TESSCTRL_PROGRAM           /* st */
+   },
+   update_tcp  				/* update */
+};
+
+
+
+static void
+update_tep( struct st_context *st )
+{
+   struct st_tesseval_program *sttep;
+   struct st_tep_variant_key key;
+
+   if (!st->ctx->TessEvalProgram._Current) {
+      cso_set_tesseval_shader_handle(st->cso_context, NULL);
+      return;
+   }
+
+   sttep = st_tesseval_program(st->ctx->TessEvalProgram._Current);
+   assert(sttep->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
+
+   memset(&key, 0, sizeof(key));
+   key.st = st;
+
+   st->tep_variant = st_get_tep_variant(st, sttep, &key);
+
+   st_reference_tesseprog(st, &st->tep, sttep);
+
+   cso_set_tesseval_shader_handle(st->cso_context,
+                                  st->tep_variant->driver_shader);
+}
+
+const struct st_tracked_state st_update_tep = {
+   "st_update_tep",			/* name */
+   {					/* dirty */
+      0,				/* mesa */
+      ST_NEW_TESSEVAL_PROGRAM           /* st */
+   },
+   update_tep  				/* update */
+};
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 6aa7d5796d9..3029909d12d 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -68,6 +68,12 @@ st_bind_program(struct gl_context *ctx, GLenum target, struct gl_program *prog)
    case GL_GEOMETRY_PROGRAM_NV:
       st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
+      break;
    }
 }
 
@@ -84,6 +90,8 @@ st_use_program(struct gl_context *ctx, struct gl_shader_program *shProg)
    st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
    st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
    st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
+   st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+   st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
 }
 
 
@@ -110,6 +118,16 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id)
       return _mesa_init_geometry_program(ctx, &prog->Base, target, id);
    }
 
+   case GL_TESS_CONTROL_PROGRAM_NV: {
+      struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program);
+      return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id);
+   }
+
+   case GL_TESS_EVALUATION_PROGRAM_NV: {
+      struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program);
+      return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id);
+   }
+
    default:
       assert(0);
       return NULL;
@@ -157,6 +175,28 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
             free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         struct st_tessctrl_program *sttcp =
+            (struct st_tessctrl_program *) prog;
+
+         st_release_tcp_variants(st, sttcp);
+
+         if (sttcp->glsl_to_tgsi)
+            free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi);
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         struct st_tesseval_program *sttep =
+            (struct st_tesseval_program *) prog;
+
+         st_release_tep_variants(st, sttep);
+
+         if (sttep->glsl_to_tgsi)
+            free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi);
+      }
+      break;
    default:
       assert(0); /* problem */
    }
@@ -214,6 +254,24 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->vp == stvp)
 	 st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
    }
+   else if (target == GL_TESS_CONTROL_PROGRAM_NV) {
+      struct st_tessctrl_program *sttcp =
+         (struct st_tessctrl_program *) prog;
+
+      st_release_tcp_variants(st, sttcp);
+
+      if (st->tcp == sttcp)
+         st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+   }
+   else if (target == GL_TESS_EVALUATION_PROGRAM_NV) {
+      struct st_tesseval_program *sttep =
+         (struct st_tesseval_program *) prog;
+
+      st_release_tep_variants(st, sttep);
+
+      if (st->tep == sttep)
+         st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
+   }
 
    if (ST_DEBUG & DEBUG_PRECOMPILE)
       st_precompile_shader_variant(st, prog);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 62a0fbee3bb..0c80fa1179d 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -374,6 +374,8 @@ void st_destroy_context( struct st_context *st )
    st_reference_fragprog(st, &st->fp, NULL);
    st_reference_geomprog(st, &st->gp, NULL);
    st_reference_vertprog(st, &st->vp, NULL);
+   st_reference_tesscprog(st, &st->tcp, NULL);
+   st_reference_tesseprog(st, &st->tep, NULL);
 
    /* release framebuffer surfaces */
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index dac5a4b9006..301072742cf 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -58,6 +58,8 @@ struct u_upload_mgr;
 #define ST_NEW_VERTEX_ARRAYS           (1 << 6)
 #define ST_NEW_RASTERIZER              (1 << 7)
 #define ST_NEW_UNIFORM_BUFFER          (1 << 8)
+#define ST_NEW_TESSCTRL_PROGRAM        (1 << 9)
+#define ST_NEW_TESSEVAL_PROGRAM        (1 << 10)
 
 
 struct st_state_flags {
@@ -147,10 +149,14 @@ struct st_context
    struct st_vertex_program *vp;    /**< Currently bound vertex program */
    struct st_fragment_program *fp;  /**< Currently bound fragment program */
    struct st_geometry_program *gp;  /**< Currently bound geometry program */
+   struct st_tessctrl_program *tcp; /**< Currently bound tess control program */
+   struct st_tesseval_program *tep; /**< Currently bound tess eval program */
 
    struct st_vp_variant *vp_variant;
    struct st_fp_variant *fp_variant;
    struct st_gp_variant *gp_variant;
+   struct st_tcp_variant *tcp_variant;
+   struct st_tep_variant *tep_variant;
 
    struct gl_texture_object *default_texture;
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 0ed16aeab87..be3ae6fc47d 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5622,6 +5622,10 @@ shader_stage_to_ptarget(gl_shader_stage stage)
       return PIPE_SHADER_FRAGMENT;
    case MESA_SHADER_GEOMETRY:
       return PIPE_SHADER_GEOMETRY;
+   case MESA_SHADER_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case MESA_SHADER_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
    case MESA_SHADER_COMPUTE:
       return PIPE_SHADER_COMPUTE;
    }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index fa792bc349b..99257949ba3 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -163,6 +163,68 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp)
 }
 
 
+/**
+ * Delete a tessellation control program variant.  Note the caller must unlink
+ * the variant from the linked list.
+ */
+static void
+delete_tcp_variant(struct st_context *st, struct st_tcp_variant *tcpv)
+{
+   if (tcpv->driver_shader)
+      cso_delete_tessctrl_shader(st->cso_context, tcpv->driver_shader);
+
+   free(tcpv);
+}
+
+
+/**
+ * Free all variants of a tessellation control program.
+ */
+void
+st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp)
+{
+   struct st_tcp_variant *tcpv;
+
+   for (tcpv = sttcp->variants; tcpv; ) {
+      struct st_tcp_variant *next = tcpv->next;
+      delete_tcp_variant(st, tcpv);
+      tcpv = next;
+   }
+
+   sttcp->variants = NULL;
+}
+
+
+/**
+ * Delete a tessellation evaluation program variant.  Note the caller must
+ * unlink the variant from the linked list.
+ */
+static void
+delete_tep_variant(struct st_context *st, struct st_tep_variant *tepv)
+{
+   if (tepv->driver_shader)
+      cso_delete_tesseval_shader(st->cso_context, tepv->driver_shader);
+
+   free(tepv);
+}
+
+
+/**
+ * Free all variants of a tessellation evaluation program.
+ */
+void
+st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep)
+{
+   struct st_tep_variant *tepv;
+
+   for (tepv = sttep->variants; tepv; ) {
+      struct st_tep_variant *next = tepv->next;
+      delete_tep_variant(st, tepv);
+      tepv = next;
+   }
+
+   sttep->variants = NULL;
+}
 
 
 /**
@@ -1167,6 +1229,92 @@ st_get_gp_variant(struct st_context *st,
 }
 
 
+/**
+ * Translate a tessellation control program to create a new variant.
+ */
+static struct st_tcp_variant *
+st_translate_tessctrl_program(struct st_context *st,
+                              struct st_tessctrl_program *sttcp,
+                              const struct st_tcp_variant_key *key)
+{
+   return NULL; /* will be updated in the next commit */
+}
+
+
+/**
+ * Get/create tessellation control program variant.
+ */
+struct st_tcp_variant *
+st_get_tcp_variant(struct st_context *st,
+                  struct st_tessctrl_program *sttcp,
+                  const struct st_tcp_variant_key *key)
+{
+   struct st_tcp_variant *tcpv;
+
+   /* Search for existing variant */
+   for (tcpv = sttcp->variants; tcpv; tcpv = tcpv->next) {
+      if (memcmp(&tcpv->key, key, sizeof(*key)) == 0) {
+         break;
+      }
+   }
+
+   if (!tcpv) {
+      /* create new */
+      tcpv = st_translate_tessctrl_program(st, sttcp, key);
+      if (tcpv) {
+         /* insert into list */
+         tcpv->next = sttcp->variants;
+         sttcp->variants = tcpv;
+      }
+   }
+
+   return tcpv;
+}
+
+
+/**
+ * Translate a tessellation evaluation program to create a new variant.
+ */
+static struct st_tep_variant *
+st_translate_tesseval_program(struct st_context *st,
+                              struct st_tesseval_program *sttep,
+                              const struct st_tep_variant_key *key)
+{
+   return NULL; /* will be updated in the next commit */
+}
+
+
+/**
+ * Get/create tessellation evaluation program variant.
+ */
+struct st_tep_variant *
+st_get_tep_variant(struct st_context *st,
+                  struct st_tesseval_program *sttep,
+                  const struct st_tep_variant_key *key)
+{
+   struct st_tep_variant *tepv;
+
+   /* Search for existing variant */
+   for (tepv = sttep->variants; tepv; tepv = tepv->next) {
+      if (memcmp(&tepv->key, key, sizeof(*key)) == 0) {
+         break;
+      }
+   }
+
+   if (!tepv) {
+      /* create new */
+      tepv = st_translate_tesseval_program(st, sttep, key);
+      if (tepv) {
+         /* insert into list */
+         tepv->next = sttep->variants;
+         sttep->variants = tepv;
+      }
+   }
+
+   return tepv;
+}
+
+
 /**
  * Vert/Geom/Frag programs have per-context variants.  Free all the
  * variants attached to the given program which match the given context.
@@ -1240,6 +1388,48 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
          }
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         struct st_tessctrl_program *sttcp =
+            (struct st_tessctrl_program *) program;
+         struct st_tcp_variant *tcpv, **prevPtr = &sttcp->variants;
+
+         for (tcpv = sttcp->variants; tcpv; ) {
+            struct st_tcp_variant *next = tcpv->next;
+            if (tcpv->key.st == st) {
+               /* unlink from list */
+               *prevPtr = next;
+               /* destroy this variant */
+               delete_tcp_variant(st, tcpv);
+            }
+            else {
+               prevPtr = &tcpv->next;
+            }
+            tcpv = next;
+         }
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         struct st_tesseval_program *sttep =
+            (struct st_tesseval_program *) program;
+         struct st_tep_variant *tepv, **prevPtr = &sttep->variants;
+
+         for (tepv = sttep->variants; tepv; ) {
+            struct st_tep_variant *next = tepv->next;
+            if (tepv->key.st == st) {
+               /* unlink from list */
+               *prevPtr = next;
+               /* destroy this variant */
+               delete_tep_variant(st, tepv);
+            }
+            else {
+               prevPtr = &tepv->next;
+            }
+            tepv = next;
+         }
+      }
+      break;
    default:
       _mesa_problem(NULL, "Unexpected program target 0x%x in "
                     "destroy_program_variants_cb()", program->Target);
@@ -1276,6 +1466,8 @@ destroy_shader_program_variants_cb(GLuint key, void *data, void *userData)
    case GL_VERTEX_SHADER:
    case GL_FRAGMENT_SHADER:
    case GL_GEOMETRY_SHADER:
+   case GL_TESS_CONTROL_SHADER:
+   case GL_TESS_EVALUATION_SHADER:
       {
          destroy_program_variants(st, shader->Program);
       }
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index bb77eb6ed65..7013993fe38 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -188,7 +188,7 @@ struct st_gp_variant_key
  */
 struct st_gp_variant
 {
-   /* Parameters which generated this translated version of a vertex */
+   /* Parameters which generated this variant. */
    struct st_gp_variant_key key;
 
    void *driver_shader;
@@ -210,6 +210,76 @@ struct st_geometry_program
 
 
 
+/** Tessellation control program variant key */
+struct st_tcp_variant_key
+{
+   struct st_context *st;          /**< variants are per-context */
+   /* no other fields yet */
+};
+
+
+/**
+ * Tessellation control program variant.
+ */
+struct st_tcp_variant
+{
+   /* Parameters which generated this variant. */
+   struct st_tcp_variant_key key;
+
+   void *driver_shader;
+
+   struct st_tcp_variant *next;
+};
+
+
+/**
+ * Derived from Mesa gl_tess_ctrl_program:
+ */
+struct st_tessctrl_program
+{
+   struct gl_tess_ctrl_program Base;  /**< The Mesa tess ctrl program */
+   struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+
+   struct st_tcp_variant *variants;
+};
+
+
+
+/** Tessellation evaluation program variant key */
+struct st_tep_variant_key
+{
+   struct st_context *st;          /**< variants are per-context */
+   /* no other fields yet */
+};
+
+
+/**
+ * Tessellation evaluation program variant.
+ */
+struct st_tep_variant
+{
+   /* Parameters which generated this variant. */
+   struct st_tep_variant_key key;
+
+   void *driver_shader;
+
+   struct st_tep_variant *next;
+};
+
+
+/**
+ * Derived from Mesa gl_tess_eval_program:
+ */
+struct st_tesseval_program
+{
+   struct gl_tess_eval_program Base;  /**< The Mesa tess eval program */
+   struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+
+   struct st_tep_variant *variants;
+};
+
+
+
 static inline struct st_fragment_program *
 st_fragment_program( struct gl_fragment_program *fp )
 {
@@ -229,6 +299,18 @@ st_geometry_program( struct gl_geometry_program *gp )
    return (struct st_geometry_program *)gp;
 }
 
+static inline struct st_tessctrl_program *
+st_tessctrl_program( struct gl_tess_ctrl_program *tcp )
+{
+   return (struct st_tessctrl_program *)tcp;
+}
+
+static inline struct st_tesseval_program *
+st_tesseval_program( struct gl_tess_eval_program *tep )
+{
+   return (struct st_tesseval_program *)tep;
+}
+
 static inline void
 st_reference_vertprog(struct st_context *st,
                       struct st_vertex_program **ptr,
@@ -259,6 +341,26 @@ st_reference_fragprog(struct st_context *st,
                            (struct gl_program *) prog);
 }
 
+static inline void
+st_reference_tesscprog(struct st_context *st,
+                       struct st_tessctrl_program **ptr,
+                       struct st_tessctrl_program *prog)
+{
+   _mesa_reference_program(st->ctx,
+                           (struct gl_program **) ptr,
+                           (struct gl_program *) prog);
+}
+
+static inline void
+st_reference_tesseprog(struct st_context *st,
+                       struct st_tesseval_program **ptr,
+                       struct st_tesseval_program *prog)
+{
+   _mesa_reference_program(st->ctx,
+                           (struct gl_program **) ptr,
+                           (struct gl_program *) prog);
+}
+
 /**
  * This defines mapping from Mesa VARYING_SLOTs to TGSI GENERIC slots.
  */
@@ -302,6 +404,16 @@ st_get_gp_variant(struct st_context *st,
                   struct st_geometry_program *stgp,
                   const struct st_gp_variant_key *key);
 
+extern struct st_tcp_variant *
+st_get_tcp_variant(struct st_context *st,
+                   struct st_tessctrl_program *stgp,
+                   const struct st_tcp_variant_key *key);
+
+extern struct st_tep_variant *
+st_get_tep_variant(struct st_context *st,
+                   struct st_tesseval_program *stgp,
+                   const struct st_tep_variant_key *key);
+
 
 extern void
 st_prepare_vertex_program(struct gl_context *ctx,
@@ -324,6 +436,14 @@ extern void
 st_release_gp_variants(struct st_context *st,
                        struct st_geometry_program *stgp);
 
+extern void
+st_release_tcp_variants(struct st_context *st,
+                        struct st_tessctrl_program *stgp);
+
+extern void
+st_release_tep_variants(struct st_context *st,
+                        struct st_tesseval_program *stgp);
+
 extern void
 st_destroy_program_variants(struct st_context *st);
 

From a58a66fe8577940cf07530b6235a386950ae04f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 20:44:39 +0100
Subject: [PATCH 0531/1208] st/mesa: add conversion for tessellation shaders

Based on code from Ilia Mirkin <imirkin@alum.mit.edu>.
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  65 ++++-
 src/mesa/state_tracker/st_program.c        | 307 +++++++++++++++------
 2 files changed, 274 insertions(+), 98 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index be3ae6fc47d..e59cadcb8ec 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2243,7 +2243,10 @@ is_inout_array(unsigned stage, ir_variable *var, bool *is_2d)
 
    *is_2d = false;
 
-   if (stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) {
+   if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
+        (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
+        stage == MESA_SHADER_TESS_CTRL) &&
+       !var->data.patch) {
       if (!var->type->is_array())
          return false; /* a system value probably */
 
@@ -2355,7 +2358,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
 static void
 shrink_array_declarations(struct array_decl *arrays, unsigned count,
-                          GLbitfield64 usage_mask)
+                          GLbitfield64 usage_mask,
+                          GLbitfield patch_usage_mask)
 {
    unsigned i, j;
 
@@ -2367,8 +2371,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
 
       /* Shrink the beginning. */
       for (j = 0; j < decl->array_size; j++) {
-         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
-            break;
+         if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
+            if (patch_usage_mask &
+                BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
+               break;
+         }
+         else {
+            if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+               break;
+         }
 
          decl->mesa_index++;
          decl->array_size--;
@@ -2377,8 +2388,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
 
       /* Shrink the end. */
       for (j = decl->array_size-1; j >= 0; j--) {
-         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
-            break;
+         if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
+            if (patch_usage_mask &
+                BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
+               break;
+         }
+         else {
+            if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+               break;
+         }
 
          decl->array_size--;
       }
@@ -3553,7 +3571,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
 {
    int tempWritesSize = 0;
    unsigned *tempWrites = NULL;
-   unsigned outputWrites[MAX_PROGRAM_OUTPUTS];
+   unsigned outputWrites[VARYING_SLOT_TESS_MAX];
 
    memset(outputWrites, 0, sizeof(outputWrites));
 
@@ -3573,7 +3591,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
       }
 
       if (inst->dst[0].file == PROGRAM_OUTPUT) {
-         assert(inst->dst[0].index < MAX_PROGRAM_OUTPUTS);
+         assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites));
          prevWriteMask = outputWrites[inst->dst[0].index];
          outputWrites[inst->dst[0].index] |= inst->dst[0].writemask;
       } else if (inst->dst[0].file == PROGRAM_TEMPORARY) {
@@ -4527,6 +4545,14 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,
    TGSI_SEMANTIC_SAMPLEMASK,
+
+   /* Tessellation shaders
+    */
+   TGSI_SEMANTIC_TESSCOORD,
+   TGSI_SEMANTIC_VERTICESIN,
+   TGSI_SEMANTIC_PRIMID,
+   TGSI_SEMANTIC_TESSOUTER,
+   TGSI_SEMANTIC_TESSINNER,
 };
 
 /**
@@ -4651,6 +4677,9 @@ dst_register(struct st_translate *t, gl_register_file file, unsigned index,
       if (!array_id) {
          if (t->procType == TGSI_PROCESSOR_FRAGMENT)
             assert(index < FRAG_RESULT_MAX);
+         else if (t->procType == TGSI_PROCESSOR_TESS_CTRL ||
+                  t->procType == TGSI_PROCESSOR_TESS_EVAL)
+            assert(index < VARYING_SLOT_TESS_MAX);
          else
             assert(index < VARYING_SLOT_MAX);
 
@@ -5271,6 +5300,8 @@ st_translate_program(
           TGSI_SEMANTIC_VERTEXID_NOBASE);
    assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] ==
           TGSI_SEMANTIC_BASEVERTEX);
+   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] ==
+          TGSI_SEMANTIC_TESSCOORD);
 
    t = CALLOC_STRUCT(st_translate);
    if (!t) {
@@ -5313,6 +5344,8 @@ st_translate_program(
       }
       break;
    case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_TESS_EVAL:
+   case TGSI_PROCESSOR_TESS_CTRL:
       for (i = 0; i < numInputs; i++) {
          unsigned array_id = 0;
          unsigned array_size;
@@ -5347,6 +5380,8 @@ st_translate_program(
    case TGSI_PROCESSOR_FRAGMENT:
       break;
    case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_TESS_EVAL:
+   case TGSI_PROCESSOR_TESS_CTRL:
    case TGSI_PROCESSOR_VERTEX:
       for (i = 0; i < numOutputs; i++) {
          unsigned array_id = 0;
@@ -5750,9 +5785,9 @@ get_mesa_program(struct gl_context *ctx,
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
    shrink_array_declarations(v->input_arrays, v->num_input_arrays,
-                             prog->InputsRead);
+                             prog->InputsRead, prog->PatchInputsRead);
    shrink_array_declarations(v->output_arrays, v->num_output_arrays,
-                             prog->OutputsWritten);
+                             prog->OutputsWritten, prog->PatchOutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
@@ -5781,6 +5816,8 @@ get_mesa_program(struct gl_context *ctx,
    struct st_vertex_program *stvp;
    struct st_fragment_program *stfp;
    struct st_geometry_program *stgp;
+   struct st_tessctrl_program *sttcp;
+   struct st_tesseval_program *sttep;
 
    switch (shader->Type) {
    case GL_VERTEX_SHADER:
@@ -5795,6 +5832,14 @@ get_mesa_program(struct gl_context *ctx,
       stgp = (struct st_geometry_program *)prog;
       stgp->glsl_to_tgsi = v;
       break;
+   case GL_TESS_CONTROL_SHADER:
+      sttcp = (struct st_tessctrl_program *)prog;
+      sttcp->glsl_to_tgsi = v;
+      break;
+   case GL_TESS_EVALUATION_SHADER:
+      sttep = (struct st_tesseval_program *)prog;
+      sttep->glsl_to_tgsi = v;
+      break;
    default:
       assert(!"should not be reached");
       return NULL;
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 99257949ba3..e62dd7aab80 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -932,61 +932,52 @@ st_get_fp_variant(struct st_context *st,
 
 
 /**
- * Translate a geometry program to create a new variant.
+ * Translate a program. This is common code for geometry and tessellation
+ * shaders.
  */
-static struct st_gp_variant *
-st_translate_geometry_program(struct st_context *st,
-                              struct st_geometry_program *stgp,
-                              const struct st_gp_variant_key *key)
+static void
+st_translate_program_common(struct st_context *st,
+                            struct gl_program *prog,
+                            struct glsl_to_tgsi_visitor *glsl_to_tgsi,
+                            struct ureg_program *ureg,
+                            unsigned tgsi_processor,
+                            struct pipe_shader_state *out_state)
 {
-   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
-   GLuint inputMapping[VARYING_SLOT_MAX];
-   GLuint outputSlotToAttr[VARYING_SLOT_MAX];
-   GLuint outputMapping[VARYING_SLOT_MAX];
-   struct pipe_context *pipe = st->pipe;
+   GLuint inputSlotToAttr[VARYING_SLOT_TESS_MAX];
+   GLuint inputMapping[VARYING_SLOT_TESS_MAX];
+   GLuint outputSlotToAttr[VARYING_SLOT_TESS_MAX];
+   GLuint outputMapping[VARYING_SLOT_TESS_MAX];
    GLuint attr;
 
-   uint gs_num_inputs = 0;
-
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS];
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   uint num_inputs = 0;
 
-   ubyte gs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte gs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
-   uint gs_num_outputs = 0;
+   ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
+   uint num_outputs = 0;
 
    GLint i;
-   struct ureg_program *ureg;
-   struct pipe_shader_state state = {0};
-   struct st_gp_variant *gpv;
-
-   gpv = CALLOC_STRUCT(st_gp_variant);
-   if (!gpv)
-      return NULL;
-
-   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
-   if (ureg == NULL) {
-      free(gpv);
-      return NULL;
-   }
 
    memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr));
    memset(inputMapping, 0, sizeof(inputMapping));
    memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr));
    memset(outputMapping, 0, sizeof(outputMapping));
+   memset(out_state, 0, sizeof(*out_state));
 
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */
    for (attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      if ((stgp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) {
-         const GLuint slot = gs_num_inputs++;
+      if ((prog->InputsRead & BITFIELD64_BIT(attr)) != 0) {
+         const GLuint slot = num_inputs++;
 
          inputMapping[attr] = slot;
          inputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_PRIMITIVE_ID:
+            assert(tgsi_processor == TGSI_PROCESSOR_GEOMETRY);
             input_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
             input_semantic_index[slot] = 0;
             break;
@@ -1038,19 +1029,33 @@ st_translate_geometry_program(struct st_context *st,
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(attr >= VARYING_SLOT_VAR0 && attr < VARYING_SLOT_MAX);
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             input_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
-         break;
+            break;
          }
       }
    }
 
+   /* Also add patch inputs. */
+   for (attr = 0; attr < 32; attr++) {
+      if (prog->PatchInputsRead & (1 << attr)) {
+         GLuint slot = num_inputs++;
+         GLuint patch_attr = VARYING_SLOT_PATCH0 + attr;
+
+         inputMapping[patch_attr] = slot;
+         inputSlotToAttr[slot] = patch_attr;
+         input_semantic_name[slot] = TGSI_SEMANTIC_PATCH;
+         input_semantic_index[slot] = attr;
+      }
+   }
+
    /* initialize output semantics to defaults */
    for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
-      gs_output_semantic_name[i] = TGSI_SEMANTIC_GENERIC;
-      gs_output_semantic_index[i] = 0;
+      output_semantic_name[i] = TGSI_SEMANTIC_GENERIC;
+      output_semantic_index[i] = 0;
    }
 
    /*
@@ -1058,8 +1063,8 @@ st_translate_geometry_program(struct st_context *st,
     * mapping and the semantic information for each output.
     */
    for (attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      if (stgp->Base.Base.OutputsWritten & BITFIELD64_BIT(attr)) {
-         GLuint slot = gs_num_outputs++;
+      if (prog->OutputsWritten & BITFIELD64_BIT(attr)) {
+         GLuint slot = num_outputs++;
 
          outputMapping[attr] = slot;
          outputSlotToAttr[slot] = attr;
@@ -1067,56 +1072,64 @@ st_translate_geometry_program(struct st_context *st,
          switch (attr) {
          case VARYING_SLOT_POS:
             assert(slot == 0);
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_BFC0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_BFC1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_FOGC:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_PSIZ:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_VERTEX:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_LAYER:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_PRIMITIVE_ID:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_VIEWPORT:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
+            output_semantic_index[slot] = 0;
+            break;
+         case VARYING_SLOT_TESS_LEVEL_OUTER:
+            output_semantic_name[slot] = TGSI_SEMANTIC_TESSOUTER;
+            output_semantic_index[slot] = 0;
+            break;
+         case VARYING_SLOT_TESS_LEVEL_INNER:
+            output_semantic_name[slot] = TGSI_SEMANTIC_TESSINNER;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_TEX0:
          case VARYING_SLOT_TEX1:
@@ -1127,36 +1140,44 @@ st_translate_geometry_program(struct st_context *st,
          case VARYING_SLOT_TEX6:
          case VARYING_SLOT_TEX7:
             if (st->needs_texcoord_semantic) {
-               gs_output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
-               gs_output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
+               output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
+               output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
                break;
             }
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(slot < ARRAY_SIZE(gs_output_semantic_name));
-            assert(attr >= VARYING_SLOT_VAR0);
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            gs_output_semantic_index[slot] =
+            assert(slot < ARRAY_SIZE(output_semantic_name));
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
+            output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
+            output_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
-         break;
+            break;
          }
       }
    }
 
-   ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType);
-   ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType);
-   ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES,
-                 stgp->Base.VerticesOut);
-   ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations);
+   /* Also add patch outputs. */
+   for (attr = 0; attr < 32; attr++) {
+      if (prog->PatchOutputsWritten & (1 << attr)) {
+         GLuint slot = num_outputs++;
+         GLuint patch_attr = VARYING_SLOT_PATCH0 + attr;
+
+         outputMapping[patch_attr] = slot;
+         outputSlotToAttr[slot] = patch_attr;
+         output_semantic_name[slot] = TGSI_SEMANTIC_PATCH;
+         output_semantic_index[slot] = attr;
+      }
+   }
 
    st_translate_program(st->ctx,
-                        TGSI_PROCESSOR_GEOMETRY,
+                        tgsi_processor,
                         ureg,
-                        stgp->glsl_to_tgsi,
-                        &stgp->Base.Base,
+                        glsl_to_tgsi,
+                        prog,
                         /* inputs */
-                        gs_num_inputs,
+                        num_inputs,
                         inputMapping,
                         inputSlotToAttr,
                         input_semantic_name,
@@ -1164,30 +1185,64 @@ st_translate_geometry_program(struct st_context *st,
                         NULL,
                         NULL,
                         /* outputs */
-                        gs_num_outputs,
+                        num_outputs,
                         outputMapping,
                         outputSlotToAttr,
-                        gs_output_semantic_name,
-                        gs_output_semantic_index,
+                        output_semantic_name,
+                        output_semantic_index,
                         FALSE,
                         FALSE);
 
-   state.tokens = ureg_get_tokens(ureg, NULL);
+   out_state->tokens = ureg_get_tokens(ureg, NULL);
    ureg_destroy(ureg);
 
-   st_translate_stream_output_info(stgp->glsl_to_tgsi,
+   st_translate_stream_output_info(glsl_to_tgsi,
                                    outputMapping,
-                                   &state.stream_output);
+                                   &out_state->stream_output);
 
    if ((ST_DEBUG & DEBUG_TGSI) && (ST_DEBUG & DEBUG_MESA)) {
-      _mesa_print_program(&stgp->Base.Base);
+      _mesa_print_program(prog);
       debug_printf("\n");
    }
 
    if (ST_DEBUG & DEBUG_TGSI) {
-      tgsi_dump(state.tokens, 0);
+      tgsi_dump(out_state->tokens, 0);
       debug_printf("\n");
    }
+}
+
+
+/**
+ * Translate a geometry program to create a new variant.
+ */
+static struct st_gp_variant *
+st_translate_geometry_program(struct st_context *st,
+                              struct st_geometry_program *stgp,
+                              const struct st_gp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_gp_variant *gpv;
+   struct pipe_shader_state state;
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
+   if (ureg == NULL)
+      return NULL;
+
+   ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType);
+   ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType);
+   ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES,
+                 stgp->Base.VerticesOut);
+   ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations);
+
+   st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg,
+                               TGSI_PROCESSOR_GEOMETRY, &state);
+
+   gpv = CALLOC_STRUCT(st_gp_variant);
+   if (!gpv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
 
    /* fill in new variant */
    gpv->driver_shader = pipe->create_gs_state(pipe, &state);
@@ -1237,7 +1292,34 @@ st_translate_tessctrl_program(struct st_context *st,
                               struct st_tessctrl_program *sttcp,
                               const struct st_tcp_variant_key *key)
 {
-   return NULL; /* will be updated in the next commit */
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_tcp_variant *tcpv;
+   struct pipe_shader_state state;
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen);
+   if (ureg == NULL) {
+      return NULL;
+   }
+
+   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT,
+                 sttcp->Base.VerticesOut);
+
+   st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi,
+                               ureg, TGSI_PROCESSOR_TESS_CTRL, &state);
+
+   tcpv = CALLOC_STRUCT(st_tcp_variant);
+   if (!tcpv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
+
+   /* fill in new variant */
+   tcpv->driver_shader = pipe->create_tcs_state(pipe, &state);
+   tcpv->key = *key;
+
+   ureg_free_tokens(state.tokens);
+   return tcpv;
 }
 
 
@@ -1280,7 +1362,56 @@ st_translate_tesseval_program(struct st_context *st,
                               struct st_tesseval_program *sttep,
                               const struct st_tep_variant_key *key)
 {
-   return NULL; /* will be updated in the next commit */
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_tep_variant *tepv;
+   struct pipe_shader_state state;
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen);
+   if (ureg == NULL) {
+      return NULL;
+   }
+
+   if (sttep->Base.PrimitiveMode == GL_ISOLINES)
+      ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES);
+   else
+      ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, sttep->Base.PrimitiveMode);
+
+   switch (sttep->Base.Spacing) {
+   case GL_EQUAL:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, PIPE_TESS_SPACING_EQUAL);
+      break;
+   case GL_FRACTIONAL_EVEN:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING,
+                    PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+      break;
+   case GL_FRACTIONAL_ODD:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING,
+                    PIPE_TESS_SPACING_FRACTIONAL_ODD);
+      break;
+   default:
+      assert(0);
+   }
+
+   ureg_property(ureg, TGSI_PROPERTY_TES_VERTEX_ORDER_CW,
+                 sttep->Base.VertexOrder == GL_CW);
+   ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode);
+
+   st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi,
+                               ureg, TGSI_PROCESSOR_TESS_EVAL, &state);
+
+   tepv = CALLOC_STRUCT(st_tep_variant);
+   if (!tepv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
+
+   /* fill in new variant */
+   tepv->driver_shader = pipe->create_tes_state(pipe, &state);
+   tepv->key = *key;
+
+   ureg_free_tokens(state.tokens);
+   return tepv;
 }
 
 

From bda79139d4579b5105c45561401960a82bab2f7e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 17:10:59 -0400
Subject: [PATCH 0532/1208] st/mesa: handle constbufs/ubos for tessellation
 shaders

---
 src/mesa/state_tracker/st_atom.c          |  4 ++
 src/mesa/state_tracker/st_atom.h          |  4 ++
 src/mesa/state_tracker/st_atom_constbuf.c | 88 ++++++++++++++++++++++-
 3 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 1c3eb7ec45a..676b14c648d 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -66,9 +66,13 @@ static const struct st_tracked_state *atoms[] =
    &st_update_msaa,
    &st_update_sample_shading,
    &st_update_vs_constants,
+   &st_update_tcs_constants,
+   &st_update_tes_constants,
    &st_update_gs_constants,
    &st_update_fs_constants,
    &st_bind_vs_ubos,
+   &st_bind_tcs_ubos,
+   &st_bind_tes_ubos,
    &st_bind_fs_ubos,
    &st_bind_gs_ubos,
    &st_update_pixel_transfer,
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index bec4a810406..655b4e76388 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -69,10 +69,14 @@ extern const struct st_tracked_state st_update_geometry_texture;
 extern const struct st_tracked_state st_finalize_textures;
 extern const struct st_tracked_state st_update_fs_constants;
 extern const struct st_tracked_state st_update_gs_constants;
+extern const struct st_tracked_state st_update_tes_constants;
+extern const struct st_tracked_state st_update_tcs_constants;
 extern const struct st_tracked_state st_update_vs_constants;
 extern const struct st_tracked_state st_bind_fs_ubos;
 extern const struct st_tracked_state st_bind_vs_ubos;
 extern const struct st_tracked_state st_bind_gs_ubos;
+extern const struct st_tracked_state st_bind_tcs_ubos;
+extern const struct st_tracked_state st_bind_tes_ubos;
 extern const struct st_tracked_state st_update_pixel_transfer;
 
 
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index a54e0d9dbf5..6affb4d84d5 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -59,7 +59,9 @@ void st_upload_constants( struct st_context *st,
 {
    assert(shader_type == PIPE_SHADER_VERTEX ||
           shader_type == PIPE_SHADER_FRAGMENT ||
-          shader_type == PIPE_SHADER_GEOMETRY);
+          shader_type == PIPE_SHADER_GEOMETRY ||
+          shader_type == PIPE_SHADER_TESS_CTRL ||
+          shader_type == PIPE_SHADER_TESS_EVAL);
 
    /* update constants */
    if (params && params->NumParameters) {
@@ -178,6 +180,50 @@ const struct st_tracked_state st_update_gs_constants = {
    update_gs_constants					/* update */
 };
 
+/* Tessellation control shader:
+ */
+static void update_tcs_constants(struct st_context *st )
+{
+   struct st_tessctrl_program *tcp = st->tcp;
+   struct gl_program_parameter_list *params;
+
+   if (tcp) {
+      params = tcp->Base.Base.Parameters;
+      st_upload_constants( st, params, PIPE_SHADER_TESS_CTRL );
+   }
+}
+
+const struct st_tracked_state st_update_tcs_constants = {
+   "st_update_tcs_constants",				/* name */
+   {							/* dirty */
+      _NEW_PROGRAM_CONSTANTS,                           /* mesa */
+      ST_NEW_TESSCTRL_PROGRAM,				/* st */
+   },
+   update_tcs_constants					/* update */
+};
+
+/* Tessellation evaluation shader:
+ */
+static void update_tes_constants(struct st_context *st )
+{
+   struct st_tesseval_program *tep = st->tep;
+   struct gl_program_parameter_list *params;
+
+   if (tep) {
+      params = tep->Base.Base.Parameters;
+      st_upload_constants( st, params, PIPE_SHADER_TESS_EVAL );
+   }
+}
+
+const struct st_tracked_state st_update_tes_constants = {
+   "st_update_tes_constants",				/* name */
+   {							/* dirty */
+      _NEW_PROGRAM_CONSTANTS,                           /* mesa */
+      ST_NEW_TESSEVAL_PROGRAM,				/* st */
+   },
+   update_tes_constants					/* update */
+};
+
 static void st_bind_ubos(struct st_context *st,
                            struct gl_shader *shader,
                            unsigned shader_type)
@@ -275,3 +321,43 @@ const struct st_tracked_state st_bind_gs_ubos = {
    },
    bind_gs_ubos
 };
+
+static void bind_tcs_ubos(struct st_context *st)
+{
+   struct gl_shader_program *prog =
+      st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+
+   if (!prog)
+      return;
+
+   st_bind_ubos(st, prog->_LinkedShaders[MESA_SHADER_TESS_CTRL], PIPE_SHADER_TESS_CTRL);
+}
+
+const struct st_tracked_state st_bind_tcs_ubos = {
+   "st_bind_tcs_ubos",
+   {
+      0,
+      ST_NEW_TESSCTRL_PROGRAM | ST_NEW_UNIFORM_BUFFER,
+   },
+   bind_tcs_ubos
+};
+
+static void bind_tes_ubos(struct st_context *st)
+{
+   struct gl_shader_program *prog =
+      st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+   if (!prog)
+      return;
+
+   st_bind_ubos(st, prog->_LinkedShaders[MESA_SHADER_TESS_EVAL], PIPE_SHADER_TESS_EVAL);
+}
+
+const struct st_tracked_state st_bind_tes_ubos = {
+   "st_bind_tes_ubos",
+   {
+      0,
+      ST_NEW_TESSEVAL_PROGRAM | ST_NEW_UNIFORM_BUFFER,
+   },
+   bind_tes_ubos
+};

From 40bc1c32d2fb42207ea860053045fa49e45d80b9 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 17:15:51 -0400
Subject: [PATCH 0533/1208] st/mesa: add texture updates for tessellation
 programs

---
 src/mesa/state_tracker/st_atom.c         |  2 +
 src/mesa/state_tracker/st_atom.h         |  2 +
 src/mesa/state_tracker/st_atom_sampler.c | 16 ++++++++
 src/mesa/state_tracker/st_atom_texture.c | 52 ++++++++++++++++++++++++
 4 files changed, 72 insertions(+)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 676b14c648d..c97cd84da60 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -61,6 +61,8 @@ static const struct st_tracked_state *atoms[] =
    &st_update_vertex_texture,
    &st_update_fragment_texture,
    &st_update_geometry_texture,
+   &st_update_tessctrl_texture,
+   &st_update_tesseval_texture,
    &st_update_sampler, /* depends on update_*_texture for swizzle */
    &st_update_framebuffer,
    &st_update_msaa,
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 655b4e76388..bbfbd2d3f9f 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -66,6 +66,8 @@ extern const struct st_tracked_state st_update_sampler;
 extern const struct st_tracked_state st_update_fragment_texture;
 extern const struct st_tracked_state st_update_vertex_texture;
 extern const struct st_tracked_state st_update_geometry_texture;
+extern const struct st_tracked_state st_update_tessctrl_texture;
+extern const struct st_tracked_state st_update_tesseval_texture;
 extern const struct st_tracked_state st_finalize_textures;
 extern const struct st_tracked_state st_update_fs_constants;
 extern const struct st_tracked_state st_update_gs_constants;
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 4ab82bc9acb..4252c27962e 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -305,6 +305,22 @@ update_samplers(struct st_context *st)
                              st->state.samplers[PIPE_SHADER_GEOMETRY],
                              &st->state.num_samplers[PIPE_SHADER_GEOMETRY]);
    }
+   if (ctx->TessCtrlProgram._Current) {
+      update_shader_samplers(st,
+                             PIPE_SHADER_TESS_CTRL,
+                             &ctx->TessCtrlProgram._Current->Base,
+                             ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
+                             st->state.samplers[PIPE_SHADER_TESS_CTRL],
+                             &st->state.num_samplers[PIPE_SHADER_TESS_CTRL]);
+   }
+   if (ctx->TessEvalProgram._Current) {
+      update_shader_samplers(st,
+                             PIPE_SHADER_TESS_EVAL,
+                             &ctx->TessEvalProgram._Current->Base,
+                             ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
+                             st->state.samplers[PIPE_SHADER_TESS_EVAL],
+                             &st->state.num_samplers[PIPE_SHADER_TESS_EVAL]);
+   }
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index 04ba86448fc..ba3cf9beeb4 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -474,6 +474,38 @@ update_geometry_textures(struct st_context *st)
 }
 
 
+static void
+update_tessctrl_textures(struct st_context *st)
+{
+   const struct gl_context *ctx = st->ctx;
+
+   if (ctx->TessCtrlProgram._Current) {
+      update_textures(st,
+                      PIPE_SHADER_TESS_CTRL,
+                      &ctx->TessCtrlProgram._Current->Base,
+                      ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
+                      st->state.sampler_views[PIPE_SHADER_TESS_CTRL],
+                      &st->state.num_sampler_views[PIPE_SHADER_TESS_CTRL]);
+   }
+}
+
+
+static void
+update_tesseval_textures(struct st_context *st)
+{
+   const struct gl_context *ctx = st->ctx;
+
+   if (ctx->TessEvalProgram._Current) {
+      update_textures(st,
+                      PIPE_SHADER_TESS_EVAL,
+                      &ctx->TessEvalProgram._Current->Base,
+                      ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
+                      st->state.sampler_views[PIPE_SHADER_TESS_EVAL],
+                      &st->state.num_sampler_views[PIPE_SHADER_TESS_EVAL]);
+   }
+}
+
+
 const struct st_tracked_state st_update_fragment_texture = {
    "st_update_texture",					/* name */
    {							/* dirty */
@@ -504,6 +536,26 @@ const struct st_tracked_state st_update_geometry_texture = {
 };
 
 
+const struct st_tracked_state st_update_tessctrl_texture = {
+   "st_update_tessctrl_texture",			/* name */
+   {							/* dirty */
+      _NEW_TEXTURE,					/* mesa */
+      ST_NEW_TESSCTRL_PROGRAM,				/* st */
+   },
+   update_tessctrl_textures				/* update */
+};
+
+
+const struct st_tracked_state st_update_tesseval_texture = {
+   "st_update_tesseval_texture",			/* name */
+   {							/* dirty */
+      _NEW_TEXTURE,					/* mesa */
+      ST_NEW_TESSEVAL_PROGRAM,				/* st */
+   },
+   update_tesseval_textures				/* update */
+};
+
+
 
 static void
 finalize_textures(struct st_context *st)

From d00e2763b153c212e8f01af610ae305606044bcc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 17:28:24 -0400
Subject: [PATCH 0534/1208] st/mesa: query shader CAPs for tessellation

The MaxTessPatchComponents query added by Marek.
---
 src/mesa/state_tracker/st_extensions.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index b1057f3eadd..6c70df63037 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -165,6 +165,14 @@ void st_init_limits(struct pipe_screen *screen,
          pc = &c->Program[MESA_SHADER_GEOMETRY];
          options = &c->ShaderCompilerOptions[MESA_SHADER_GEOMETRY];
          break;
+      case PIPE_SHADER_TESS_CTRL:
+         pc = &c->Program[MESA_SHADER_TESS_CTRL];
+         options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_CTRL];
+         break;
+      case PIPE_SHADER_TESS_EVAL:
+         pc = &c->Program[MESA_SHADER_TESS_EVAL];
+         options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_EVAL];
+         break;
       default:
          /* compute shader, etc. */
          continue;
@@ -247,6 +255,8 @@ void st_init_limits(struct pipe_screen *screen,
 
    c->MaxCombinedTextureImageUnits =
          _min(c->Program[MESA_SHADER_VERTEX].MaxTextureImageUnits +
+              c->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits +
+              c->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits +
               c->Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits +
               c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
               MAX_COMBINED_TEXTURE_IMAGE_UNITS);
@@ -266,6 +276,9 @@ void st_init_limits(struct pipe_screen *screen,
    c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING);
    c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES);
    c->MaxGeometryTotalOutputComponents = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS);
+   c->MaxTessPatchComponents =
+      MAX2(screen->get_param(screen, PIPE_CAP_MAX_SHADER_PATCH_VARYINGS),
+           MAX_VARYING) * 4;
 
    c->MinProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET);
    c->MaxProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET);
@@ -301,6 +314,8 @@ void st_init_limits(struct pipe_screen *screen,
          screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
       c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings =
          c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks +
+         c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks +
+         c->Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks +
          c->Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks +
          c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks;
       assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS);

From 05c847433f5a3f3b2032bef32284ad7d6a2db850 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 20:09:06 -0400
Subject: [PATCH 0535/1208] st/mesa: lower gl_TessLevel from float[] to vecn

---
 src/mesa/state_tracker/st_extensions.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 6c70df63037..899442fe43e 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -253,6 +253,8 @@ void st_init_limits(struct pipe_screen *screen,
       options->LowerClipDistance = true;
    }
 
+   c->LowerTessLevel = true;
+
    c->MaxCombinedTextureImageUnits =
          _min(c->Program[MESA_SHADER_VERTEX].MaxTextureImageUnits +
               c->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits +

From c9998617a8f40ad7e65aca9c581f5bcc7f1d0f4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 20:47:31 +0100
Subject: [PATCH 0536/1208] st/mesa: handle tessellation 2D varyings correctly

---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 28 +++++++++++++++-------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index e59cadcb8ec..ac78c18ff70 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2409,22 +2409,34 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
    ir_constant *index;
    st_src_reg src;
    int element_size = type_size(ir->type);
-   bool is_2D_input;
+   bool is_2D = false;
 
    index = ir->array_index->constant_expression_value();
 
    ir->array->accept(this);
    src = this->result;
 
-   is_2D_input = this->prog->Target == GL_GEOMETRY_PROGRAM_NV &&
-                 src.file == PROGRAM_INPUT &&
-                 ir->array->ir_type != ir_type_dereference_array;
+   if (ir->array->ir_type != ir_type_dereference_array) {
+      switch (this->prog->Target) {
+      case GL_TESS_CONTROL_PROGRAM_NV:
+         is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
+                 !ir->variable_referenced()->data.patch;
+         break;
+      case GL_TESS_EVALUATION_PROGRAM_NV:
+         is_2D = src.file == PROGRAM_INPUT &&
+                 !ir->variable_referenced()->data.patch;
+         break;
+      case GL_GEOMETRY_PROGRAM_NV:
+         is_2D = src.file == PROGRAM_INPUT;
+         break;
+      }
+   }
 
-   if (is_2D_input)
+   if (is_2D)
       element_size = 1;
 
    if (index) {
-      if (is_2D_input) {
+      if (is_2D) {
          src.index2D = index->value.i[0];
          src.has_index2 = true;
       } else
@@ -2451,7 +2463,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       /* If there was already a relative address register involved, add the
        * new and the old together to get the new offset.
        */
-      if (!is_2D_input && src.reladdr != NULL) {
+      if (!is_2D && src.reladdr != NULL) {
          st_src_reg accum_reg = get_temp(native_integers ?
                                 glsl_type::int_type : glsl_type::float_type);
 
@@ -2461,7 +2473,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          index_reg = accum_reg;
       }
 
-      if (is_2D_input) {
+      if (is_2D) {
          src.reladdr2 = ralloc(mem_ctx, st_src_reg);
          memcpy(src.reladdr2, &index_reg, sizeof(index_reg));
          src.index2D = 0;

From 37d1809dd7cdfedbee4fcfef148fcdb1c7b43068 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 13 Jul 2014 15:02:53 -0400
Subject: [PATCH 0537/1208] st/mesa: add 2d indexing support to outputs

---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 47 +++++++++++++++++-----
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index ac78c18ff70..221b6751b26 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -158,9 +158,12 @@ public:
    {
       this->file = file;
       this->index = index;
+      this->index2D = 0;
       this->writemask = writemask;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->type = type;
       this->array_id = 0;
    }
@@ -169,9 +172,12 @@ public:
    {
       this->file = file;
       this->index = 0;
+      this->index2D = 0;
       this->writemask = writemask;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->type = type;
       this->array_id = 0;
    }
@@ -181,9 +187,12 @@ public:
       this->type = GLSL_TYPE_ERROR;
       this->file = PROGRAM_UNDEFINED;
       this->index = 0;
+      this->index2D = 0;
       this->writemask = 0;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->array_id = 0;
    }
 
@@ -191,11 +200,14 @@ public:
 
    gl_register_file file; /**< PROGRAM_* from Mesa */
    int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+   int index2D;
    int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
    GLuint cond_mask:4;
    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
    /** Register index should be offset by the integer in this reg. */
    st_src_reg *reladdr;
+   st_src_reg *reladdr2;
+   bool has_index2;
    unsigned array_id;
 };
 
@@ -207,9 +219,9 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->swizzle = SWIZZLE_XYZW;
    this->negate = 0;
    this->reladdr = reg.reladdr;
-   this->index2D = 0;
-   this->reladdr2 = NULL;
-   this->has_index2 = false;
+   this->index2D = reg.index2D;
+   this->reladdr2 = reg.reladdr2;
+   this->has_index2 = reg.has_index2;
    this->double_reg2 = false;
    this->array_id = reg.array_id;
 }
@@ -222,6 +234,9 @@ st_dst_reg::st_dst_reg(st_src_reg reg)
    this->writemask = WRITEMASK_XYZW;
    this->cond_mask = COND_TR;
    this->reladdr = reg.reladdr;
+   this->index2D = reg.index2D;
+   this->reladdr2 = reg.reladdr2;
+   this->has_index2 = reg.has_index2;
    this->array_id = reg.array_id;
 }
 
@@ -551,8 +566,8 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
     * reg directly for one of the regs, and preload the other reladdr
     * sources into temps.
     */
-   num_reladdr += dst.reladdr != NULL;
-   num_reladdr += dst1.reladdr != NULL;
+   num_reladdr += dst.reladdr != NULL || dst.reladdr2;
+   num_reladdr += dst1.reladdr != NULL || dst1.reladdr2;
    num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
    num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
    num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
@@ -563,8 +578,11 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    reladdr_to_temp(ir, &src1, &num_reladdr);
    reladdr_to_temp(ir, &src0, &num_reladdr);
 
-   if (dst.reladdr) {
-      emit_arl(ir, address_reg, *dst.reladdr);
+   if (dst.reladdr || dst.reladdr2) {
+      if (dst.reladdr)
+         emit_arl(ir, address_reg, *dst.reladdr);
+      if (dst.reladdr2)
+         emit_arl(ir, address_reg2, *dst.reladdr2);
       num_reladdr--;
    }
    if (dst1.reladdr) {
@@ -590,7 +608,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    inst->function = NULL;
 
    /* Update indirect addressing status used by TGSI */
-   if (dst.reladdr) {
+   if (dst.reladdr || dst.reladdr2) {
       switch(dst.file) {
       case PROGRAM_STATE_VAR:
       case PROGRAM_CONSTANT:
@@ -3591,8 +3609,8 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
       unsigned prevWriteMask = 0;
 
       /* Give up if we encounter relative addressing or flow control. */
-      if (inst->dst[0].reladdr ||
-          inst->dst[1].reladdr ||
+      if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
+          inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
           tgsi_get_opcode_info(inst->op)->is_branch ||
           inst->op == TGSI_OPCODE_BGNSUB ||
           inst->op == TGSI_OPCODE_CONT ||
@@ -3970,6 +3988,7 @@ glsl_to_tgsi_visitor::copy_propagate(void)
           !(inst->dst[0].file == inst->src[0].file &&
              inst->dst[0].index == inst->src[0].index) &&
           !inst->dst[0].reladdr &&
+          !inst->dst[0].reladdr2 &&
           !inst->saturate &&
           inst->src[0].file != PROGRAM_ARRAY &&
           !inst->src[0].reladdr &&
@@ -4831,6 +4850,14 @@ translate_dst(struct st_translate *t,
       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
    }
 
+   if (dst_reg->has_index2) {
+      if (dst_reg->reladdr2)
+         dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]),
+                                           dst_reg->index2D);
+      else
+         dst = ureg_dst_dimension(dst, dst_reg->index2D);
+   }
+
    return dst;
 }
 

From f4c13fad6550f42524786c70b6f13fc510abaf0b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 21 Jul 2014 18:49:40 -0400
Subject: [PATCH 0538/1208] st/mesa: set vertices_per_patch when drawing

---
 src/mesa/state_tracker/st_draw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 8b43582c14b..66b2f832933 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -164,6 +164,7 @@ translate_prim(const struct gl_context *ctx, unsigned prim)
    STATIC_ASSERT(GL_POINTS == PIPE_PRIM_POINTS);
    STATIC_ASSERT(GL_QUADS == PIPE_PRIM_QUADS);
    STATIC_ASSERT(GL_TRIANGLE_STRIP_ADJACENCY == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY);
+   STATIC_ASSERT(GL_PATCHES == PIPE_PRIM_PATCHES);
 
    return prim;
 }
@@ -260,6 +261,7 @@ st_draw_vbo(struct gl_context *ctx,
       info.count = prims[i].count;
       info.start_instance = prims[i].base_instance;
       info.instance_count = prims[i].num_instances;
+      info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
       if (!ib) {
          info.min_index = info.start;

From 8f40428afbbfa9080964df3cd4f38f24122c4c5e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 21 Jul 2014 20:45:29 -0400
Subject: [PATCH 0539/1208] st/mesa: disable copy propagation for tessellation
 shaders

This can't work due to shared inputs and outputs and barriers.
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 221b6751b26..ad319f06cad 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5800,7 +5800,11 @@ get_mesa_program(struct gl_context *ctx,
 
    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
    v->simplify_cmp();
-   v->copy_propagate();
+
+   if (shader->Type != GL_TESS_CONTROL_SHADER &&
+       shader->Type != GL_TESS_EVALUATION_SHADER)
+      v->copy_propagate();
+
    while (v->eliminate_dead_code());
 
    v->merge_two_dsts();

From 82f7fad96691480b9ffdeb3e8e1b3345ede713ef Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 7 Sep 2014 18:36:06 -0400
Subject: [PATCH 0540/1208] st/mesa: add barrier support

---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index ad319f06cad..b0fc6254edb 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -3478,7 +3478,10 @@ glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_barrier *ir)
 {
-   unreachable("Not implemented!");
+   assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV ||
+          this->prog->Target == GL_COMPUTE_PROGRAM_NV);
+
+   emit_asm(ir, TGSI_OPCODE_BARRIER);
 }
 
 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()

From bda9094f1d69817ed1a51677d38e157ec3b37826 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 18 May 2015 12:49:10 +0200
Subject: [PATCH 0541/1208] st/mesa: set default tessellation levels

---
 src/mesa/Makefile.sources             |  1 +
 src/mesa/state_tracker/st_atom.c      |  1 +
 src/mesa/state_tracker/st_atom.h      |  1 +
 src/mesa/state_tracker/st_atom_tess.c | 62 +++++++++++++++++++++++++++
 src/mesa/state_tracker/st_context.c   |  1 +
 src/mesa/state_tracker/st_context.h   |  2 +-
 6 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/state_tracker/st_atom_tess.c

diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 83f500fbf20..ed9848c5454 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -407,6 +407,7 @@ STATETRACKER_FILES = \
 	state_tracker/st_atom_shader.c \
 	state_tracker/st_atom_shader.h \
 	state_tracker/st_atom_stipple.c \
+	state_tracker/st_atom_tess.c \
 	state_tracker/st_atom_texture.c \
 	state_tracker/st_atom_viewport.c \
 	state_tracker/st_cache.h \
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index c97cd84da60..5fc1a77ce60 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -78,6 +78,7 @@ static const struct st_tracked_state *atoms[] =
    &st_bind_fs_ubos,
    &st_bind_gs_ubos,
    &st_update_pixel_transfer,
+   &st_update_tess,
 
    /* this must be done after the vertex program update */
    &st_update_array
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index bbfbd2d3f9f..5735ca6930d 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -80,6 +80,7 @@ extern const struct st_tracked_state st_bind_gs_ubos;
 extern const struct st_tracked_state st_bind_tcs_ubos;
 extern const struct st_tracked_state st_bind_tes_ubos;
 extern const struct st_tracked_state st_update_pixel_transfer;
+extern const struct st_tracked_state st_update_tess;
 
 
 GLuint st_compare_func_to_pipe(GLenum func);
diff --git a/src/mesa/state_tracker/st_atom_tess.c b/src/mesa/state_tracker/st_atom_tess.c
new file mode 100644
index 00000000000..8e6287a900c
--- /dev/null
+++ b/src/mesa/state_tracker/st_atom_tess.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/*
+ * Authors:
+ *   Marek Olšák <maraeo@gmail.com>
+ */
+
+
+#include "main/macros.h"
+#include "st_context.h"
+#include "pipe/p_context.h"
+#include "st_atom.h"
+
+
+static void
+update_tess(struct st_context *st)
+{
+   const struct gl_context *ctx = st->ctx;
+   struct pipe_context *pipe = st->pipe;
+
+   if (!pipe->set_tess_state)
+      return;
+
+   pipe->set_tess_state(pipe,
+                        ctx->TessCtrlProgram.patch_default_outer_level,
+                        ctx->TessCtrlProgram.patch_default_inner_level);
+}
+
+
+const struct st_tracked_state st_update_tess = {
+   "update_tess",		/* name */
+   {				/* dirty */
+      0,			/* mesa */
+      ST_NEW_TESS_STATE,	/* st */
+   },
+   update_tess                  /* update */
+};
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 0c80fa1179d..44244a1e720 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -313,6 +313,7 @@ static void st_init_driver_flags(struct gl_driver_flags *f)
    f->NewArray = ST_NEW_VERTEX_ARRAYS;
    f->NewRasterizerDiscard = ST_NEW_RASTERIZER;
    f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER;
+   f->NewDefaultTessLevels = ST_NEW_TESS_STATE;
 }
 
 struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 301072742cf..8183412ceab 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -53,7 +53,7 @@ struct u_upload_mgr;
 #define ST_NEW_FRAGMENT_PROGRAM        (1 << 1)
 #define ST_NEW_VERTEX_PROGRAM          (1 << 2)
 #define ST_NEW_FRAMEBUFFER             (1 << 3)
-/* gap, re-use it */
+#define ST_NEW_TESS_STATE              (1 << 4)
 #define ST_NEW_GEOMETRY_PROGRAM        (1 << 5)
 #define ST_NEW_VERTEX_ARRAYS           (1 << 6)
 #define ST_NEW_RASTERIZER              (1 << 7)

From 7626ad8d6daad147bf9a1a82fa4c3ac9e2d3347c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 19:50:42 +0100
Subject: [PATCH 0542/1208] st/mesa: enable tessellation if the driver supports
 it

---
 src/mesa/state_tracker/st_extensions.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 899442fe43e..dc5cdfd1fe7 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -747,6 +747,11 @@ void st_init_extensions(struct pipe_screen *screen,
 #endif
    }
 
+   if (screen->get_shader_param(screen, PIPE_SHADER_TESS_CTRL,
+                                PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
+      extensions->ARB_tessellation_shader = GL_TRUE;
+   }
+
    if (screen->get_param(screen, PIPE_CAP_PRIMITIVE_RESTART)) {
       extensions->NV_primitive_restart = GL_TRUE;
    }

From a3be59b4a91e25d47535f192194ff669cfe2ef6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 5 Oct 2014 20:20:18 +0200
Subject: [PATCH 0543/1208] gallium/radeon: expose LLVM functions implementing
 emit_store

emit_store will be reimplemented for tessellation control shader outputs
where only radeon_llvm_saturate will be used, but radeonsi will want to
fall back to radeon_llvm_emit_store for other register types.

This exposes both functions.
---
 src/gallium/drivers/radeon/radeon_llvm.h            | 9 +++++++++
 src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 9 ++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 591e698d482..c50eb6914be 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -212,4 +212,13 @@ radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base,
 			      LLVMValueRef ptr,
 			      LLVMValueRef ptr2);
 
+LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+                                  LLVMValueRef value);
+
+void radeon_llvm_emit_store(
+	struct lp_build_tgsi_context * bld_base,
+	const struct tgsi_full_instruction * inst,
+	const struct tgsi_opcode_info * info,
+	LLVMValueRef dst[4]);
+
 #endif /* RADEON_LLVM_H */
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 7c0318152d1..5bc93d7aa0e 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -371,8 +371,8 @@ static void emit_declaration(
 	}
 }
 
-static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
-                                         LLVMValueRef value)
+LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+                                  LLVMValueRef value)
 {
 	struct lp_build_emit_data clamp_emit_data;
 
@@ -386,8 +386,7 @@ static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
 				  &clamp_emit_data);
 }
 
-static void
-emit_store(
+void radeon_llvm_emit_store(
 	struct lp_build_tgsi_context * bld_base,
 	const struct tgsi_full_instruction * inst,
 	const struct tgsi_opcode_info * info,
@@ -1542,7 +1541,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	}
 
 	bld_base->soa = 1;
-	bld_base->emit_store = emit_store;
+	bld_base->emit_store = radeon_llvm_emit_store;
 	bld_base->emit_swizzle = emit_swizzle;
 	bld_base->emit_declaration = emit_declaration;
 	bld_base->emit_immediate = emit_immediate;

From 1bc0fba572363f5460be7343cff8b8b7a315d755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 3 Mar 2015 15:11:27 +0100
Subject: [PATCH 0544/1208] gallium/radeon: expose emit_fetch

Radeonsi will use this.
---
 src/gallium/drivers/radeon/radeon_llvm.h      |  5 ++++
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 29 +++++++------------
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index c50eb6914be..ef09ead9774 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -215,6 +215,11 @@ radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base,
 LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
                                   LLVMValueRef value);
 
+LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
+				    const struct tgsi_full_src_register *reg,
+				    enum tgsi_opcode_type type,
+				    unsigned swizzle);
+
 void radeon_llvm_emit_store(
 	struct lp_build_tgsi_context * bld_base,
 	const struct tgsi_full_instruction * inst,
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 5bc93d7aa0e..39bbdb67778 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -109,13 +109,6 @@ emit_array_index(
 	return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
 }
 
-static LLVMValueRef
-emit_fetch(
-	struct lp_build_tgsi_context *bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle);
-
 LLVMValueRef
 radeon_llvm_emit_fetch_double(
 	struct lp_build_tgsi_context *bld_base,
@@ -158,7 +151,7 @@ emit_array_fetch(
 
 	for (i = 0; i < size; ++i) {
 		tmp_reg.Register.Index = i + range.First;
-		LLVMValueRef temp = emit_fetch(bld_base, &tmp_reg, type, swizzle);
+		LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
 		result = LLVMBuildInsertElement(builder, result, temp,
 			lp_build_const_int32(gallivm, i), "");
 	}
@@ -172,12 +165,10 @@ static bool uses_temp_indirect_addressing(
 	return (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
 }
 
-static LLVMValueRef
-emit_fetch(
-	struct lp_build_tgsi_context *bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle)
+LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
+				    const struct tgsi_full_src_register *reg,
+				    enum tgsi_opcode_type type,
+				    unsigned swizzle)
 {
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
@@ -188,7 +179,7 @@ emit_fetch(
 		LLVMValueRef values[TGSI_NUM_CHANNELS];
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			values[chan] = emit_fetch(bld_base, reg, type, chan);
+			values[chan] = radeon_llvm_emit_fetch(bld_base, reg, type, chan);
 		}
 		return lp_build_gather_values(bld_base->base.gallivm, values,
 					      TGSI_NUM_CHANNELS);
@@ -1546,10 +1537,10 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->emit_declaration = emit_declaration;
 	bld_base->emit_immediate = emit_immediate;
 
-	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = radeon_llvm_emit_fetch;
 	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value;
 
 	/* Allocate outputs */

From 2ecb06b946ff8bf4a96de79ab81926fa1bf5a93f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 6 Oct 2014 00:17:42 +0200
Subject: [PATCH 0545/1208] radeonsi: make ES2GS offset sgpr location dynamic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It will have a different location in the tessellation evaluation shader.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 6 +++---
 src/gallium/drivers/radeonsi/si_shader.h | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 955e780faf2..d0e7fa95cf7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -72,6 +72,7 @@ struct si_shader_context
 	int param_streamout_offset[4];
 	int param_vertex_id;
 	int param_instance_id;
+	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
 	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
@@ -1332,7 +1333,7 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct tgsi_shader_info *info = &es->selector->info;
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    SI_PARAM_ES2GS_OFFSET);
+					    si_shader_ctx->param_es2gs_offset);
 	unsigned chan;
 	int i;
 
@@ -2385,8 +2386,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 		num_params = SI_PARAM_START_INSTANCE+1;
 
 		if (shader->key.vs.as_es) {
-			params[SI_PARAM_ES2GS_OFFSET] = i32;
-			num_params++;
+			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
 		} else {
 			if (shader->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index c12782f288c..be3e98a361a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -62,9 +62,6 @@ struct radeon_shader_reloc;
 #define SI_PARAM_START_INSTANCE	6
 /* the other VS parameters are assigned dynamically */
 
-/* ES only parameters */
-#define SI_PARAM_ES2GS_OFFSET	7
-
 /* GS only parameters */
 #define SI_PARAM_GS2VS_OFFSET	4
 #define SI_PARAM_GS_WAVE_ID	5

From aa1f2af572a0285e9f5779e17b2d753119e0ec85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 14:33:49 +0100
Subject: [PATCH 0546/1208] radeonsi: move declaring streamout parameters to
 its own function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It will be reused later.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 36 +++++++++++++++---------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d0e7fa95cf7..288f377fd93 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2354,6 +2354,27 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
 			       CONST_ADDR_SPACE);
 }
 
+static void declare_streamout_params(struct si_shader_context *si_shader_ctx,
+				     struct pipe_stream_output_info *so,
+				     LLVMTypeRef *params, LLVMTypeRef i32,
+				     unsigned *num_params)
+{
+	int i;
+
+	/* Streamout SGPRs. */
+	if (so->num_outputs) {
+		params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32;
+		params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32;
+	}
+	/* A streamout buffer offset is loaded if the stride is non-zero. */
+	for (i = 0; i < 4; i++) {
+		if (!so->stride[i])
+			continue;
+
+		params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32;
+	}
+}
+
 static void create_function(struct si_shader_context *si_shader_ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -2394,19 +2415,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			}
 
 			/* The locations of the other parameters are assigned dynamically. */
-
-			/* Streamout SGPRs. */
-			if (shader->selector->so.num_outputs) {
-				params[si_shader_ctx->param_streamout_config = num_params++] = i32;
-				params[si_shader_ctx->param_streamout_write_index = num_params++] = i32;
-			}
-			/* A streamout buffer offset is loaded if the stride is non-zero. */
-			for (i = 0; i < 4; i++) {
-				if (!shader->selector->so.stride[i])
-					continue;
-
-				params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32;
-			}
+			declare_streamout_params(si_shader_ctx, &shader->selector->so,
+						 params, i32, &num_params);
 		}
 
 		last_sgpr = num_params-1;

From f66844820e3ae2403d66d3275b1bf3e77087189c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 14:46:20 +0100
Subject: [PATCH 0547/1208] radeonsi: separate primitive ID computation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support for new shader stages will be added here.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 27 ++++++++++++++++++------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 288f377fd93..5626c9e2fbb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -328,6 +328,24 @@ static void declare_input_vs(
 	}
 }
 
+static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
+				     unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+
+	if (swizzle > 0)
+		return bld_base->uint_bld.zero;
+
+	switch (si_shader_ctx->type) {
+	case TGSI_PROCESSOR_GEOMETRY:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    SI_PARAM_PRIMITIVE_ID);
+	default:
+		assert(0);
+		return bld_base->uint_bld.zero;
+	}
+}
+
 static LLVMValueRef fetch_input_gs(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
@@ -347,13 +365,8 @@ static LLVMValueRef fetch_input_gs(
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
 
-	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) {
-		if (swizzle == 0)
-			return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    SI_PARAM_PRIMITIVE_ID);
-		else
-			return uint->zero;
-	}
+	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
+		return get_primitive_id(bld_base, swizzle);
 
 	if (!reg->Register.Dimension)
 		return NULL;

From 57b6f8d9f9bfafd931974eae6942663e2ba6db02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 14:54:54 +0100
Subject: [PATCH 0548/1208] radeonsi: rename build_streamout_store ->
 build_tbuffer_store_dwords
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It will be reused later.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5626c9e2fbb..e2d08287e77 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1032,13 +1032,13 @@ static void build_tbuffer_store(struct si_shader_context *shader,
 			   args, Elements(args));
 }
 
-static void build_streamout_store(struct si_shader_context *shader,
-				  LLVMValueRef rsrc,
-				  LLVMValueRef vdata,
-				  unsigned num_channels,
-				  LLVMValueRef vaddr,
-				  LLVMValueRef soffset,
-				  unsigned inst_offset)
+static void build_tbuffer_store_dwords(struct si_shader_context *shader,
+				     LLVMValueRef rsrc,
+				     LLVMValueRef vdata,
+				     unsigned num_channels,
+				     LLVMValueRef vaddr,
+				     LLVMValueRef soffset,
+				     unsigned inst_offset)
 {
 	static unsigned dfmt[] = {
 		V_008F0C_BUF_DATA_FORMAT_32,
@@ -1151,11 +1151,11 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 				break;
 			}
 
-			build_streamout_store(shader, shader->so_buffers[buf_idx],
-					      vdata, num_comps,
-					      so_write_offset[buf_idx],
-					      LLVMConstInt(i32, 0, 0),
-					      so->output[i].dst_offset*4);
+			build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
+						   vdata, num_comps,
+						   so_write_offset[buf_idx],
+						   LLVMConstInt(i32, 0, 0),
+						   so->output[i].dst_offset*4);
 		}
 	}
 	lp_build_endif(&if_ctx);

From 3ce91c727f2a00a05f414351266b0b45d677611e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 15 Sep 2014 23:34:28 +0200
Subject: [PATCH 0549/1208] radeonsi: rework how shader pointers to descriptors
 are set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is mainly needed for tessellation where a VS can be bound as VS, ES,
or LS, and TES (tess. evaluationshader) can be bound as VS or ES or neither.
Therefore we need the ability to move pointers to descriptors between
shaders arbitrarily.

The idea is that the context has a mapping from PIPE_SHADER_x to
SPI_SHADER_USER_DATA_x. After a shader is enabled or disabled,
si_shader_change_notify should be called to update this mapping accordingly.

There is a dirty flag for each shader pointer, but only one emit function
for all pointers in the whole context, whose code and logic is separated
from descriptors.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 222 +++++++++++-------
 src/gallium/drivers/radeonsi/si_pipe.h        |   5 +-
 src/gallium/drivers/radeonsi/si_state.h       |  13 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |   3 +-
 .../drivers/radeonsi/si_state_shaders.c       |   4 +
 5 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index bbfd36dcbeb..f31cccbba17 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -141,7 +141,7 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 
 static void si_init_descriptors(struct si_context *sctx,
 				struct si_descriptors *desc,
-				unsigned shader_userdata_reg,
+				unsigned shader_userdata_index,
 				unsigned element_dw_size,
 				unsigned num_elements,
 				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
@@ -150,7 +150,7 @@ static void si_init_descriptors(struct si_context *sctx,
 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
 	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_reg = shader_userdata_reg;
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
 	desc->context_size = num_elements * element_dw_size * 4;
@@ -181,14 +181,11 @@ static void si_update_descriptors(struct si_context *sctx,
 	if (desc->dirty_mask) {
 		desc->atom.num_dw =
 			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
-			4; /* pointer update */
-
-		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
-			desc->atom.num_dw += 4; /* second pointer update */
+			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
 
 		desc->atom.dirty = true;
+		desc->pointer_dirty = true;
+		sctx->shader_userdata.atom.dirty = true;
 
 		/* TODO: Investigate if these flushes can be removed after
 		 * adding CE support. */
@@ -206,32 +203,6 @@ static void si_update_descriptors(struct si_context *sctx,
 	}
 }
 
-static void si_emit_shader_pointer(struct si_context *sctx,
-				   struct r600_atom *atom)
-{
-	struct si_descriptors *desc = (struct si_descriptors*)atom;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va = desc->buffer->gpu_address +
-		      desc->current_context_id * desc->context_size +
-		      desc->buffer_offset;
-
-	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
-	radeon_emit(cs, va);
-	radeon_emit(cs, va >> 32);
-
-	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
-		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-		radeon_emit(cs, (desc->shader_userdata_reg +
-				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
-				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
-				 SI_SH_REG_OFFSET) >> 2);
-		radeon_emit(cs, va);
-		radeon_emit(cs, va >> 32);
-	}
-}
-
 static void si_emit_descriptors(struct si_context *sctx,
 				struct si_descriptors *desc,
 				uint32_t **descriptors)
@@ -295,24 +266,6 @@ static void si_emit_descriptors(struct si_context *sctx,
 
 	desc->dirty_mask = 0;
 	desc->current_context_id = new_context_id;
-
-	/* Now update the shader userdata pointer. */
-	si_emit_shader_pointer(sctx, &desc->atom);
-}
-
-static unsigned si_get_shader_user_data_base(unsigned shader)
-{
-	switch (shader) {
-	case PIPE_SHADER_VERTEX:
-		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
-	case PIPE_SHADER_GEOMETRY:
-		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
-	case PIPE_SHADER_FRAGMENT:
-		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
-	default:
-		assert(0);
-		return 0;
-	}
 }
 
 /* SAMPLER VIEWS */
@@ -325,14 +278,11 @@ static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *ato
 }
 
 static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views,
-				  unsigned shader)
+				  struct si_sampler_views *views)
 {
 	int i;
 
-	si_init_descriptors(sctx, &views->desc,
-			    si_get_shader_user_data_base(shader) +
-			    SI_SGPR_RESOURCE * 4,
+	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
 			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
 
 	for (i = 0; i < views->desc.num_elements; i++) {
@@ -384,8 +334,6 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &views->desc.atom);
 }
 
 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -493,7 +441,6 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-	si_emit_shader_pointer(sctx, &states->desc.atom);
 }
 
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -536,7 +483,7 @@ static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *
 
 static void si_init_buffer_resources(struct si_context *sctx,
 				     struct si_buffer_resources *buffers,
-				     unsigned num_buffers, unsigned shader,
+				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
@@ -556,10 +503,8 @@ static void si_init_buffer_resources(struct si_context *sctx,
 		buffers->desc_data[i] = &buffers->desc_storage[i*4];
 	}
 
-	si_init_descriptors(sctx, &buffers->desc,
-			    si_get_shader_user_data_base(shader) +
-			    shader_userdata_index*4, 4, num_buffers,
-			    si_emit_buffer_resources);
+	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
+			    num_buffers, si_emit_buffer_resources);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -593,8 +538,6 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &buffers->desc.atom);
 }
 
 /* VERTEX BUFFERS */
@@ -620,8 +563,6 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &desc->atom);
 }
 
 void si_update_vertex_buffers(struct si_context *sctx)
@@ -693,13 +634,12 @@ void si_update_vertex_buffers(struct si_context *sctx)
 		}
 	}
 
-	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
-	desc->atom.dirty = true;
-
 	/* Don't flush the const cache. It would have a very negative effect
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
+	desc->pointer_dirty = true;
+	sctx->shader_userdata.atom.dirty = true;
 }
 
 
@@ -1277,6 +1217,112 @@ void si_copy_buffer(struct si_context *sctx,
 		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/* SHADER USER DATA */
+
+static void si_mark_shader_pointers_dirty(struct si_context *sctx,
+					  unsigned shader)
+{
+	sctx->const_buffers[shader].desc.pointer_dirty = true;
+	sctx->rw_buffers[shader].desc.pointer_dirty = true;
+	sctx->samplers[shader].views.desc.pointer_dirty = true;
+	sctx->samplers[shader].states.desc.pointer_dirty = true;
+
+	if (shader == PIPE_SHADER_VERTEX)
+		sctx->vertex_buffers.pointer_dirty = true;
+
+	sctx->shader_userdata.atom.dirty = true;
+}
+
+static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		si_mark_shader_pointers_dirty(sctx, i);
+	}
+}
+
+/* Set a base register address for user data constants in the given shader.
+ * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
+ */
+static void si_set_user_data_base(struct si_context *sctx,
+				  unsigned shader, uint32_t new_base)
+{
+	uint32_t *base = &sctx->shader_userdata.sh_base[shader];
+
+	if (*base != new_base) {
+		*base = new_base;
+
+		if (new_base)
+			si_mark_shader_pointers_dirty(sctx, shader);
+	}
+}
+
+/* This must be called when these shaders are changed from non-NULL to NULL
+ * and vice versa:
+ * - geometry shader
+ * - tessellation control shader
+ * - tessellation evaluation shader
+ */
+void si_shader_change_notify(struct si_context *sctx)
+{
+	/* VS can be bound as VS or ES. */
+	if (sctx->gs_shader)
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
+	else
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+}
+
+static void si_emit_shader_pointer(struct si_context *sctx,
+				   struct si_descriptors *desc,
+				   unsigned sh_base, bool keep_dirty)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint64_t va;
+
+	if (!desc->pointer_dirty)
+		return;
+
+	va = desc->buffer->gpu_address +
+	     desc->current_context_id * desc->context_size +
+	     desc->buffer_offset;
+
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
+	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+
+	desc->pointer_dirty = keep_dirty;
+}
+
+static void si_emit_shader_userdata(struct si_context *sctx,
+				    struct r600_atom *atom)
+{
+	unsigned i;
+	uint32_t *sh_base = sctx->shader_userdata.sh_base;
+
+	/* The VS copy shader needs these for clipping, streamout, and rings. */
+	if (sctx->gs_shader) {
+		unsigned base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+		unsigned i = PIPE_SHADER_VERTEX;
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, true);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, true);
+	}
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		unsigned base = sh_base[i];
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
+	}
+	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
+}
+
 /* INIT/DEINIT */
 
 void si_init_all_descriptors(struct si_context *sctx)
@@ -1285,19 +1331,17 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
-					 i == PIPE_SHADER_VERTEX ?
-					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
-					 i, SI_SGPR_RW_BUFFERS,
+					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);
+		si_init_sampler_views(sctx, &sctx->samplers[i].views);
 
 		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
-				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
+				    si_emit_sampler_states);
 
 		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
 		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
@@ -1305,11 +1349,8 @@ void si_init_all_descriptors(struct si_context *sctx)
 		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers,
-			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
-			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
-			    si_emit_shader_pointer);
-	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1317,6 +1358,18 @@ void si_init_all_descriptors(struct si_context *sctx)
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
 	sctx->b.clear_buffer = si_clear_buffer;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
+
+	/* Shader user data. */
+	sctx->atoms.s.shader_userdata = &sctx->shader_userdata.atom;
+	sctx->shader_userdata.atom.emit = (void*)si_emit_shader_userdata;
+
+	/* Upper bound, 4 pointers per shader, +1 for vertex buffers, +2 for the VS copy shader. */
+	sctx->shader_userdata.atom.num_dw = (SI_NUM_SHADERS * 4 + 1 + 2) * 4;
+
+	/* Set default and immutable mappings. */
+	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
 void si_release_all_descriptors(struct si_context *sctx)
@@ -1343,4 +1396,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
+	si_shader_userdata_begin_new_cs(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 266eeef61e9..11900105d82 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -125,8 +125,6 @@ struct si_framebuffer {
 
 #define SI_NUM_ATOMS(sctx) (sizeof((sctx)->atoms)/sizeof((sctx)->atoms.array[0]))
 
-#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
-
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -142,7 +140,6 @@ struct si_context {
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *vertex_buffers;
 			struct r600_atom *const_buffers[SI_NUM_SHADERS];
 			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
 			struct r600_atom *sampler_views[SI_NUM_SHADERS];
@@ -157,6 +154,7 @@ struct si_context {
 			struct r600_atom *db_render_state;
 			struct r600_atom *msaa_config;
 			struct r600_atom *clip_regs;
+			struct r600_atom *shader_userdata;
 		} s;
 		struct r600_atom *array[0];
 	} atoms;
@@ -170,6 +168,7 @@ struct si_context {
 	struct si_shader_selector	*gs_shader;
 	struct si_shader_selector	*vs_shader;
 	struct si_cs_shader_state	cs_shader_state;
+	struct si_shader_data		shader_userdata;
 	/* shader information */
 	unsigned			sprite_coord_enable;
 	bool				flatshade;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 0c1fdb41408..1510c2284d6 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,6 +30,8 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
+#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
+
 struct si_screen;
 struct si_shader;
 
@@ -111,6 +113,11 @@ union si_state {
 	struct si_pm4_state	*array[0];
 };
 
+struct si_shader_data {
+	struct r600_atom	atom;
+	uint32_t		sh_base[SI_NUM_SHADERS];
+};
+
 #define SI_NUM_USER_SAMPLERS            16 /* AKA OpenGL textures units per shader */
 #define SI_POLY_STIPPLE_SAMPLER         SI_NUM_USER_SAMPLERS
 #define SI_NUM_SAMPLERS                 (SI_POLY_STIPPLE_SAMPLER + 1)
@@ -172,9 +179,10 @@ struct si_descriptors {
 	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
 	unsigned context_size;
 
-	/* The shader userdata register where the 64-bit pointer to the descriptor
+	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
-	unsigned shader_userdata_reg;
+	unsigned shader_userdata_offset;
+	bool pointer_dirty;
 };
 
 struct si_sampler_views {
@@ -246,6 +254,7 @@ void si_copy_buffer(struct si_context *sctx,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
+void si_shader_change_notify(struct si_context *sctx);
 
 /* si_state.c */
 struct si_shader_selector;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e85ed15deb7..abfae1c8ba3 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -249,8 +249,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_index_buffer *ib)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
-						  R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
 
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b153228fb2c..23aee3864f8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -533,6 +533,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->gs_shader != !!sel;
 
 	if (sctx->gs_shader == sel)
 		return;
@@ -540,6 +541,9 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 	sctx->gs_shader = sel;
 	sctx->clip_regs.dirty = true;
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+
+	if (enable_changed)
+		si_shader_change_notify(sctx);
 }
 
 static void si_make_dummy_ps(struct si_context *sctx)

From c2670463fd50f5b74066f0e0ab8f9a31dcb37429 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 18 Sep 2014 22:26:02 +0200
Subject: [PATCH 0550/1208] radeonsi: add debug flags for dumping tessellation
 shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c |  6 ++++++
 src/gallium/drivers/radeon/r600_pipe_common.h | 20 ++++++++++---------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 94a7535f531..fb834b01dcd 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -333,6 +333,8 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "gs", DBG_GS, "Print geometry shaders" },
 	{ "ps", DBG_PS, "Print pixel shaders" },
 	{ "cs", DBG_CS, "Print compute shaders" },
+	{ "tcs", DBG_TCS, "Print tessellation control shaders" },
+	{ "tes", DBG_TES, "Print tessellation evaluation shaders" },
 
 	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -956,6 +958,10 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
 	switch (tgsi_get_processor_type(tokens)) {
 	case TGSI_PROCESSOR_VERTEX:
 		return (rscreen->debug_flags & DBG_VS) != 0;
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return (rscreen->debug_flags & DBG_TCS) != 0;
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return (rscreen->debug_flags & DBG_TES) != 0;
 	case TGSI_PROCESSOR_GEOMETRY:
 		return (rscreen->debug_flags & DBG_GS) != 0;
 	case TGSI_PROCESSOR_FRAGMENT:
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 6f736eb17a5..d225f2528ab 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -79,16 +79,18 @@
 #define DBG_GS			(1 << 7)
 #define DBG_PS			(1 << 8)
 #define DBG_CS			(1 << 9)
+#define DBG_TCS			(1 << 10)
+#define DBG_TES			(1 << 11)
 /* features */
-#define DBG_NO_ASYNC_DMA	(1 << 10)
-#define DBG_NO_HYPERZ		(1 << 11)
-#define DBG_NO_DISCARD_RANGE	(1 << 12)
-#define DBG_NO_2D_TILING	(1 << 13)
-#define DBG_NO_TILING		(1 << 14)
-#define DBG_SWITCH_ON_EOP	(1 << 15)
-#define DBG_FORCE_DMA		(1 << 16)
-#define DBG_PRECOMPILE		(1 << 17)
-#define DBG_INFO		(1 << 18)
+#define DBG_NO_ASYNC_DMA	(1 << 12)
+#define DBG_NO_HYPERZ		(1 << 13)
+#define DBG_NO_DISCARD_RANGE	(1 << 14)
+#define DBG_NO_2D_TILING	(1 << 15)
+#define DBG_NO_TILING		(1 << 16)
+#define DBG_SWITCH_ON_EOP	(1 << 17)
+#define DBG_FORCE_DMA		(1 << 18)
+#define DBG_PRECOMPILE		(1 << 19)
+#define DBG_INFO		(1 << 20)
 /* The maximum allowed bit is 20. */
 
 #define R600_MAP_BUFFER_ALIGNMENT 64

From d1f43a7e5b889b30106c4db55ec1caac1ed6ca4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 18 Sep 2014 22:50:52 +0200
Subject: [PATCH 0551/1208] radeonsi: add code for creating, binding and
 destroying tessellation shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This doesn't do anything yet.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c        |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  2 +
 .../drivers/radeonsi/si_state_shaders.c       | 70 +++++++++++++++++++
 3 files changed, 74 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index f6db3f54192..c3591a7e85a 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -57,6 +57,8 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
 	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader);
 	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader);
+	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader);
+	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader);
 	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader);
 	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
 	if (sctx->queued.named.sample_mask) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 11900105d82..2eaff1cd24d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -167,6 +167,8 @@ struct si_context {
 	struct si_shader_selector	*ps_shader;
 	struct si_shader_selector	*gs_shader;
 	struct si_shader_selector	*vs_shader;
+	struct si_shader_selector	*tcs_shader;
+	struct si_shader_selector	*tes_shader;
 	struct si_cs_shader_state	cs_shader_state;
 	struct si_shader_data		shader_userdata;
 	/* shader information */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 23aee3864f8..88edc908f72 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -517,6 +517,18 @@ static void *si_create_vs_state(struct pipe_context *ctx,
 	return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX);
 }
 
+static void *si_create_tcs_state(struct pipe_context *ctx,
+				 const struct pipe_shader_state *state)
+{
+	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL);
+}
+
+static void *si_create_tes_state(struct pipe_context *ctx,
+				 const struct pipe_shader_state *state)
+{
+	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL);
+}
+
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -546,6 +558,34 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 		si_shader_change_notify(sctx);
 }
 
+static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state;
+
+	if (sctx->tcs_shader == sel)
+		return;
+
+	sctx->tcs_shader = sel;
+}
+
+static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->tes_shader != !!sel;
+
+	if (sctx->tes_shader == sel)
+		return;
+
+	sctx->tes_shader = sel;
+	sctx->clip_regs.dirty = true;
+	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+
+	if (enable_changed)
+		si_shader_change_notify(sctx);
+}
+
 static void si_make_dummy_ps(struct si_context *sctx)
 {
 	if (!sctx->dummy_pixel_shader) {
@@ -643,6 +683,30 @@ static void si_delete_ps_shader(struct pipe_context *ctx, void *state)
 	si_delete_shader_selector(ctx, sel);
 }
 
+static void si_delete_tcs_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+	if (sctx->tcs_shader == sel) {
+		sctx->tcs_shader = NULL;
+	}
+
+	si_delete_shader_selector(ctx, sel);
+}
+
+static void si_delete_tes_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+	if (sctx->tes_shader == sel) {
+		sctx->tes_shader = NULL;
+	}
+
+	si_delete_shader_selector(ctx, sel);
+}
+
 static void si_update_spi_map(struct si_context *sctx)
 {
 	struct si_shader *ps = sctx->ps_shader->current;
@@ -956,14 +1020,20 @@ void si_update_shaders(struct si_context *sctx)
 void si_init_shader_functions(struct si_context *sctx)
 {
 	sctx->b.b.create_vs_state = si_create_vs_state;
+	sctx->b.b.create_tcs_state = si_create_tcs_state;
+	sctx->b.b.create_tes_state = si_create_tes_state;
 	sctx->b.b.create_gs_state = si_create_gs_state;
 	sctx->b.b.create_fs_state = si_create_fs_state;
 
 	sctx->b.b.bind_vs_state = si_bind_vs_shader;
+	sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
+	sctx->b.b.bind_tes_state = si_bind_tes_shader;
 	sctx->b.b.bind_gs_state = si_bind_gs_shader;
 	sctx->b.b.bind_fs_state = si_bind_ps_shader;
 
 	sctx->b.b.delete_vs_state = si_delete_vs_shader;
+	sctx->b.b.delete_tcs_state = si_delete_tcs_shader;
+	sctx->b.b.delete_tes_state = si_delete_tes_shader;
 	sctx->b.b.delete_gs_state = si_delete_gs_shader;
 	sctx->b.b.delete_fs_state = si_delete_ps_shader;
 }

From 55b6f1caaeef4fa81fcc34d552aee4f0448417bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 18 Sep 2014 22:54:40 +0200
Subject: [PATCH 0552/1208] radeonsi: add support for tessellation shader
 resources and samplers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 43 ++++++++++++++++---
 src/gallium/drivers/radeonsi/si_state.h       |  2 +-
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index f31cccbba17..2e2a35b2280 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1266,13 +1266,28 @@ static void si_set_user_data_base(struct si_context *sctx,
  */
 void si_shader_change_notify(struct si_context *sctx)
 {
-	/* VS can be bound as VS or ES. */
-	if (sctx->gs_shader)
+	/* VS can be bound as VS, ES, or LS. */
+	if (sctx->tes_shader)
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
+	else if (sctx->gs_shader)
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
 	else
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+
+	/* TES can be bound as ES, VS, or not bound. */
+	if (sctx->tes_shader) {
+		if (sctx->gs_shader)
+			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
+					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
+		else
+			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
+					      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	} else {
+		si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
+	}
 }
 
 static void si_emit_shader_pointer(struct si_context *sctx,
@@ -1303,20 +1318,33 @@ static void si_emit_shader_userdata(struct si_context *sctx,
 	unsigned i;
 	uint32_t *sh_base = sctx->shader_userdata.sh_base;
 
-	/* The VS copy shader needs these for clipping, streamout, and rings. */
 	if (sctx->gs_shader) {
-		unsigned base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+		/* The VS copy shader needs these for clipping, streamout, and rings. */
+		unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
 		unsigned i = PIPE_SHADER_VERTEX;
 
-		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, true);
-		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, true);
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true);
+
+		/* The TESSEVAL shader needs this for the ESGS ring buffer. */
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
+				       R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
+	} else if (sctx->tes_shader) {
+		/* The TESSEVAL shader needs this for streamout. */
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
+				       R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
 	}
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		unsigned base = sh_base[i];
 
+		if (!base)
+			continue;
+
+		if (i != PIPE_SHADER_TESS_EVAL)
+			si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
+
 		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
-		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
 	}
@@ -1368,6 +1396,7 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	/* Set default and immutable mappings. */
 	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
 	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
 	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 1510c2284d6..edee6d41328 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,7 +30,7 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
-#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
+#define SI_NUM_SHADERS (PIPE_SHADER_TESS_EVAL+1)
 
 struct si_screen;
 struct si_shader;

From d9d0de4d289fa0b18bf23c85586e0111d64bf3b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 18 Sep 2014 23:39:44 +0200
Subject: [PATCH 0553/1208] radeonsi: add translation of PATCH primitives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index abfae1c8ba3..b877a7e0e95 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -64,6 +64,7 @@ static unsigned si_conv_pipe_prim(unsigned mode)
 		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_008958_DI_PT_LINESTRIP_ADJ,
 		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_008958_DI_PT_TRILIST_ADJ,
 		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_008958_DI_PT_TRISTRIP_ADJ,
+		[PIPE_PRIM_PATCHES]			= V_008958_DI_PT_PATCH,
 		[R600_PRIM_RECTANGLE_LIST]		= V_008958_DI_PT_RECTLIST
         };
 	assert(mode < Elements(prim_conv));
@@ -87,6 +88,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
 		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
 		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
 		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
 		[R600_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
 	};
 	assert(mode < Elements(prim_conv));

From 59b3556f4c69f0e6e5430ca6ab384d2ac9372bfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 19 Sep 2014 00:16:12 +0200
Subject: [PATCH 0554/1208] radeonsi: program VGT_SHADER_STAGES_EN for
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c        |  5 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |  7 +--
 src/gallium/drivers/radeonsi/si_state.h       |  2 +-
 .../drivers/radeonsi/si_state_shaders.c       | 54 +++++++++++++------
 4 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 0878b8887c9..25dc4e7a96d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -36,6 +36,7 @@
 static void si_destroy_context(struct pipe_context *context)
 {
 	struct si_context *sctx = (struct si_context *)context;
+	int i;
 
 	si_release_all_descriptors(sctx);
 
@@ -48,8 +49,8 @@ static void si_destroy_context(struct pipe_context *context)
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
-	si_pm4_delete_state(sctx, gs_onoff, sctx->gs_on);
-	si_pm4_delete_state(sctx, gs_onoff, sctx->gs_off);
+	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
+		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
 	if (sctx->pstipple_sampler_state)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 2eaff1cd24d..880f7beaa19 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -196,11 +196,12 @@ struct si_context {
 	/* With rasterizer discard, there doesn't have to be a pixel shader.
 	 * In that case, we bind this one: */
 	void			*dummy_pixel_shader;
-	struct si_pm4_state	*gs_on;
-	struct si_pm4_state	*gs_off;
-	struct si_pm4_state	*gs_rings;
 	struct r600_atom	cache_flush;
 	struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
+
+	/* VGT states. */
+	struct si_pm4_state	*vgt_shader_config[4];
+	struct si_pm4_state	*gs_rings;
 	struct pipe_resource	*esgs_ring;
 	struct pipe_resource	*gsvs_ring;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index edee6d41328..6174dad7190 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -105,7 +105,7 @@ union si_state {
 		struct si_pm4_state		*es;
 		struct si_pm4_state		*gs;
 		struct si_pm4_state		*gs_rings;
-		struct si_pm4_state		*gs_onoff;
+		struct si_pm4_state		*vgt_shader_config;
 		struct si_pm4_state		*vs;
 		struct si_pm4_state		*ps;
 		struct si_pm4_state		*spi;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 88edc908f72..3eec217fbd2 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -922,6 +922,41 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 }
 
+static void si_update_vgt_shader_config(struct si_context *sctx)
+{
+	/* Calculate the index of the config.
+	 * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */
+	unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader;
+	struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index];
+
+	if (!*pm4) {
+		uint32_t stages = 0;
+
+		*pm4 = CALLOC_STRUCT(si_pm4_state);
+
+		if (sctx->tes_shader) {
+			stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
+				  S_028B54_HS_EN(1);
+
+			if (sctx->gs_shader)
+				stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+					  S_028B54_GS_EN(1) |
+				          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+			else
+				stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+		} else if (sctx->gs_shader) {
+			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+				  S_028B54_GS_EN(1) |
+			          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		}
+
+		si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
+		if (!sctx->gs_shader)
+			si_pm4_set_reg(*pm4, R_028A40_VGT_GS_MODE, 0);
+	}
+	si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
+}
+
 void si_update_shaders(struct si_context *sctx)
 {
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
@@ -948,34 +983,19 @@ void si_update_shaders(struct si_context *sctx)
 				   sctx->gs_shader->gs_max_out_vertices *
 				   sctx->gs_shader->info.num_outputs * 16,
 				   64, true, true, 4, 16);
-
-		if (!sctx->gs_on) {
-			sctx->gs_on = CALLOC_STRUCT(si_pm4_state);
-
-			si_pm4_set_reg(sctx->gs_on, R_028B54_VGT_SHADER_STAGES_EN,
-				       S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
-				       S_028B54_GS_EN(1) |
-				       S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER));
-		}
-		si_pm4_bind_state(sctx, gs_onoff, sctx->gs_on);
 	} else {
 		si_shader_select(ctx, sctx->vs_shader);
 		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
 
 		sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride;
 
-		if (!sctx->gs_off) {
-			sctx->gs_off = CALLOC_STRUCT(si_pm4_state);
-
-			si_pm4_set_reg(sctx->gs_off, R_028A40_VGT_GS_MODE, 0);
-			si_pm4_set_reg(sctx->gs_off, R_028B54_VGT_SHADER_STAGES_EN, 0);
-		}
-		si_pm4_bind_state(sctx, gs_onoff, sctx->gs_off);
 		si_pm4_bind_state(sctx, gs_rings, NULL);
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
 	}
 
+	si_update_vgt_shader_config(sctx);
+
 	si_shader_select(ctx, sctx->ps_shader);
 
 	if (!sctx->ps_shader->current) {

From 4805685b6fe6efb7891dbc6dbab6ae4edce7e19e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 6 Oct 2014 13:19:53 +0200
Subject: [PATCH 0555/1208] radeonsi: implement TGSI_OPCODE_BARRIER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e2d08287e77..9b03a53035d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2338,6 +2338,17 @@ static void si_llvm_emit_primitive(
 			LLVMNoUnwindAttribute);
 }
 
+static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
+				 struct lp_build_tgsi_context *bld_base,
+				 struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+
+	build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
+			LLVMVoidTypeInContext(gallivm->context), NULL, 0,
+			LLVMNoUnwindAttribute);
+}
+
 static const struct lp_build_tgsi_action tex_action = {
 	.fetch_args = tex_fetch_args,
 	.emit = build_tex_intrinsic,
@@ -2966,6 +2977,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
 	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
+	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 
 	if (HAVE_LLVM >= 0x0306) {
 		bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;

From fff16e4ad2cf51749e01e04805908effe49217d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 15:09:35 +0100
Subject: [PATCH 0556/1208] radeonsi: add shader code generation for
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c |   2 +
 src/gallium/drivers/radeonsi/si_shader.c      | 723 +++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h      | 109 ++-
 src/gallium/drivers/radeonsi/si_state.h       |   5 +-
 .../drivers/radeonsi/si_state_shaders.c       |  41 +-
 5 files changed, 851 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 25580b6bd4c..973d6edff8a 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -62,6 +62,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 
 	switch (type) {
 	case TGSI_PROCESSOR_VERTEX:
+	case TGSI_PROCESSOR_TESS_CTRL:
+	case TGSI_PROCESSOR_TESS_EVAL:
 		llvm_type = RADEON_LLVM_SHADER_VS;
 		break;
 	case TGSI_PROCESSOR_GEOMETRY:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9b03a53035d..cddd9a0e120 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -71,12 +71,17 @@ struct si_shader_context
 	int param_streamout_write_index;
 	int param_streamout_offset[4];
 	int param_vertex_id;
+	int param_rel_auto_id;
 	int param_instance_id;
+	int param_tes_u;
+	int param_tes_v;
+	int param_tes_rel_patch_id;
+	int param_tes_patch_id;
 	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
 	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef ddxy_lds;
+	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
 	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
@@ -133,6 +138,14 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 		assert(index <= 63-4);
 		return 4 + index;
 
+	/* patch indices are completely separate and thus start from 0 */
+	case TGSI_SEMANTIC_TESSOUTER:
+		return 0;
+	case TGSI_SEMANTIC_TESSINNER:
+		return 1;
+	case TGSI_SEMANTIC_PATCH:
+		return 2 + index;
+
 	default:
 		/* Don't fail here. The result of this function is only used
 		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
@@ -210,6 +223,136 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
 	return value;
 }
 
+static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx)
+{
+	switch (si_shader_ctx->type) {
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8);
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_tes_rel_patch_id);
+
+	default:
+		assert(0);
+		return NULL;
+	}
+}
+
+/* Tessellation shaders pass outputs to the next shader using LDS.
+ *
+ * LS outputs = TCS inputs
+ * TCS outputs = TES inputs
+ *
+ * The LDS layout is:
+ * - TCS inputs for patch 0
+ * - TCS inputs for patch 1
+ * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
+ * - ...
+ * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
+ * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
+ * - TCS outputs for patch 1
+ * - Per-patch TCS outputs for patch 1
+ * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
+ * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
+ * - ...
+ *
+ * All three shaders VS(LS), TCS, TES share the same LDS space.
+ */
+
+static LLVMValueRef
+get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx)
+{
+	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX)
+		return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
+	else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+		return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
+	else {
+		assert(0);
+		return NULL;
+	}
+}
+
+static LLVMValueRef
+get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx)
+{
+	return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
+}
+
+static LLVMValueRef
+get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx)
+{
+	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(si_shader_ctx,
+					     SI_PARAM_TCS_OUT_OFFSETS,
+					     0, 16),
+				4);
+}
+
+static LLVMValueRef
+get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx)
+{
+	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(si_shader_ctx,
+					     SI_PARAM_TCS_OUT_OFFSETS,
+					     16, 16),
+				4);
+}
+
+static LLVMValueRef
+get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+
+	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
+}
+
+static LLVMValueRef
+get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+
+	return LLVMBuildAdd(gallivm->builder, patch0_offset,
+			    LLVMBuildMul(gallivm->builder, patch_stride,
+					 rel_patch_id, ""),
+			    "");
+}
+
+static LLVMValueRef
+get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch0_patch_data_offset =
+		get_tcs_out_patch0_patch_data_offset(si_shader_ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+
+	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
+			    LLVMBuildMul(gallivm->builder, patch_stride,
+					 rel_patch_id, ""),
+			    "");
+}
+
+static void build_indexed_store(struct si_shader_context *si_shader_ctx,
+				LLVMValueRef base_ptr, LLVMValueRef index,
+				LLVMValueRef value)
+{
+	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef indices[2], pointer;
+
+	indices[0] = bld_base->uint_bld.zero;
+	indices[1] = index;
+
+	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
+	LLVMBuildStore(gallivm->builder, value, pointer);
+}
+
 /**
  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
  * It's equivalent to doing a load from &base_ptr[index].
@@ -337,6 +480,12 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 		return bld_base->uint_bld.zero;
 
 	switch (si_shader_ctx->type) {
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    SI_PARAM_PATCH_ID);
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_tes_patch_id);
 	case TGSI_PROCESSOR_GEOMETRY:
 		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 				    SI_PARAM_PRIMITIVE_ID);
@@ -346,6 +495,278 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 	}
 }
 
+/**
+ * Return the value of tgsi_ind_register for indexing.
+ * This is the indirect index with the constant offset added to it.
+ */
+static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx,
+				       const struct tgsi_ind_register *ind,
+				       int rel_index)
+{
+	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	LLVMValueRef result;
+
+	result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
+	result = LLVMBuildLoad(gallivm->builder, result, "");
+	result = LLVMBuildAdd(gallivm->builder, result,
+			      lp_build_const_int32(gallivm, rel_index), "");
+	return result;
+}
+
+/**
+ * Calculate a dword address given an input or output register and a stride.
+ */
+static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
+				   const struct tgsi_full_dst_register *dst,
+				   const struct tgsi_full_src_register *src,
+				   LLVMValueRef vertex_dw_stride,
+				   LLVMValueRef base_addr)
+{
+	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
+	ubyte *name, *index, *array_first;
+	int first, param;
+	struct tgsi_full_dst_register reg;
+
+	/* Set the register description. The address computation is the same
+	 * for sources and destinations. */
+	if (src) {
+		reg.Register.File = src->Register.File;
+		reg.Register.Index = src->Register.Index;
+		reg.Register.Indirect = src->Register.Indirect;
+		reg.Register.Dimension = src->Register.Dimension;
+		reg.Indirect = src->Indirect;
+		reg.Dimension = src->Dimension;
+		reg.DimIndirect = src->DimIndirect;
+	} else
+		reg = *dst;
+
+	/* If the register is 2-dimensional (e.g. an array of vertices
+	 * in a primitive), calculate the base address of the vertex. */
+	if (reg.Register.Dimension) {
+		LLVMValueRef index;
+
+		if (reg.Dimension.Indirect)
+			index = get_indirect_index(si_shader_ctx, &reg.DimIndirect,
+						   reg.Dimension.Index);
+		else
+			index = lp_build_const_int32(gallivm, reg.Dimension.Index);
+
+		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+					 LLVMBuildMul(gallivm->builder, index,
+						      vertex_dw_stride, ""), "");
+	}
+
+	/* Get information about the register. */
+	if (reg.Register.File == TGSI_FILE_INPUT) {
+		name = info->input_semantic_name;
+		index = info->input_semantic_index;
+		array_first = info->input_array_first;
+	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
+		name = info->output_semantic_name;
+		index = info->output_semantic_index;
+		array_first = info->output_array_first;
+	} else {
+		assert(0);
+		return NULL;
+	}
+
+	if (reg.Register.Indirect) {
+		/* Add the relative address of the element. */
+		LLVMValueRef ind_index;
+
+		if (reg.Indirect.ArrayID)
+			first = array_first[reg.Indirect.ArrayID];
+		else
+			first = reg.Register.Index;
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect,
+					   reg.Register.Index - first);
+
+		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+				    LLVMBuildMul(gallivm->builder, ind_index,
+						 lp_build_const_int32(gallivm, 4), ""), "");
+
+		param = si_shader_io_get_unique_index(name[first], index[first]);
+	} else {
+		param = si_shader_io_get_unique_index(name[reg.Register.Index],
+						      index[reg.Register.Index]);
+	}
+
+	/* Add the base address of the element. */
+	return LLVMBuildAdd(gallivm->builder, base_addr,
+			    lp_build_const_int32(gallivm, param * 4), "");
+}
+
+/**
+ * Load from LDS.
+ *
+ * \param type		output value type
+ * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
+ * \param dw_addr	address in dwords
+ */
+static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
+			     enum tgsi_opcode_type type, unsigned swizzle,
+			     LLVMValueRef dw_addr)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef value;
+
+	if (swizzle == ~0) {
+		LLVMValueRef values[TGSI_NUM_CHANNELS];
+
+		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
+			values[chan] = lds_load(bld_base, type, chan, dw_addr);
+
+		return lp_build_gather_values(bld_base->base.gallivm, values,
+					      TGSI_NUM_CHANNELS);
+	}
+
+	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+			    lp_build_const_int32(gallivm, swizzle));
+
+	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	return LLVMBuildBitCast(gallivm->builder, value,
+				tgsi2llvmtype(bld_base, type), "");
+}
+
+/**
+ * Store to LDS.
+ *
+ * \param swizzle	offset (typically 0..3)
+ * \param dw_addr	address in dwords
+ * \param value		value to store
+ */
+static void lds_store(struct lp_build_tgsi_context * bld_base,
+		      unsigned swizzle, LLVMValueRef dw_addr,
+		      LLVMValueRef value)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+
+	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+			    lp_build_const_int32(gallivm, swizzle));
+
+	value = LLVMBuildBitCast(gallivm->builder, value,
+				 LLVMInt32TypeInContext(gallivm->context), "");
+	build_indexed_store(si_shader_ctx, si_shader_ctx->lds,
+			    dw_addr, value);
+}
+
+static LLVMValueRef fetch_input_tcs(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *reg,
+	enum tgsi_opcode_type type, unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef dw_addr, stride;
+
+	stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+	dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx);
+	dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static LLVMValueRef fetch_output_tcs(
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_src_register *reg,
+		enum tgsi_opcode_type type, unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	unsigned name = info->output_semantic_name[reg->Register.Index];
+	LLVMValueRef dw_addr, stride;
+
+	/* Just read the local temp "output" register to get TESSOUTER/INNER. */
+	if (!reg->Register.Indirect &&
+	    (name == TGSI_SEMANTIC_TESSOUTER ||
+	     name == TGSI_SEMANTIC_TESSINNER)) {
+		return radeon_llvm_emit_fetch(bld_base, reg, type, swizzle);
+	}
+
+	if (reg->Register.Dimension) {
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+	} else {
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+	}
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static LLVMValueRef fetch_input_tes(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *reg,
+	enum tgsi_opcode_type type, unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef dw_addr, stride;
+
+	if (reg->Register.Dimension) {
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+	} else {
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+	}
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
+			     const struct tgsi_full_instruction * inst,
+			     const struct tgsi_opcode_info * info,
+			     LLVMValueRef dst[4])
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct tgsi_shader_info *sinfo = &shader->selector->info;
+	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
+	unsigned chan_index;
+	LLVMValueRef dw_addr, stride;
+
+	/* Only handle per-patch and per-vertex outputs here.
+	 * Vectors will be lowered to scalars and this function will be called again.
+	 */
+	if (reg->Register.File != TGSI_FILE_OUTPUT ||
+	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
+		radeon_llvm_emit_store(bld_base, inst, info, dst);
+		return;
+	}
+
+	/* Write tessellation levels to "output" temp registers.
+	 * Also write them to LDS as per-patch outputs (below).
+	 */
+	if (!reg->Register.Indirect &&
+	    (sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSINNER ||
+             sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSOUTER))
+		radeon_llvm_emit_store(bld_base, inst, info, dst);
+
+	if (reg->Register.Dimension) {
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr);
+	} else {
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr);
+	}
+
+	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
+		LLVMValueRef value = dst[chan_index];
+
+		if (inst->Instruction.Saturate)
+			value = radeon_llvm_saturate(bld_base, value);
+
+		lds_store(bld_base, chan_index, dw_addr, value);
+	}
+}
+
 static LLVMValueRef fetch_input_gs(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
@@ -398,7 +819,7 @@ static LLVMValueRef fetch_input_gs(
 	args[1] = vtx_offset;
 	args[2] = lp_build_const_int32(gallivm,
 				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->gs_used_inputs) * 4 +
+							shader->selector->inputs_read) * 4 +
 					swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
@@ -616,6 +1037,7 @@ static void declare_system_value(
 {
 	struct si_shader_context *si_shader_ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
+	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMValueRef value = 0;
@@ -645,8 +1067,13 @@ static void declare_system_value(
 		break;
 
 	case TGSI_SEMANTIC_INVOCATIONID:
-		value = LLVMGetParam(radeon_bld->main_fn,
-				     SI_PARAM_GS_INSTANCE_ID);
+		if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+			value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
+		else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY)
+			value = LLVMGetParam(radeon_bld->main_fn,
+					     SI_PARAM_GS_INSTANCE_ID);
+		else
+			assert(!"INVOCATIONID not implemented");
 		break;
 
 	case TGSI_SEMANTIC_SAMPLEID:
@@ -683,6 +1110,48 @@ static void declare_system_value(
 			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
 		break;
 
+	case TGSI_SEMANTIC_TESSCOORD:
+	{
+		LLVMValueRef coord[4] = {
+			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u),
+			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v),
+			bld->zero,
+			bld->zero
+		};
+
+		/* For triangles, the vector should be (u, v, 1-u-v). */
+		if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
+		    PIPE_PRIM_TRIANGLES)
+			coord[2] = lp_build_sub(bld, bld->one,
+						lp_build_add(bld, coord[0], coord[1]));
+
+		value = lp_build_gather_values(gallivm, coord, 4);
+		break;
+	}
+
+	case TGSI_SEMANTIC_VERTICESIN:
+		value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+		break;
+
+	case TGSI_SEMANTIC_TESSINNER:
+	case TGSI_SEMANTIC_TESSOUTER:
+	{
+		LLVMValueRef dw_addr;
+		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
+
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
+				       lp_build_const_int32(gallivm, param * 4), "");
+
+		value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
+				 ~0, dw_addr);
+		break;
+	}
+
+	case TGSI_SEMANTIC_PRIMID:
+		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
+		break;
+
 	default:
 		assert(!"unknown system value");
 		return;
@@ -1338,6 +1807,134 @@ handle_semantic:
 	}
 }
 
+static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
+				  unsigned name, LLVMValueRef *out_ptr)
+{
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
+	LLVMValueRef output, out[4];
+	unsigned stride, outer_comps, inner_comps, i;
+
+	if (name != TGSI_SEMANTIC_TESSOUTER &&
+	    name != TGSI_SEMANTIC_TESSINNER) {
+		assert(0);
+		return;
+	}
+
+	switch (shader->key.tcs.prim_mode) {
+	case PIPE_PRIM_LINES:
+		stride = 2;
+		outer_comps = 2;
+		inner_comps = 0;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		stride = 4;
+		outer_comps = 3;
+		inner_comps = 1;
+		break;
+	case PIPE_PRIM_QUADS:
+		stride = 6;
+		outer_comps = 4;
+		inner_comps = 2;
+		break;
+	default:
+		assert(0);
+	}
+
+	/* Load the outputs as i32. */
+	for (i = 0; i < 4; i++)
+		out[i] = LLVMBuildBitCast(gallivm->builder,
+				LLVMBuildLoad(gallivm->builder, out_ptr[i], ""),
+				bld_base->uint_bld.elem_type, "");
+
+	/* Convert the outputs to vectors. */
+	if (name == TGSI_SEMANTIC_TESSOUTER)
+		output = lp_build_gather_values(gallivm, out,
+						util_next_power_of_two(outer_comps));
+	else if (inner_comps > 1)
+		output = lp_build_gather_values(gallivm, out, inner_comps);
+	else if (inner_comps == 1)
+		output = out[0];
+	else
+		return;
+
+	/* Get the buffer. */
+	rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
+	buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
+			lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
+
+	/* Get offsets. */
+	tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+			       SI_PARAM_TESS_FACTOR_OFFSET);
+	rel_patch_id = get_rel_patch_id(si_shader_ctx);
+	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
+				  lp_build_const_int32(gallivm, 4 * stride), "");
+
+	/* Store the output. */
+	if (name == TGSI_SEMANTIC_TESSOUTER) {
+		build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
+					   outer_comps, byteoffset, tf_base, 0);
+	} else if (inner_comps) {
+		build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
+					   inner_comps, byteoffset, tf_base,
+					   outer_comps * 4);
+	}
+}
+
+static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	unsigned i, chan;
+	LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+					      si_shader_ctx->param_rel_auto_id);
+	LLVMValueRef vertex_dw_stride =
+		unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
+	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
+						 vertex_dw_stride, "");
+
+	/* Write outputs to LDS. The next shader (TCS aka HS) will read
+	 * its inputs from it. */
+	for (i = 0; i < info->num_outputs; i++) {
+		LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
+		unsigned name = info->output_semantic_name[i];
+		unsigned index = info->output_semantic_index[i];
+		int param = si_shader_io_get_unique_index(name, index);
+		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
+					lp_build_const_int32(gallivm, param * 4), "");
+
+		for (chan = 0; chan < 4; chan++) {
+			lds_store(bld_base, chan, dw_addr,
+				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
+		}
+	}
+}
+
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	unsigned i;
+
+	/* Only write tessellation factors. Other outputs have already been
+	 * written to LDS by instructions. */
+	for (i = 0; i < info->num_outputs; i++) {
+		LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
+		unsigned name = info->output_semantic_name[i];
+
+		if (name == TGSI_SEMANTIC_TESSINNER ||
+		    name == TGSI_SEMANTIC_TESSOUTER) {
+			si_write_tess_factors(si_shader_ctx, name, out_ptr);
+		}
+	}
+}
+
 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
@@ -1347,6 +1944,9 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 					    si_shader_ctx->param_es2gs_offset);
+	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
+					   es->key.tes.es_enabled_outputs :
+					   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
@@ -1361,7 +1961,7 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 
 		param_index = get_param_index(info->output_semantic_name[i],
 					      info->output_semantic_index[i],
-					      es->key.vs.gs_used_inputs);
+					      enabled_outputs);
 		if (param_index < 0)
 			continue;
 
@@ -2201,19 +2801,19 @@ static void si_llvm_emit_ddxy(
 	indices[0] = bld_base->uint_bld.zero;
 	indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
 				     NULL, 0, LLVMReadNoneAttribute);
-	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
 	indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
 				  lp_build_const_int32(gallivm, 0xfffffffc), "");
-	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
 	indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
 				  lp_build_const_int32(gallivm,
 						       opcode == TGSI_OPCODE_DDX ? 1 : 2),
 				  "");
-	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
 	for (c = 0; c < 4; ++c) {
@@ -2432,6 +3032,9 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 		if (shader->key.vs.as_es) {
 			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+		} else if (shader->key.vs.as_ls) {
+			params[SI_PARAM_LS_OUT_LAYOUT] = i32;
+			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
 			if (shader->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST;
@@ -2447,11 +3050,44 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 		/* VGPRs */
 		params[si_shader_ctx->param_vertex_id = num_params++] = i32;
-		params[num_params++] = i32; /* unused*/
+		params[si_shader_ctx->param_rel_auto_id = num_params++] = i32;
 		params[num_params++] = i32; /* unused */
 		params[si_shader_ctx->param_instance_id = num_params++] = i32;
 		break;
 
+	case TGSI_PROCESSOR_TESS_CTRL:
+		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
+		params[SI_PARAM_TCS_IN_LAYOUT] = i32;
+		params[SI_PARAM_TESS_FACTOR_OFFSET] = i32;
+		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+
+		/* VGPRs */
+		params[SI_PARAM_PATCH_ID] = i32;
+		params[SI_PARAM_REL_IDS] = i32;
+		num_params = SI_PARAM_REL_IDS+1;
+		break;
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
+		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
+
+		if (shader->key.tes.as_es) {
+			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+		} else {
+			declare_streamout_params(si_shader_ctx, &shader->selector->so,
+						 params, i32, &num_params);
+		}
+		last_sgpr = num_params - 1;
+
+		/* VGPRs */
+		params[si_shader_ctx->param_tes_u = num_params++] = f32;
+		params[si_shader_ctx->param_tes_v = num_params++] = f32;
+		params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32;
+		params[si_shader_ctx->param_tes_patch_id = num_params++] = i32;
+		break;
+
 	case TGSI_PROCESSOR_GEOMETRY:
 		params[SI_PARAM_GS2VS_OFFSET] = i32;
 		params[SI_PARAM_GS_WAVE_ID] = i32;
@@ -2519,11 +3155,30 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	if (bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0))
-		si_shader_ctx->ddxy_lds =
+		si_shader_ctx->lds =
 			LLVMAddGlobalInAddressSpace(gallivm->module,
 						    LLVMArrayType(i32, 64),
 						    "ddxy_lds",
 						    LOCAL_ADDR_SPACE);
+
+	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
+	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
+	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
+		/* This is the upper bound, maximum is 32 inputs times 32 vertices */
+		unsigned vertex_data_dw_size = 32*32*4;
+		unsigned patch_data_dw_size = 32*4;
+		/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+		unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
+		unsigned lds_dwords = patch_dw_size;
+
+		/* The actual size is computed outside of the shader to reduce
+		 * the number of shader variants. */
+		si_shader_ctx->lds =
+			LLVMAddGlobalInAddressSpace(gallivm->module,
+						    LLVMArrayType(i32, lds_dwords),
+						    "tess_lds",
+						    LOCAL_ADDR_SPACE);
+	}
 }
 
 static void preload_constants(struct si_shader_context *si_shader_ctx)
@@ -2600,9 +3255,13 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	unsigned i;
 
-	if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX ||
-	    si_shader_ctx->shader->key.vs.as_es ||
-	    !si_shader_ctx->shader->selector->so.num_outputs)
+	/* Streamout can only be used if the shader is compiled as VS. */
+	if (!si_shader_ctx->shader->selector->so.num_outputs ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
+	     (si_shader_ctx->shader->key.vs.as_es ||
+	      si_shader_ctx->shader->key.vs.as_ls)) ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     si_shader_ctx->shader->key.tes.as_es))
 		return;
 
 	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -2633,6 +3292,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 
 	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
 	     si_shader_ctx->shader->key.vs.as_es) ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     si_shader_ctx->shader->key.tes.as_es) ||
 	    si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
 
@@ -2893,9 +3554,21 @@ static void si_dump_key(unsigned shader, union si_shader_key *key)
 		fprintf(stderr, "}\n");
 
 		if (key->vs.as_es)
-			fprintf(stderr, "  gs_used_inputs = 0x%"PRIx64"\n",
-				key->vs.gs_used_inputs);
+			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
+				key->vs.es_enabled_outputs);
 		fprintf(stderr, "  as_es = %u\n", key->vs.as_es);
+		fprintf(stderr, "  as_es = %u\n", key->vs.as_ls);
+		break;
+
+	case PIPE_SHADER_TESS_CTRL:
+		fprintf(stderr, "  prim_mode = %u\n", key->tcs.prim_mode);
+		break;
+
+	case PIPE_SHADER_TESS_EVAL:
+		if (key->tes.as_es)
+			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
+				key->tes.es_enabled_outputs);
+		fprintf(stderr, "  as_es = %u\n", key->tes.as_es);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
@@ -2995,11 +3668,25 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	switch (si_shader_ctx.type) {
 	case TGSI_PROCESSOR_VERTEX:
 		si_shader_ctx.radeon_bld.load_input = declare_input_vs;
-		if (shader->key.vs.as_es) {
+		if (shader->key.vs.as_ls)
+			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
+		else if (shader->key.vs.as_es)
 			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
-		} else {
+		else
+			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
+		break;
+	case TGSI_PROCESSOR_TESS_CTRL:
+		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
+		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
+		bld_base->emit_store = store_output_tcs;
+		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
+		break;
+	case TGSI_PROCESSOR_TESS_EVAL:
+		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
+		if (shader->key.tes.as_es)
+			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
+		else
 			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
-		}
 		break;
 	case TGSI_PROCESSOR_GEOMETRY:
 		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index be3e98a361a..2cba96f9587 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,6 +26,46 @@
  *      Christian König <christian.koenig@amd.com>
  */
 
+/* How linking tessellation shader inputs and outputs works.
+ *
+ * Inputs and outputs between shaders are stored in a buffer. This buffer
+ * lives in LDS (typical case for tessellation), but it can also live
+ * in memory. Each input or output has a fixed location within a vertex.
+ * The highest used input or output determines the stride between vertices.
+ *
+ * Since tessellation is only enabled in the OpenGL core profile,
+ * only these semantics are valid for per-vertex data:
+ *
+ *   Name             Location
+ *
+ *   POSITION         0
+ *   PSIZE            1
+ *   CLIPDIST0..1     2..3
+ *   CULLDIST0..1     (not implemented)
+ *   GENERIC0..31     4..35
+ *
+ * For example, a shader only writing GENERIC0 has the output stride of 5.
+ *
+ * Only these semantics are valid for per-patch data:
+ *
+ *   Name             Location
+ *
+ *   TESSOUTER        0
+ *   TESSINNER        1
+ *   PATCH0..29       2..31
+ *
+ * That's how independent shaders agree on input and output locations.
+ * The si_shader_io_get_unique_index function assigns the locations.
+ *
+ * Other required information for calculating the input and output addresses
+ * like the vertex stride, the patch stride, and the offsets where per-vertex
+ * and per-patch data start, is passed to the shader via user data SGPRs.
+ * The offsets and strides are calculated at draw time and aren't available
+ * at compile time.
+ *
+ * The same approach should be used for linking ES->GS in the future.
+ */
+
 #ifndef SI_SHADER_H
 #define SI_SHADER_H
 
@@ -43,9 +83,16 @@ struct radeon_shader_reloc;
 #define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
+#define SI_SGPR_LS_OUT_LAYOUT	12 /* VS(LS) only */
+#define SI_SGPR_TCS_OUT_OFFSETS	8  /* TCS & TES only */
+#define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
+#define SI_SGPR_TCS_IN_LAYOUT	10 /* TCS only */
 #define SI_SGPR_ALPHA_REF	8  /* PS only */
 
 #define SI_VS_NUM_USER_SGPR	12
+#define SI_LS_NUM_USER_SGPR	13
+#define SI_TCS_NUM_USER_SGPR	11
+#define SI_TES_NUM_USER_SGPR	10
 #define SI_GS_NUM_USER_SGPR	8
 #define SI_GSCOPY_NUM_USER_SGPR	4
 #define SI_PS_NUM_USER_SGPR	9
@@ -62,6 +109,31 @@ struct radeon_shader_reloc;
 #define SI_PARAM_START_INSTANCE	6
 /* the other VS parameters are assigned dynamically */
 
+/* Offsets where TCS outputs and TCS patch outputs live in LDS:
+ *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+ *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32
+ */
+#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */
+
+/* Layout of TCS outputs / TES inputs:
+ *   [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
+ *   [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
+ *   [26:31] = gl_PatchVerticesIn, max = 32
+ */
+#define SI_PARAM_TCS_OUT_LAYOUT	5 /* for TCS & TES */
+
+/* Layout of LS outputs / TCS inputs
+ *   [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
+ *   [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
+ */
+#define SI_PARAM_TCS_IN_LAYOUT	6 /* TCS only */
+#define SI_PARAM_LS_OUT_LAYOUT	7 /* same value as TCS_IN_LAYOUT, LS only */
+
+/* TCS only parameters. */
+#define SI_PARAM_TESS_FACTOR_OFFSET 7
+#define SI_PARAM_PATCH_ID	8
+#define SI_PARAM_REL_IDS	9
+
 /* GS only parameters */
 #define SI_PARAM_GS2VS_OFFSET	4
 #define SI_PARAM_GS_WAVE_ID	5
@@ -113,9 +185,24 @@ struct si_shader_selector {
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
-	uint64_t	gs_used_inputs; /* mask of "get_unique_index" bits */
+
+	/* masks of "get_unique_index" bits */
+	uint64_t	inputs_read;
+	uint64_t	outputs_written;
+	uint32_t	patch_outputs_written;
 };
 
+/* Valid shader configurations:
+ *
+ * API shaders       VS | TCS | TES | GS |pass| PS
+ * are compiled as:     |     |     |    |thru|
+ *                      |     |     |    |    |
+ * Only VS & PS:     VS | --  | --  | -- | -- | PS
+ * With GS:          ES | --  | --  | GS | VS | PS
+ * With Tessel.:     LS | HS  | VS  | -- | -- | PS
+ * With both:        LS | HS  | ES  | GS | VS | PS
+ */
+
 union si_shader_key {
 	struct {
 		unsigned	export_16bpc:8;
@@ -128,11 +215,23 @@ union si_shader_key {
 	} ps;
 	struct {
 		unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
-		/* The mask of "get_unique_index" bits, needed for ES,
-		 * it describes how the ES->GS ring buffer is laid out. */
-		uint64_t	gs_used_inputs;
-		unsigned	as_es:1;
+		/* Mask of "get_unique_index" bits - which outputs are read
+		 * by the next stage (needed by ES).
+		 * This describes how outputs are laid out in memory. */
+		uint64_t	es_enabled_outputs;
+		unsigned	as_es:1; /* export shader */
+		unsigned	as_ls:1; /* local shader */
 	} vs;
+	struct {
+		unsigned	prim_mode:3;
+	} tcs; /* tessellation control shader */
+	struct {
+		/* Mask of "get_unique_index" bits - which outputs are read
+		 * by the next stage (needed by ES).
+		 * This describes how outputs are laid out in memory. */
+		uint64_t	es_enabled_outputs;
+		unsigned	as_es:1; /* export shader */
+	} tes; /* tessellation evaluation shader */
 };
 
 struct si_shader {
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 6174dad7190..af001b39b3a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -142,8 +142,9 @@ struct si_shader_data {
  * Ring buffers:        0..1
  * Streamout buffers:   2..5
  */
-#define SI_RING_ESGS		0
-#define SI_RING_GSVS		1
+#define SI_RING_TESS_FACTOR	0 /* for HS (TCS)  */
+#define SI_RING_ESGS		0 /* for ES, GS */
+#define SI_RING_GSVS		1 /* for GS, VS */
 #define SI_NUM_RING_BUFFERS	2
 #define SI_SO_BUF_OFFSET	SI_NUM_RING_BUFFERS
 #define SI_NUM_RW_BUFFERS	(SI_SO_BUF_OFFSET + 4)
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 3eec217fbd2..2a0ff10285f 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -112,7 +112,7 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);
 
 	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->gs_used_inputs) * (16 >> 2));
+		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -351,9 +351,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				key->vs.instance_divisors[i] =
 					sctx->vertex_elements->elements[i].instance_divisor;
 
-		if (sctx->gs_shader) {
+		if (sctx->tes_shader)
+			key->vs.as_ls = 1;
+		else if (sctx->gs_shader) {
 			key->vs.as_es = 1;
-			key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs;
+			key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read;
+		}
+		break;
+	case PIPE_SHADER_TESS_CTRL:
+		key->tcs.prim_mode =
+			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+		break;
+	case PIPE_SHADER_TESS_EVAL:
+		if (sctx->gs_shader) {
+			key->tes.as_es = 1;
+			key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read;
 		}
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -487,10 +499,31 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 			case TGSI_SEMANTIC_PRIMID:
 				break;
 			default:
-				sel->gs_used_inputs |=
+				sel->inputs_read |=
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		break;
+
+	case PIPE_SHADER_VERTEX:
+	case PIPE_SHADER_TESS_CTRL:
+		for (i = 0; i < sel->info.num_outputs; i++) {
+			unsigned name = sel->info.output_semantic_name[i];
+			unsigned index = sel->info.output_semantic_index[i];
+
+			switch (name) {
+			case TGSI_SEMANTIC_TESSINNER:
+			case TGSI_SEMANTIC_TESSOUTER:
+			case TGSI_SEMANTIC_PATCH:
+				sel->patch_outputs_written |=
+					1llu << si_shader_io_get_unique_index(name, index);
+				break;
+			default:
+				sel->outputs_written |=
+					1llu << si_shader_io_get_unique_index(name, index);
+			}
+		}
+		break;
 	}
 
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE)

From aa2fa6723a0f8ab86ce2e55b1ac093f2cffd87c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 16:07:04 +0100
Subject: [PATCH 0557/1208] radeonsi: update si_get_vs_info and si_get_vs_state
 for tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 2cba96f9587..374021817cd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -268,14 +268,20 @@ struct si_shader {
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
 {
-	return sctx->gs_shader ? &sctx->gs_shader->info
-                               : &sctx->vs_shader->info;
+	if (sctx->gs_shader)
+		return &sctx->gs_shader->info;
+	else if (sctx->tes_shader)
+		return &sctx->tes_shader->info;
+	else
+		return &sctx->vs_shader->info;
 }
 
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
 	if (sctx->gs_shader)
 		return sctx->gs_shader->current->gs_copy_shader;
+	else if (sctx->tes_shader)
+		return sctx->tes_shader->current;
 	else
 		return sctx->vs_shader->current;
 }

From ebfd9e007191d582e22d252e9ff9b93fe4f8c593 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 17:07:34 +0100
Subject: [PATCH 0558/1208] radeonsi: add tessellation shader states
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ls_rsrc# will be emitted as part of the derived tessellation state

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.h      |   3 +
 src/gallium/drivers/radeonsi/si_state.h       |   2 +
 .../drivers/radeonsi/si_state_shaders.c       | 211 ++++++++++++++++--
 3 files changed, 203 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 374021817cd..b52714d5abe 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -264,6 +264,9 @@ struct si_shader {
 	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
+
+	unsigned		ls_rsrc1;
+	unsigned		ls_rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index af001b39b3a..b17f2f09012 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -102,6 +102,8 @@ union si_state {
 		struct si_pm4_state		*fb_blend;
 		struct si_pm4_state		*dsa_stencil_ref;
 		struct si_pm4_state		*ta_bordercolor_base;
+		struct si_pm4_state		*ls;
+		struct si_pm4_state		*hs;
 		struct si_pm4_state		*es;
 		struct si_pm4_state		*gs;
 		struct si_pm4_state		*gs_rings;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 2a0ff10285f..937e71de723 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,129 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
+static void si_set_tesseval_regs(struct si_shader *shader,
+				 struct si_pm4_state *pm4)
+{
+	struct tgsi_shader_info *info = &shader->selector->info;
+	unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
+	unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
+	bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
+	bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
+	unsigned type, partitioning, topology;
+
+	switch (tes_prim_mode) {
+	case PIPE_PRIM_LINES:
+		type = V_028B6C_TESS_ISOLINE;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		type = V_028B6C_TESS_TRIANGLE;
+		break;
+	case PIPE_PRIM_QUADS:
+		type = V_028B6C_TESS_QUAD;
+		break;
+	default:
+		assert(0);
+		return;
+	}
+
+	switch (tes_spacing) {
+	case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+		partitioning = V_028B6C_PART_FRAC_ODD;
+		break;
+	case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+		partitioning = V_028B6C_PART_FRAC_EVEN;
+		break;
+	case PIPE_TESS_SPACING_EQUAL:
+		partitioning = V_028B6C_PART_INTEGER;
+		break;
+	default:
+		assert(0);
+		return;
+	}
+
+	if (tes_point_mode)
+		topology = V_028B6C_OUTPUT_POINT;
+	else if (tes_prim_mode == PIPE_PRIM_LINES)
+		topology = V_028B6C_OUTPUT_LINE;
+	else if (tes_vertex_order_cw)
+		/* for some reason, this must be the other way around */
+		topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
+	else
+		topology = V_028B6C_OUTPUT_TRIANGLE_CW;
+
+	si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM,
+		       S_028B6C_TYPE(type) |
+		       S_028B6C_PARTITIONING(partitioning) |
+		       S_028B6C_TOPOLOGY(topology));
+}
+
+static void si_shader_ls(struct si_shader *shader)
+{
+	struct si_pm4_state *pm4;
+	unsigned num_sgprs, num_user_sgprs;
+	unsigned vgpr_comp_cnt;
+	uint64_t va;
+
+	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (pm4 == NULL)
+		return;
+
+	va = shader->bo->gpu_address;
+	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	/* We need at least 2 components for LS.
+	 * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
+	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
+
+	num_user_sgprs = SI_LS_NUM_USER_SGPR;
+	num_sgprs = shader->num_sgprs;
+	if (num_user_sgprs > num_sgprs) {
+		/* Last 2 reserved SGPRs are used for VCC */
+		num_sgprs = num_user_sgprs + 2;
+	}
+	assert(num_sgprs <= 104);
+
+	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
+
+	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
+		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
+	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs);
+}
+
+static void si_shader_hs(struct si_shader *shader)
+{
+	struct si_pm4_state *pm4;
+	unsigned num_sgprs, num_user_sgprs;
+	uint64_t va;
+
+	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (pm4 == NULL)
+		return;
+
+	va = shader->bo->gpu_address;
+	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	num_user_sgprs = SI_TCS_NUM_USER_SGPR;
+	num_sgprs = shader->num_sgprs;
+	/* One SGPR after user SGPRs is pre-loaded with tessellation factor
+	 * buffer offset. */
+	if ((num_user_sgprs + 1) > num_sgprs) {
+		/* Last 2 reserved SGPRs are used for VCC */
+		num_sgprs = num_user_sgprs + 1 + 2;
+	}
+	assert(num_sgprs <= 104);
+
+	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
+	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
+		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B428_SGPRS((num_sgprs - 1) / 8));
+	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+		       S_00B42C_USER_SGPR(num_user_sgprs));
+}
+
 static void si_shader_es(struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
@@ -48,9 +171,15 @@ static void si_shader_es(struct si_shader *shader)
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+	if (shader->selector->type == PIPE_SHADER_VERTEX) {
+		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+		num_user_sgprs = SI_VS_NUM_USER_SGPR;
+	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		num_user_sgprs = SI_TES_NUM_USER_SGPR;
+	} else
+		assert(0);
 
-	num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	num_sgprs = shader->num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 	if ((num_user_sgprs + 1) > num_sgprs) {
@@ -69,6 +198,9 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
 		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+
+	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+		si_set_tesseval_regs(shader, pm4);
 }
 
 static void si_shader_gs(struct si_shader *shader)
@@ -169,6 +301,9 @@ static void si_shader_vs(struct si_shader *shader)
 	} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
 		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
+	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		num_user_sgprs = SI_TES_NUM_USER_SGPR;
 	} else
 		assert(0);
 
@@ -220,6 +355,9 @@ static void si_shader_vs(struct si_shader *shader)
 			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
 			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
 			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+
+	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+		si_set_tesseval_regs(shader, pm4);
 }
 
 static void si_shader_ps(struct si_shader *shader)
@@ -317,7 +455,18 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 
 	switch (shader->selector->type) {
 	case PIPE_SHADER_VERTEX:
-		if (shader->key.vs.as_es)
+		if (shader->key.vs.as_ls)
+			si_shader_ls(shader);
+		else if (shader->key.vs.as_es)
+			si_shader_es(shader);
+		else
+			si_shader_vs(shader);
+		break;
+	case PIPE_SHADER_TESS_CTRL:
+		si_shader_hs(shader);
+		break;
+	case PIPE_SHADER_TESS_EVAL:
+		if (shader->key.tes.as_es)
 			si_shader_es(shader);
 		else
 			si_shader_vs(shader);
@@ -657,7 +806,18 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
 		c = p->next_variant;
 		switch (sel->type) {
 		case PIPE_SHADER_VERTEX:
-			if (p->key.vs.as_es)
+			if (p->key.vs.as_ls)
+				si_pm4_delete_state(sctx, ls, p->pm4);
+			else if (p->key.vs.as_es)
+				si_pm4_delete_state(sctx, es, p->pm4);
+			else
+				si_pm4_delete_state(sctx, vs, p->pm4);
+			break;
+		case PIPE_SHADER_TESS_CTRL:
+			si_pm4_delete_state(sctx, hs, p->pm4);
+			break;
+		case PIPE_SHADER_TESS_EVAL:
+			if (p->key.tes.as_es)
 				si_pm4_delete_state(sctx, es, p->pm4);
 			else
 				si_pm4_delete_state(sctx, vs, p->pm4);
@@ -995,16 +1155,46 @@ void si_update_shaders(struct si_context *sctx)
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
+	/* Update stages before GS. */
+	if (sctx->tes_shader) {
+		/* VS as LS */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+
+		if (sctx->tcs_shader) {
+			si_shader_select(ctx, sctx->tcs_shader);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+		} else {
+			assert(!"generate TCS shader");
+		}
+
+		si_shader_select(ctx, sctx->tes_shader);
+		if (sctx->gs_shader) {
+			/* TES as ES */
+			si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+		} else {
+			/* TES as VS */
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+			sctx->b.streamout.stride_in_dw = sctx->tes_shader->so.stride;
+		}
+	} else if (sctx->gs_shader) {
+		/* VS as ES */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+	} else {
+		/* VS as VS */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+		sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride;
+	}
+
+	/* Update GS. */
 	if (sctx->gs_shader) {
 		si_shader_select(ctx, sctx->gs_shader);
 		si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4);
-
 		sctx->b.streamout.stride_in_dw = sctx->gs_shader->so.stride;
 
-		si_shader_select(ctx, sctx->vs_shader);
-		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
-
 		if (!sctx->gs_rings)
 			si_init_gs_rings(sctx);
 		if (sctx->emitted.named.gs_rings != sctx->gs_rings)
@@ -1017,11 +1207,6 @@ void si_update_shaders(struct si_context *sctx)
 				   sctx->gs_shader->info.num_outputs * 16,
 				   64, true, true, 4, 16);
 	} else {
-		si_shader_select(ctx, sctx->vs_shader);
-		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
-
-		sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride;
-
 		si_pm4_bind_state(sctx, gs_rings, NULL);
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);

From b6f4fdf6a9238bdb9e0589eafb22396da347b792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 17:25:37 +0100
Subject: [PATCH 0559/1208] radeonsi: set up a ring buffer for tessellation
 factors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  2 +
 src/gallium/drivers/radeonsi/si_state.h       |  1 +
 .../drivers/radeonsi/si_state_shaders.c       | 37 +++++++++++++++++++
 4 files changed, 42 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 25dc4e7a96d..8678fe6260b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -42,6 +42,7 @@ static void si_destroy_context(struct pipe_context *context)
 
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
+	pipe_resource_reference(&sctx->tf_ring, NULL);
 	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
 	r600_resource_reference(&sctx->border_color_table, NULL);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
@@ -49,6 +50,7 @@ static void si_destroy_context(struct pipe_context *context)
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
+	si_pm4_delete_state(sctx, tf_ring, sctx->tf_state);
 	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
 		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 880f7beaa19..9e2ad188801 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -204,6 +204,8 @@ struct si_context {
 	struct si_pm4_state	*gs_rings;
 	struct pipe_resource	*esgs_ring;
 	struct pipe_resource	*gsvs_ring;
+	struct si_pm4_state	*tf_state;
+	struct pipe_resource	*tf_ring;
 
 	LLVMTargetMachineRef		tm;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index b17f2f09012..25220534eb4 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -107,6 +107,7 @@ union si_state {
 		struct si_pm4_state		*es;
 		struct si_pm4_state		*gs;
 		struct si_pm4_state		*gs_rings;
+		struct si_pm4_state		*tf_ring;
 		struct si_pm4_state		*vgt_shader_config;
 		struct si_pm4_state		*vs;
 		struct si_pm4_state		*ps;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 937e71de723..686718e4221 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1115,6 +1115,40 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 }
 
+static void si_init_tess_factor_ring(struct si_context *sctx)
+{
+	assert(!sctx->tf_state);
+	sctx->tf_state = CALLOC_STRUCT(si_pm4_state);
+
+	sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+					   PIPE_USAGE_DEFAULT,
+					   32768 * sctx->screen->b.info.max_se);
+	sctx->b.clear_buffer(&sctx->b.b, sctx->tf_ring, 0,
+			     sctx->tf_ring->width0, fui(0), false);
+	assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0);
+
+	if (sctx->b.chip_class >= CIK) {
+		si_pm4_set_reg(sctx->tf_state, R_030938_VGT_TF_RING_SIZE,
+			       S_030938_SIZE(sctx->tf_ring->width0 / 4));
+		si_pm4_set_reg(sctx->tf_state, R_030940_VGT_TF_MEMORY_BASE,
+			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+	} else {
+		si_pm4_set_reg(sctx->tf_state, R_008988_VGT_TF_RING_SIZE,
+			       S_008988_SIZE(sctx->tf_ring->width0 / 4));
+		si_pm4_set_reg(sctx->tf_state, R_0089B8_VGT_TF_MEMORY_BASE,
+			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+	}
+	si_pm4_add_bo(sctx->tf_state, r600_resource(sctx->tf_ring),
+		      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
+	si_pm4_bind_state(sctx, tf_ring, sctx->tf_state);
+
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
+			   SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
+			   sctx->tf_ring->width0, false, false, 0, 0);
+
+	sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
+}
+
 static void si_update_vgt_shader_config(struct si_context *sctx)
 {
 	/* Calculate the index of the config.
@@ -1157,6 +1191,9 @@ void si_update_shaders(struct si_context *sctx)
 
 	/* Update stages before GS. */
 	if (sctx->tes_shader) {
+		if (!sctx->tf_state)
+			si_init_tess_factor_ring(sctx);
+
 		/* VS as LS */
 		si_shader_select(ctx, sctx->vs_shader);
 		si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);

From db267a04ceee51ca1698c3a68127508fa1e31c86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 18 May 2015 01:59:37 +0200
Subject: [PATCH 0560/1208] radeonsi: implement a fixed-function tessellation
 control shader and its state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 src/gallium/drivers/radeonsi/si_state.c       | 25 +++++++++++
 .../drivers/radeonsi/si_state_shaders.c       | 41 ++++++++++++++++++-
 4 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 8678fe6260b..b545a4f4a5f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -59,6 +59,8 @@ static void si_destroy_context(struct pipe_context *context)
 	if (sctx->dummy_pixel_shader) {
 		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
 	}
+	if (sctx->fixed_func_tcs_shader)
+		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader);
 	sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
 	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
 	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 9e2ad188801..08eae960c06 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -136,6 +136,7 @@ struct si_context {
 	struct si_screen		*screen;
 	struct si_pm4_state		*init_config;
 	struct pipe_fence_handle	*last_gfx_fence;
+	struct si_shader_selector	*fixed_func_tcs_shader;
 
 	union {
 		struct {
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 316c689357b..482e4fed5a0 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2895,6 +2895,30 @@ static void si_set_polygon_stipple(struct pipe_context *ctx,
 	}
 }
 
+static void si_set_tess_state(struct pipe_context *ctx,
+			      const float default_outer_level[4],
+			      const float default_inner_level[2])
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct pipe_constant_buffer cb;
+	float array[8];
+
+	memcpy(array, default_outer_level, sizeof(float) * 4);
+	memcpy(array+4, default_inner_level, sizeof(float) * 2);
+
+	cb.buffer = NULL;
+	cb.user_buffer = NULL;
+	cb.buffer_size = sizeof(array);
+
+	si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer,
+			       (void*)array, sizeof(array),
+			       &cb.buffer_offset);
+
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_TESS_CTRL,
+				 SI_DRIVER_STATE_CONST_BUF, &cb);
+	pipe_resource_reference(&cb.buffer, NULL);
+}
+
 static void si_texture_barrier(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -2972,6 +2996,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.texture_barrier = si_texture_barrier;
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
+	sctx->b.b.set_tess_state = si_set_tess_state;
 
 	sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
 	sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 686718e4221..76d590b4e6c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -30,6 +30,7 @@
 #include "sid.h"
 
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
@@ -1149,6 +1150,40 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
 	sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 }
 
+/**
+ * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
+ * VS passes its outputs to TES directly, so the fixed-function shader only
+ * has to write TESSOUTER and TESSINNER.
+ */
+static void si_generate_fixed_func_tcs(struct si_context *sctx)
+{
+	struct ureg_src const0, const1;
+	struct ureg_dst tessouter, tessinner;
+	struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+
+	if (!ureg)
+		return; /* if we get here, we're screwed */
+
+	assert(!sctx->fixed_func_tcs_shader);
+
+	ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF);
+	const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0),
+				    SI_DRIVER_STATE_CONST_BUF);
+	const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1),
+				    SI_DRIVER_STATE_CONST_BUF);
+
+	tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+	tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+
+	ureg_MOV(ureg, tessouter, const0);
+	ureg_MOV(ureg, tessinner, const1);
+	ureg_END(ureg);
+
+	sctx->fixed_func_tcs_shader =
+		ureg_create_shader_and_destroy(ureg, &sctx->b.b);
+	assert(sctx->fixed_func_tcs_shader);
+}
+
 static void si_update_vgt_shader_config(struct si_context *sctx)
 {
 	/* Calculate the index of the config.
@@ -1202,7 +1237,11 @@ void si_update_shaders(struct si_context *sctx)
 			si_shader_select(ctx, sctx->tcs_shader);
 			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
 		} else {
-			assert(!"generate TCS shader");
+			if (!sctx->fixed_func_tcs_shader)
+				si_generate_fixed_func_tcs(sctx);
+			si_shader_select(ctx, sctx->fixed_func_tcs_shader);
+			si_pm4_bind_state(sctx, hs,
+					  sctx->fixed_func_tcs_shader->current->pm4);
 		}
 
 		si_shader_select(ctx, sctx->tes_shader);

From 74c1001d13f07538e349c157598f9de83f252c49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:01:18 +0100
Subject: [PATCH 0561/1208] radeonsi: add derived tessellation state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c  |   7 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  11 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  | 123 ++++++++++++++++++
 .../drivers/radeonsi/si_state_shaders.c       |   8 +-
 4 files changed, 146 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index dc8702e95a6..b4c5b0a6f48 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -165,6 +165,9 @@ void si_begin_new_cs(struct si_context *ctx)
 	r600_postflush_resume_features(&ctx->b);
 
 	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+
+	/* Invalidate various draw states so that they are emitted before
+	 * the first draw call. */
 	si_invalidate_draw_sh_constants(ctx);
 	ctx->last_primitive_restart_en = -1;
 	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
@@ -174,4 +177,8 @@ void si_begin_new_cs(struct si_context *ctx)
 	ctx->last_rast_prim = -1;
 	ctx->last_sc_line_stipple = ~0;
 	ctx->emit_scratch_reloc = true;
+	ctx->last_ls = NULL;
+	ctx->last_tcs = NULL;
+	ctx->last_tes_sh_base = -1;
+	ctx->last_num_tcs_input_cp = -1;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 08eae960c06..ad3677435e4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -48,7 +48,8 @@
 
 #define SI_MAX_DRAW_CS_DWORDS \
 	(/*scratch:*/ 3 + /*derived prim state:*/ 3 + \
-	 /*draw regs:*/ 16 + /*draw packets:*/ 31)
+	 /*draw regs:*/ 16 + /*draw packets:*/ 31 +\
+	 /*derived tess state:*/ 19)
 
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
@@ -224,7 +225,7 @@ struct si_context {
 	bool			db_depth_disable_expclear;
 	unsigned		ps_db_shader_control;
 
-	/* Draw state. */
+	/* Emitted draw state. */
 	int			last_base_vertex;
 	int			last_start_instance;
 	int			last_sh_base_reg;
@@ -241,6 +242,12 @@ struct si_context {
 	boolean                 emit_scratch_reloc;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
+
+	/* Emitted derived tessellation state. */
+	struct si_shader	*last_ls; /* local shader (VS) */
+	struct si_shader_selector *last_tcs;
+	int			last_num_tcs_input_cp;
+	int			last_tes_sh_base;
 };
 
 /* cik_sdma.c */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index b877a7e0e95..fd507b0fa1d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -96,6 +96,125 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
 	return prim_conv[mode];
 }
 
+/**
+ * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
+ * LS.LDS_SIZE is shared by all 3 shader stages.
+ *
+ * The information about LDS and other non-compile-time parameters is then
+ * written to userdata SGPRs.
+ */
+static void si_emit_derived_tess_state(struct si_context *sctx,
+				       const struct pipe_draw_info *info,
+				       unsigned *num_patches)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct si_shader_selector *ls = sctx->vs_shader;
+	/* The TES pointer will only be used for sctx->last_tcs.
+	 * It would be wrong to think that TCS = TES. */
+	struct si_shader_selector *tcs =
+		sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader;
+	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
+	unsigned num_tcs_input_cp = info->vertices_per_patch;
+	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
+	unsigned num_tcs_patch_outputs;
+	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
+	unsigned input_patch_size, output_patch_size, output_patch0_offset;
+	unsigned perpatch_output_offset, lds_size, ls_rsrc2;
+	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
+
+	*num_patches = 1; /* TODO: calculate this */
+
+	if (sctx->last_ls == ls->current &&
+	    sctx->last_tcs == tcs &&
+	    sctx->last_tes_sh_base == tes_sh_base &&
+	    sctx->last_num_tcs_input_cp == num_tcs_input_cp)
+		return;
+
+	sctx->last_ls = ls->current;
+	sctx->last_tcs = tcs;
+	sctx->last_tes_sh_base = tes_sh_base;
+	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+
+	/* This calculates how shader inputs and outputs among VS, TCS, and TES
+	 * are laid out in LDS. */
+	num_tcs_inputs = util_last_bit64(ls->outputs_written);
+
+	if (sctx->tcs_shader) {
+		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
+		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
+	} else {
+		/* No TCS. Route varyings from LS to TES. */
+		num_tcs_outputs = num_tcs_inputs;
+		num_tcs_output_cp = num_tcs_input_cp;
+		num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
+	}
+
+	input_vertex_size = num_tcs_inputs * 16;
+	output_vertex_size = num_tcs_outputs * 16;
+
+	input_patch_size = num_tcs_input_cp * input_vertex_size;
+
+	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+
+	output_patch0_offset = sctx->tcs_shader ? input_patch_size * *num_patches : 0;
+	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+
+	lds_size = output_patch0_offset + output_patch_size * *num_patches;
+	ls_rsrc2 = ls->current->ls_rsrc2;
+
+	if (sctx->b.chip_class >= CIK) {
+		assert(lds_size <= 65536);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512);
+	} else {
+		assert(lds_size <= 32768);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256);
+	}
+
+	/* Due to a hw bug, RSRC2_LS must be written twice with another
+	 * LS register written in between. */
+	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
+		si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+	si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls_rsrc2);
+
+	/* Compute userdata SGPRs. */
+	assert(((input_vertex_size / 4) & ~0xff) == 0);
+	assert(((output_vertex_size / 4) & ~0xff) == 0);
+	assert(((input_patch_size / 4) & ~0x1fff) == 0);
+	assert(((output_patch_size / 4) & ~0x1fff) == 0);
+	assert(((output_patch0_offset / 16) & ~0xffff) == 0);
+	assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
+	assert(num_tcs_input_cp <= 32);
+	assert(num_tcs_output_cp <= 32);
+
+	tcs_in_layout = (input_patch_size / 4) |
+			((input_vertex_size / 4) << 13);
+	tcs_out_layout = (output_patch_size / 4) |
+			 ((output_vertex_size / 4) << 13);
+	tcs_out_offsets = (output_patch0_offset / 16) |
+			  ((perpatch_output_offset / 16) << 16);
+
+	/* Set them for LS. */
+	si_write_sh_reg(cs,
+		R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4,
+		tcs_in_layout);
+
+	/* Set them for TCS. */
+	si_write_sh_reg_seq(cs,
+		R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OUT_OFFSETS * 4, 3);
+	radeon_emit(cs, tcs_out_offsets);
+	radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+	radeon_emit(cs, tcs_in_layout);
+
+	/* Set them for TES. */
+	si_write_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OUT_OFFSETS * 4, 2);
+	radeon_emit(cs, tcs_out_offsets);
+	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 					  const struct pipe_draw_info *info)
 {
@@ -208,8 +327,12 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
+	unsigned num_patches = 0;
 	unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info);
 
+	if (sctx->tes_shader)
+		si_emit_derived_tess_state(sctx, info, &num_patches);
+
 	/* Draw state. */
 	if (prim != sctx->last_prim ||
 	    ia_multi_vgt_param != sctx->last_multi_vgt_param) {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 76d590b4e6c..074e44a60e0 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -745,11 +745,15 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->tcs_shader != !!sel;
 
 	if (sctx->tcs_shader == sel)
 		return;
 
 	sctx->tcs_shader = sel;
+
+	if (enable_changed)
+		sctx->last_tcs = NULL; /* invalidate derived tess state */
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
@@ -765,8 +769,10 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 	sctx->clip_regs.dirty = true;
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
-	if (enable_changed)
+	if (enable_changed) {
 		si_shader_change_notify(sctx);
+		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
+	}
 }
 
 static void si_make_dummy_ps(struct si_context *sctx)

From 09d02fa463b7207464c99ca887e253476fde851e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:06:34 +0100
Subject: [PATCH 0562/1208] radeonsi: update IA_MULTI_VGT_PARAM for
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 54 ++++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index fd507b0fa1d..39e57e229af 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -31,6 +31,7 @@
 
 #include "util/u_index_modify.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_prim.h"
 
 static void si_decompress_textures(struct si_context *sctx)
 {
@@ -216,7 +217,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 }
 
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
-					  const struct pipe_draw_info *info)
+					  const struct pipe_draw_info *info,
+					  unsigned num_patches)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned prim = info->mode;
@@ -225,11 +227,41 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	/* SWITCH_ON_EOP(0) is always preferable. */
 	bool wd_switch_on_eop = false;
 	bool ia_switch_on_eop = false;
+	bool ia_switch_on_eoi = false;
 	bool partial_vs_wave = false;
+	bool partial_es_wave = false;
 
 	if (sctx->gs_shader)
 		primgroup_size = 64; /* recommended with a GS */
 
+	if (sctx->tes_shader) {
+		unsigned num_cp_out =
+			sctx->tcs_shader ?
+			sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+			info->vertices_per_patch;
+		unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out);
+
+		primgroup_size = MIN2(primgroup_size, max_size);
+
+		/* primgroup_size must be set to a multiple of NUM_PATCHES */
+		primgroup_size = (primgroup_size / num_patches) * num_patches;
+
+		/* SWITCH_ON_EOI must be set if PrimID is used.
+		 * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+		if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) ||
+		    sctx->tes_shader->info.uses_primid) {
+			ia_switch_on_eoi = true;
+			partial_es_wave = true;
+		}
+
+		/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
+		if ((sctx->b.family == CHIP_TAHITI ||
+		     sctx->b.family == CHIP_PITCAIRN ||
+		     sctx->b.family == CHIP_BONAIRE) &&
+		    sctx->gs_shader)
+			partial_vs_wave = true;
+	}
+
 	/* This is a hardware requirement. */
 	if ((rs && rs->line_stipple_enable) ||
 	    (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) {
@@ -268,8 +300,23 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}
 
+	/* Hw bug with single-primitive instances and SWITCH_ON_EOI
+	 * on multi-SE chips. */
+	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
+	    (info->indirect ||
+	     (info->instance_count > 1 &&
+	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
+
+	/* Instancing bug on 2 SE chips. */
+	if (sctx->b.screen->info.max_se == 2 && ia_switch_on_eoi &&
+	    (info->indirect || info->instance_count > 1))
+		partial_vs_wave = true;
+
 	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
+		S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
 		S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
+		S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
 		S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
 		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0);
 }
@@ -327,12 +374,13 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
-	unsigned num_patches = 0;
-	unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info);
+	unsigned ia_multi_vgt_param, num_patches = 0;
 
 	if (sctx->tes_shader)
 		si_emit_derived_tess_state(sctx, info, &num_patches);
 
+	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
+
 	/* Draw state. */
 	if (prim != sctx->last_prim ||
 	    ia_multi_vgt_param != sctx->last_multi_vgt_param) {

From 3344699243b856c3bc7b8ea08a949d2e3274e871 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:07:51 +0100
Subject: [PATCH 0563/1208] radeonsi: set VGT_LS_HS_CONFIG for tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h       |  3 ++-
 src/gallium/drivers/radeonsi/si_state_draw.c | 28 +++++++++++++++++---
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index b4c5b0a6f48..1aef37b18b5 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -174,6 +174,7 @@ void si_begin_new_cs(struct si_context *ctx)
 	ctx->last_gs_out_prim = -1;
 	ctx->last_prim = -1;
 	ctx->last_multi_vgt_param = -1;
+	ctx->last_ls_hs_config = -1;
 	ctx->last_rast_prim = -1;
 	ctx->last_sc_line_stipple = ~0;
 	ctx->emit_scratch_reloc = true;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index ad3677435e4..7b2263b1162 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -48,7 +48,7 @@
 
 #define SI_MAX_DRAW_CS_DWORDS \
 	(/*scratch:*/ 3 + /*derived prim state:*/ 3 + \
-	 /*draw regs:*/ 16 + /*draw packets:*/ 31 +\
+	 /*draw regs:*/ 18 + /*draw packets:*/ 31 +\
 	 /*derived tess state:*/ 19)
 
 /* Instruction cache. */
@@ -234,6 +234,7 @@ struct si_context {
 	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
+	int			last_ls_hs_config;
 	int			last_rast_prim;
 	unsigned		last_sc_line_stipple;
 	int			current_rast_prim; /* primitive type after TES, GS */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 39e57e229af..dc90d3f5fb5 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -321,6 +321,24 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0);
 }
 
+static unsigned si_get_ls_hs_config(struct si_context *sctx,
+				    const struct pipe_draw_info *info,
+				    unsigned num_patches)
+{
+	unsigned num_output_cp;
+
+	if (!sctx->tes_shader)
+		return 0;
+
+	num_output_cp = sctx->tcs_shader ?
+		sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+		info->vertices_per_patch;
+
+	return S_028B58_NUM_PATCHES(num_patches) |
+		S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) |
+		S_028B58_HS_NUM_OUTPUT_CP(num_output_cp);
+}
+
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
@@ -374,27 +392,31 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
-	unsigned ia_multi_vgt_param, num_patches = 0;
+	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
 
 	if (sctx->tes_shader)
 		si_emit_derived_tess_state(sctx, info, &num_patches);
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
+	ls_hs_config = si_get_ls_hs_config(sctx, info, num_patches);
 
 	/* Draw state. */
 	if (prim != sctx->last_prim ||
-	    ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+	    ia_multi_vgt_param != sctx->last_multi_vgt_param ||
+	    ls_hs_config != sctx->last_ls_hs_config) {
 		if (sctx->b.chip_class >= CIK) {
 			radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
 			radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
 			radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
-			radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */
+			radeon_emit(cs, ls_hs_config); /* VGT_LS_HS_CONFIG */
 		} else {
 			r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
 			r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+			r600_write_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
 		}
 		sctx->last_prim = prim;
 		sctx->last_multi_vgt_param = ia_multi_vgt_param;
+		sctx->last_ls_hs_config = ls_hs_config;
 	}
 
 	if (gs_out_prim != sctx->last_gs_out_prim) {

From 5aa5f9082347941fd8ac2fc3e94cd91aa1489982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:09:18 +0100
Subject: [PATCH 0564/1208] radeonsi: set the rasterization primitive type for
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index dc90d3f5fb5..1793c2e6f91 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -730,6 +730,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 * current_rast_prim for this draw_vbo call. */
 	if (sctx->gs_shader)
 		sctx->current_rast_prim = sctx->gs_shader->gs_output_prim;
+	else if (sctx->tes_shader)
+		sctx->current_rast_prim =
+			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		sctx->current_rast_prim = info->mode;
 

From 99bf47f603502cd6f3a6040ba17c0881e3b0c15f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:10:38 +0100
Subject: [PATCH 0565/1208] radeonsi: add assertions into draw_vbo and check
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 1793c2e6f91..ec8dd84c9dd 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -718,8 +718,14 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    (info->indexed || !info->count_from_stream_output))
 		return;
 
-	if (!sctx->ps_shader || !sctx->vs_shader)
+	if (!sctx->ps_shader || !sctx->vs_shader) {
+		assert(0);
 		return;
+	}
+	if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) {
+		assert(0);
+		return;
+	}
 
 	si_decompress_textures(sctx);
 

From 12df9a7876ed0e6cfffb7871dc37bf66c95edca3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:16:02 +0100
Subject: [PATCH 0566/1208] radeonsi: update invariant registers for
 tessellation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 482e4fed5a0..ab5c3ca2a03 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3125,8 +3125,8 @@ static void si_init_config(struct si_context *sctx)
 
 	si_cmd_context_control(pm4);
 
-	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0);
-	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0);
+	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
+	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
 	/* FIXME calculate these values somehow ??? */
 	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, 0x80);
@@ -3251,6 +3251,10 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
 
 	if (sctx->b.chip_class >= CIK) {
+		si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffc));
+		si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
+		si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xfffe));
+		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
 		si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
 		si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0));
 		si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));

From a193c4978b0b536266afc7887457ab11473671d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 18 May 2015 14:41:35 +0200
Subject: [PATCH 0567/1208] radeonsi: add scratch buffer support for
 tessellation shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 .../drivers/radeonsi/si_state_shaders.c       | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 074e44a60e0..24afed06fce 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -122,7 +122,8 @@ static void si_shader_ls(struct si_shader *shader)
 	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
-	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs);
+	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
@@ -154,7 +155,8 @@ static void si_shader_hs(struct si_shader *shader)
 		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((num_sgprs - 1) / 8));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
-		       S_00B42C_USER_SGPR(num_user_sgprs));
+		       S_00B42C_USER_SGPR(num_user_sgprs) |
+		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_es(struct si_shader *shader)
@@ -1066,10 +1068,14 @@ static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx,
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
+	unsigned bytes = 0;
 
-	return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader),
-			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader),
-			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader));
+	return bytes;
 }
 
 static void si_update_spi_tmpring_size(struct si_context *sctx)
@@ -1103,15 +1109,29 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
 		if (si_update_scratch_buffer(sctx, sctx->gs_shader))
 			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
+		if (si_update_scratch_buffer(sctx, sctx->tcs_shader))
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
 
-		/* VS can be bound as ES or VS. */
-		if (sctx->gs_shader) {
+		/* VS can be bound as LS, ES, or VS. */
+		if (sctx->tes_shader) {
+			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+				si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+		} else if (sctx->gs_shader) {
 			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
 				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
 		} else {
 			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
 				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
 		}
+
+		/* TES can be bound as ES or VS. */
+		if (sctx->gs_shader) {
+			if (si_update_scratch_buffer(sctx, sctx->tes_shader))
+				si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+		} else {
+			if (si_update_scratch_buffer(sctx, sctx->tes_shader))
+				si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+		}
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */

From bac12c8948681a23fd1a8f8a6bbb5523ccfe0939 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 22 Feb 2015 18:46:53 +0100
Subject: [PATCH 0568/1208] radeonsi: enable tessellation, update GL3.txt &
 release notes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 docs/GL3.txt                           |  2 +-
 docs/relnotes/10.7.0.html              |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c | 14 +++++++++++---
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 214425713b8..197b676f64b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -112,7 +112,7 @@ GL 4.0, GLSL 4.00:
   GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_shader_subroutine                             started (Dave)
-  GL_ARB_tessellation_shader                           started (Chris, Ilia)
+  GL_ARB_tessellation_shader                           DONE (radeonsi)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 0d993313f34..c7d0d2480ff 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -50,6 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
+<li>GL_ARB_tessellation_shader on radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index b545a4f4a5f..c2985b8d224 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -293,9 +293,11 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 		return 0;
 
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+		return 30;
+
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
 		return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600;
 
@@ -375,6 +377,13 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_GEOMETRY:
 		break;
+	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
+		/* LLVM 3.6.2 is required for tessellation because of bug fixes there */
+		if (HAVE_LLVM < 0x0306 ||
+		    (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 2))
+			return 0;
+		break;
 	case PIPE_SHADER_COMPUTE:
 		switch (param) {
 		case PIPE_SHADER_CAP_PREFERRED_IR:
@@ -401,7 +410,6 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		}
 		break;
 	default:
-		/* TODO: support tessellation */
 		return 0;
 	}
 
@@ -433,7 +441,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		/* Indirection of geometry shader input dimension is not
 		 * handled yet
 		 */
-		return shader < PIPE_SHADER_GEOMETRY;
+		return shader != PIPE_SHADER_GEOMETRY;
 	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:

From b406c34a65677cac2517336d93ab279c3d35fce6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 10:39:26 +1000
Subject: [PATCH 0569/1208] i965: fix warning since tess merge.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 6c979334ac7..33469d4ad69 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -102,6 +102,8 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
       }
       case MESA_SHADER_GEOMETRY:
       case MESA_SHADER_COMPUTE:
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
          unreachable("fs_visitor not used for these stages yet.");
          break;
       case MESA_SHADER_FRAGMENT:

From 18955e8a80ee2b344eaf3eb1d24eed90f6ba8334 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 11:13:36 +1000
Subject: [PATCH 0570/1208] glsl/tests: fix varying_test since tess changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes make check since the tess changes.

Tested-by: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/tests/varyings_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/tests/varyings_test.cpp b/src/glsl/tests/varyings_test.cpp
index 4573529f619..62f8c6bc5ad 100644
--- a/src/glsl/tests/varyings_test.cpp
+++ b/src/glsl/tests/varyings_test.cpp
@@ -70,7 +70,7 @@ public:
    hash_table *consumer_interface_inputs;
 
    const glsl_type *simple_interface;
-   ir_variable *junk[VARYING_SLOT_MAX];
+   ir_variable *junk[VARYING_SLOT_TESS_MAX];
 };
 
 link_varyings::link_varyings()

From 2ca1f767818b02354735b58cef896abb8677e4ed Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 10:39:47 +1000
Subject: [PATCH 0571/1208] dispatch_sanity.cpp: remove commented out tess
 entries

These entries were put in the GL4.0 section, so removed the commented
out ones.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/tests/dispatch_sanity.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 7e3a97baea8..d1bd0cee61e 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -1577,8 +1577,6 @@ const struct function gl_core_functions_possible[] = {
 // { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
 // { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
 // { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
-// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
-// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
 
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },

From 2ffe9b542116580571b157de8a89476b22694ea9 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 21:00:34 +1200
Subject: [PATCH 0572/1208] mesa: Add stubs for ARB_shader_subroutine
 entrypoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/shaderapi.c | 61 +++++++++++++++++++++++++++++++++++++++
 src/mesa/main/shaderapi.h | 34 ++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 38365908a86..495cb46808c 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2158,3 +2158,64 @@ _mesa_PatchParameterfv(GLenum pname, const GLfloat *values)
    }
 }
 
+/**
+ * ARB_shader_subroutine
+ */
+GLint GLAPIENTRY
+_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
+                                   const GLchar *name)
+{
+   return -1;
+}
+
+
+GLuint GLAPIENTRY
+_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
+                         const GLchar *name)
+{
+   return GL_INVALID_INDEX;
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
+                                   GLuint index, GLenum pname, GLint *values)
+{
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
+                                     GLuint index, GLsizei bufsize,
+                                     GLsizei *length, GLchar *name)
+{
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
+                              GLuint index, GLsizei bufsize,
+                              GLsizei *length, GLchar *name)
+{
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
+                            const GLuint *indices)
+{
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
+                              GLuint *params)
+{
+}
+
+
+GLvoid GLAPIENTRY
+_mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
+                        GLenum pname, GLint *values)
+{
+}
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index 90e2e2d25d4..c2f04d875a7 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -271,6 +271,40 @@ _mesa_PatchParameteri(GLenum pname, GLint value);
 extern void GLAPIENTRY
 _mesa_PatchParameterfv(GLenum pname, const GLfloat *values);
 
+/* GL_ARB_shader_subroutine */
+extern GLint GLAPIENTRY
+_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
+                                   const GLchar *name);
+
+extern GLuint GLAPIENTRY
+_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
+                         const GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
+                                   GLuint index, GLenum pname, GLint *values);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
+                                     GLuint index, GLsizei bufsize,
+                                     GLsizei *length, GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
+                              GLuint index, GLsizei bufsize,
+                              GLsizei *length, GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
+                            const GLuint *indices);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
+                              GLuint *params);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
+                        GLenum pname, GLint *values);
 
 #ifdef __cplusplus
 }

From b8f3e316bca2c9abd3c885a9447ecf29446d0ccb Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 20:38:53 +1200
Subject: [PATCH 0573/1208] glapi: Add ARB_shader_subroutine functions and
 enums (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix output="true" and LENGTH typo

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mapi/glapi/gen/ARB_shader_subroutine.xml | 84 ++++++++++++++++++++
 src/mapi/glapi/gen/Makefile.am               |  1 +
 src/mapi/glapi/gen/gl_API.xml                |  4 +-
 3 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 src/mapi/glapi/gen/ARB_shader_subroutine.xml

diff --git a/src/mapi/glapi/gen/ARB_shader_subroutine.xml b/src/mapi/glapi/gen/ARB_shader_subroutine.xml
new file mode 100644
index 00000000000..04b75cb8f59
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_shader_subroutine.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- Note: no GLX protocol info yet. -->
+
+<OpenGLAPI>
+
+<category name="GL_ARB_shader_subroutine" number="90">
+
+    <function name="GetSubroutineUniformLocation" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="name" type="const GLchar *"/>
+        <return type="GLint"/>
+    </function>
+
+    <function name="GetSubroutineIndex" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="name" type="const GLchar *"/>
+        <return type="GLuint"/>
+    </function>
+
+    <function name="GetActiveSubroutineUniformiv" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="GLint *" output="true"/>
+    </function>
+
+    <function name="GetActiveSubroutineUniformName" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="bufsize" type="GLsizei"/>
+        <param name="length" type="GLsizei *" output="true"/>
+        <param name="name" type="GLchar *" output="true"/>
+    </function>
+
+    <function name="GetActiveSubroutineName" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="bufsize" type="GLsizei"/>
+        <param name="length" type="GLsizei *" output="true"/>
+        <param name="name" type="GLchar *" output="true"/>
+    </function>
+
+    <function name="UniformSubroutinesuiv" offset="assign">
+        <param name="shadertype" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="indices" type="const GLuint *"/>
+    </function>
+
+    <function name="GetUniformSubroutineuiv" offset="assign">
+        <param name="shadertype" type="GLenum"/>
+        <param name="location" type="GLint"/>
+        <param name="params" type="GLuint *" output="true"/>
+    </function>
+
+    <function name="GetProgramStageiv" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="GLint *" output="true"/>
+    </function>
+
+    <enum name="ACTIVE_SUBROUTINES" value="0x8DE5"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORMS" value="0x8DE6"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS" value="0x8E47"/>
+    <enum name="ACTIVE_SUBROUTINE_MAX_LENGTH" value="0x8E48"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH" value="0x8E49"/>
+
+    <enum name="MAX_SUBROUTINES" value="0x8DE7"/>
+    <enum name="MAX_SUBROUTINE_UNIFORM_LOCATIONS" value="0x8DE8"/>
+
+    <enum name="NUM_COMPATIBLE_SUBROUTINES" value="0x8E4A"/>
+    <enum name="COMPATIBLE_SUBROUTINES" value="0x8E4B"/>
+
+    <!-- UNIFORM_SIZE, UNIFORM_NAME_LENGTH already in GL3.1 -->
+
+</category>
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 170898c60ea..7d9d1a61215 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -152,6 +152,7 @@ API_XML = \
 	ARB_separate_shader_objects.xml \
 	ARB_shader_atomic_counters.xml \
 	ARB_shader_image_load_store.xml \
+	ARB_shader_subroutine.xml \
 	ARB_sync.xml \
 	ARB_texture_barrier.xml \
 	ARB_texture_buffer_object.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 4bcce3c3240..658efa485f6 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8072,7 +8072,9 @@
 
 <xi:include href="ARB_vertex_type_2_10_10_10_rev.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions #86...#90 -->
+<!-- ARB extensions #86...#89 -->
+
+<xi:include href="ARB_shader_subroutine.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <xi:include href="ARB_tessellation_shader.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 

From 4c7b007104c63475ec080d0777a41603c78786f6 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 20:43:45 +1200
Subject: [PATCH 0574/1208] mesa: Add extension tracking for
 arb_shader_subroutine (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: [airlied]: merge version check update.

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/extensions.c | 1 +
 src/mesa/main/mtypes.h     | 1 +
 src/mesa/main/version.c    | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 7deb823e1c4..2dbfabdc7b5 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -156,6 +156,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_shader_precision",                    o(ARB_shader_precision),                    GL,             2010 },
    { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
    { "GL_ARB_shader_storage_buffer_object",        o(ARB_shader_storage_buffer_object),        GL,             2012 },
+   { "GL_ARB_shader_subroutine",                   o(ARB_shader_subroutine),                   GLC,            2010 },
    { "GL_ARB_shader_texture_lod",                  o(ARB_shader_texture_lod),                  GL,             2009 },
    { "GL_ARB_shading_language_100",                o(dummy_true),                              GLL,            2003 },
    { "GL_ARB_shading_language_packing",            o(ARB_shading_language_packing),            GL,             2011 },
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index b13271bbd06..c5673e268db 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3832,6 +3832,7 @@ struct gl_extensions
    GLboolean ARB_shader_precision;
    GLboolean ARB_shader_stencil_export;
    GLboolean ARB_shader_storage_buffer_object;
+   GLboolean ARB_shader_subroutine;
    GLboolean ARB_shader_texture_lod;
    GLboolean ARB_shading_language_packing;
    GLboolean ARB_shading_language_420pack;
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 8bc00ace5c4..fd7ae53ccbd 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -309,7 +309,7 @@ compute_version(const struct gl_extensions *extensions,
                          extensions->ARB_gpu_shader5 &&
                          extensions->ARB_gpu_shader_fp64 &&
                          extensions->ARB_sample_shading &&
-                         false /*extensions->ARB_shader_subroutine*/ &&
+                         extensions->ARB_shader_subroutine &&
                          extensions->ARB_tessellation_shader &&
                          extensions->ARB_texture_buffer_object_rgb32 &&
                          extensions->ARB_texture_cube_map_array &&

From 25d6f56c08801909e784f81e9b9ced48977630f4 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 21:07:33 +1200
Subject: [PATCH 0575/1208] mesa: Add glGet support for ARB_shader_subroutine
 implementation limits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/config.h               | 6 ++++++
 src/mesa/main/get.c                  | 1 +
 src/mesa/main/get_hash_params.py     | 4 ++++
 src/mesa/main/tests/enum_strings.cpp | 9 +++++++++
 4 files changed, 20 insertions(+)

diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 66eb6cb206d..b35031db3c9 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -274,6 +274,12 @@
 #define MAX_VERTEX_STREAMS                  4
 /*@}*/
 
+/** For GL_ARB_shader_subroutine */
+/*@{*/
+#define MAX_SUBROUTINES                   256
+#define MAX_SUBROUTINE_UNIFORM_LOCATIONS  1024
+/*@}*/
+
 /** For GL_INTEL_performance_query */
 /*@{*/
 #define MAX_PERFQUERY_QUERY_NAME_LENGTH     256
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 590fe284f4e..ce786915508 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -414,6 +414,7 @@ EXTRA_EXT(ARB_clip_control);
 EXTRA_EXT(EXT_polygon_offset_clamp);
 EXTRA_EXT(ARB_framebuffer_no_attachments);
 EXTRA_EXT(ARB_tessellation_shader);
+EXTRA_EXT(ARB_shader_subroutine);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 2cf06d64c9f..4137e7f33c5 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -852,6 +852,10 @@ descriptor=[
   [ "MAX_TESS_EVALUATION_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ],
   [ "MAX_TESS_CONTROL_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
   [ "MAX_TESS_EVALUATION_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
+
+# GL_ARB_shader_subroutine
+  [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
+  [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
 ]}
 
 ]
diff --git a/src/mesa/main/tests/enum_strings.cpp b/src/mesa/main/tests/enum_strings.cpp
index 84c11954f44..8218cc9a685 100644
--- a/src/mesa/main/tests/enum_strings.cpp
+++ b/src/mesa/main/tests/enum_strings.cpp
@@ -1731,6 +1731,10 @@ const struct enum_info everything[] = {
    { 0x8DDF, "GL_MAX_GEOMETRY_UNIFORM_COMPONENTS" },
    { 0x8DE0, "GL_MAX_GEOMETRY_OUTPUT_VERTICES" },
    { 0x8DE1, "GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS" },
+   { 0x8DE5, "GL_ACTIVE_SUBROUTINES" },
+   { 0x8DE6, "GL_ACTIVE_SUBROUTINE_UNIFORMS" },
+   { 0x8DE7, "GL_MAX_SUBROUTINES" },
+   { 0x8DE8, "GL_MAX_SUBROUTINE_UNIFORM_LOCATIONS" },
    { 0x8DF0, "GL_LOW_FLOAT" },
    { 0x8DF1, "GL_MEDIUM_FLOAT" },
    { 0x8DF2, "GL_HIGH_FLOAT" },
@@ -1759,6 +1763,11 @@ const struct enum_info everything[] = {
    { 0x8E44, "GL_TEXTURE_SWIZZLE_B" },
    { 0x8E45, "GL_TEXTURE_SWIZZLE_A" },
    { 0x8E46, "GL_TEXTURE_SWIZZLE_RGBA" },
+   { 0x8E47, "GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS" },
+   { 0x8E48, "GL_ACTIVE_SUBROUTINE_MAX_LENGTH" },
+   { 0x8E49, "GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH" },
+   { 0x8E4A, "GL_NUM_COMPATIBLE_SUBROUTINES" },
+   { 0x8E4B, "GL_COMPATIBLE_SUBROUTINES" },
    { 0x8E4C, "GL_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION" },
    { 0x8E4D, "GL_FIRST_VERTEX_CONVENTION" },
    { 0x8E4E, "GL_LAST_VERTEX_CONVENTION" },

From cc172fddf3fc37991c6d85f2d8e4f6dc63a62809 Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 21:31:06 +1200
Subject: [PATCH 0576/1208] glsl: Add extension plumbing and define for
 ARB_shader_subroutine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/glcpp/glcpp-parse.y        | 3 +++
 src/glsl/glsl_parser_extras.cpp     | 1 +
 src/glsl/glsl_parser_extras.h       | 2 ++
 src/glsl/standalone_scaffolding.cpp | 1 +
 4 files changed, 7 insertions(+)

diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 4236fb319d3..7ef4dfd5316 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2489,6 +2489,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
 	      if (extensions->ARB_tessellation_shader)
 	         add_builtin_define(parser, "GL_ARB_tessellation_shader", 1);
+
+              if (extensions->ARB_shader_subroutine)
+                 add_builtin_define(parser, "GL_ARB_shader_subroutine", 1);
 	   }
 	}
 
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 43a3cd13f26..8bc690df6ff 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -598,6 +598,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
    EXT(ARB_shader_stencil_export,        true,  false,     ARB_shader_stencil_export),
    EXT(ARB_shader_storage_buffer_object, true,  false,     ARB_shader_storage_buffer_object),
+   EXT(ARB_shader_subroutine,            true,  false,     ARB_shader_subroutine),
    EXT(ARB_shader_texture_lod,           true,  false,     ARB_shader_texture_lod),
    EXT(ARB_shading_language_420pack,     true,  false,     ARB_shading_language_420pack),
    EXT(ARB_shading_language_packing,     true,  false,     ARB_shading_language_packing),
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index f4b60afcb37..564820183f5 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -496,6 +496,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_stencil_export_warn;
    bool ARB_shader_storage_buffer_object_enable;
    bool ARB_shader_storage_buffer_object_warn;
+   bool ARB_shader_subroutine_enable;
+   bool ARB_shader_subroutine_warn;
    bool ARB_shader_texture_lod_enable;
    bool ARB_shader_texture_lod_warn;
    bool ARB_shading_language_420pack_enable;
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 057ca6ee8a7..6033364afc5 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -133,6 +133,7 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.ARB_sample_shading = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
    ctx->Extensions.ARB_shader_stencil_export = true;
+   ctx->Extensions.ARB_shader_subroutine = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shading_language_420pack = true;
    ctx->Extensions.ARB_shading_language_packing = true;

From 24b0e5068348aacabbd3e0012de95d34866e4b99 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 10:49:12 +1000
Subject: [PATCH 0577/1208] dispatch_sanity: add shader subroutine to fix make
 check

Add the shader subroutine to the core only API list,
and fixup dispatch_sanity to suit.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mapi/glapi/gen/apiexec.py           | 11 +++++++++++
 src/mesa/main/tests/dispatch_sanity.cpp | 18 ++++++++++--------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index b623b44beeb..66bc79ac405 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -74,6 +74,17 @@ functions = {
     # GL_ARB_geometry_shader4, so OpenGL 3.2 is required.
     "FramebufferTexture": exec_info(core=32),
 
+    # OpenGL 4.0 / GL_ARB_shader_subroutines. Mesa only exposes this
+    # extension with core profile.
+    "glGetSubroutineUniformLocation": exec_info(core=31),
+    "glGetSubroutineIndex": exec_info(core=31),
+    "glGetActiveSubroutineUniformiv": exec_info(core=31),
+    "glGetActiveSubroutineUniformName": exec_info(core=31),
+    "glGetActiveSubroutineName": exec_info(core=31),
+    "glUniformSubroutinesuiv": exec_info(core=31),
+    "glGetUniformSubroutineuiv": exec_info(core=31),
+    "glGetProgramStageiv": exec_info(core=31),
+
     # OpenGL 4.0 / GL_ARB_gpu_shader_fp64.  The extension spec says:
     #
     #     "OpenGL 3.2 and GLSL 1.50 are required."
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index d1bd0cee61e..af89d2c1cfb 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -1431,6 +1431,16 @@ const struct function gl_core_functions_possible[] = {
    /* GL 3.2 */
    { "glFramebufferTexture", 32, -1 },
 
+   /* GL 4.0 */
+   { "glGetSubroutineUniformLocation", 40, -1 },
+   { "glGetSubroutineIndex", 40, -1 },
+   { "glGetActiveSubroutineUniformiv", 40, -1 },
+   { "glGetActiveSubroutineUniformName", 40, -1 },
+   { "glGetActiveSubroutineName", 40, -1 },
+   { "glUniformSubroutinesuiv", 40, -1 },
+   { "glGetUniformSubroutineuiv", 40, -1 },
+   { "glGetProgramStageiv", 40, -1 },
+
    /* GL 4.3 */
    { "glIsRenderbuffer", 43, -1 },
    { "glBindRenderbuffer", 43, -1 },
@@ -1569,14 +1579,6 @@ const struct function gl_core_functions_possible[] = {
    { "glUniformMatrix4x2dv", 40, -1 },
    { "glUniformMatrix4x3dv", 40, -1 },
    { "glGetUniformdv", 43, -1 },
-// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
-// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
-// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
-// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
-// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
-// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
-// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
-// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
 
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },

From d16ff8ac783874c8ee74ef796b1c853829ff237d Mon Sep 17 00:00:00 2001
From: Chris Forbes <chrisf@ijw.co.nz>
Date: Sun, 10 Aug 2014 21:38:23 +1200
Subject: [PATCH 0578/1208] glsl: Make `subroutine` a reserved keyword
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/glsl_lexer.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index fe0083fe745..1cd1ffb98c6 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -579,7 +579,7 @@ usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
 sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
-subroutine	KEYWORD(0, 300, 0, 0, SUBROUTINE);
+subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);
 
 
 [_a-zA-Z][_a-zA-Z0-9]*	{

From 57f24299b7fe0f7b20c2a3cf1e94c747825b568d Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:16:55 +1000
Subject: [PATCH 0579/1208] glsl/types: add new subroutine type (v3.2)

This type will be used to store the name of subroutine types

as in subroutine void myfunc(void);
will store myfunc into a subroutine type.

This is required to the parser can identify a subroutine
type in a uniform decleration as a valid type, and also for
looking up the type later.

Also add contains_subroutine method.

v2: handle subroutine to int comparisons, needed
for lowering pass.
v3: do subroutine to int with it's own IR
operation to avoid hacking on asserts (Kayden)
v3.1: fix warnings in this patch, fix nir,
fix tgsi
v3.2: fixup tests

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>

tests: fix warnings
---
 src/glsl/ast_to_hir.cpp                      |  1 +
 src/glsl/glsl_types.cpp                      | 66 ++++++++++++++++++++
 src/glsl/glsl_types.h                        | 19 ++++++
 src/glsl/ir.cpp                              |  2 +
 src/glsl/ir.h                                |  1 +
 src/glsl/ir_builder.cpp                      |  6 ++
 src/glsl/ir_builder.h                        |  1 +
 src/glsl/ir_clone.cpp                        |  1 +
 src/glsl/ir_validate.cpp                     |  4 ++
 src/glsl/link_uniform_initializers.cpp       |  1 +
 src/glsl/nir/nir_lower_io.c                  |  2 +
 src/glsl/tests/uniform_initializer_utils.cpp |  3 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp   |  4 ++
 13 files changed, 111 insertions(+)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index a45b6e8f026..3b836f8d33c 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -1019,6 +1019,7 @@ do_comparison(void *mem_ctx, int operation, ir_rvalue *op0, ir_rvalue *op1)
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_ATOMIC_UINT:
+   case GLSL_TYPE_SUBROUTINE:
       /* I assume a comparison of a struct containing a sampler just
        * ignores the sampler present in the type.
        */
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index cd6df5adad9..8510671d2f3 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -32,6 +32,7 @@ mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
 hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
 hash_table *glsl_type::interface_types = NULL;
+hash_table *glsl_type::subroutine_types = NULL;
 void *glsl_type::mem_ctx = NULL;
 
 void
@@ -161,6 +162,22 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
    mtx_unlock(&glsl_type::mutex);
 }
 
+glsl_type::glsl_type(const char *subroutine_name) :
+   gl_type(0),
+   base_type(GLSL_TYPE_SUBROUTINE),
+   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
+   sampler_type(0), interface_packing(0),
+   vector_elements(0), matrix_columns(0),
+   length(0)
+{
+   mtx_lock(&glsl_type::mutex);
+
+   init_ralloc_type_ctx();
+   assert(subroutine_name != NULL);
+   this->name = ralloc_strdup(this->mem_ctx, subroutine_name);
+   this->vector_elements = 1;
+   mtx_unlock(&glsl_type::mutex);
+}
 
 bool
 glsl_type::contains_sampler() const
@@ -231,6 +248,22 @@ glsl_type::contains_opaque() const {
    }
 }
 
+bool
+glsl_type::contains_subroutine() const
+{
+   if (this->is_array()) {
+      return this->fields.array->contains_subroutine();
+   } else if (this->is_record()) {
+      for (unsigned int i = 0; i < this->length; i++) {
+	 if (this->fields.structure[i].type->contains_subroutine())
+	    return true;
+      }
+      return false;
+   } else {
+      return this->is_subroutine();
+   }
+}
+
 gl_texture_index
 glsl_type::sampler_index() const
 {
@@ -836,6 +869,36 @@ glsl_type::get_interface_instance(const glsl_struct_field *fields,
    return (glsl_type *) entry->data;
 }
 
+const glsl_type *
+glsl_type::get_subroutine_instance(const char *subroutine_name)
+{
+   const glsl_type key(subroutine_name);
+
+   mtx_lock(&glsl_type::mutex);
+
+   if (subroutine_types == NULL) {
+      subroutine_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                                 record_key_compare);
+   }
+
+   const struct hash_entry *entry = _mesa_hash_table_search(subroutine_types,
+                                                            &key);
+   if (entry == NULL) {
+      mtx_unlock(&glsl_type::mutex);
+      const glsl_type *t = new glsl_type(subroutine_name);
+      mtx_lock(&glsl_type::mutex);
+
+      entry = _mesa_hash_table_insert(subroutine_types, t, (void *) t);
+   }
+
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_SUBROUTINE);
+   assert(strcmp(((glsl_type *) entry->data)->name, subroutine_name) == 0);
+
+   mtx_unlock(&glsl_type::mutex);
+
+   return (glsl_type *) entry->data;
+}
+
 
 const glsl_type *
 glsl_type::get_mul_type(const glsl_type *type_a, const glsl_type *type_b)
@@ -968,6 +1031,7 @@ glsl_type::component_slots() const
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
+   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_ERROR:
       break;
    }
@@ -988,6 +1052,7 @@ glsl_type::uniform_locations() const
    case GLSL_TYPE_BOOL:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       return 1;
 
    case GLSL_TYPE_STRUCT:
@@ -1341,6 +1406,7 @@ glsl_type::count_attribute_slots() const
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
+   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_ERROR:
       break;
    }
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index dff6a9a9d05..52672b313c0 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -59,6 +59,7 @@ enum glsl_base_type {
    GLSL_TYPE_INTERFACE,
    GLSL_TYPE_ARRAY,
    GLSL_TYPE_VOID,
+   GLSL_TYPE_SUBROUTINE,
    GLSL_TYPE_ERROR
 };
 
@@ -263,6 +264,11 @@ struct glsl_type {
 						  enum glsl_interface_packing packing,
 						  const char *block_name);
 
+   /**
+    * Get the instance of an subroutine type
+    */
+   static const glsl_type *get_subroutine_instance(const char *subroutine_name);
+
    /**
     * Get the type resulting from a multiplication of \p type_a * \p type_b
     */
@@ -514,6 +520,13 @@ struct glsl_type {
    /**
     * Query if a type is unnamed/anonymous (named by the parser)
     */
+
+   bool is_subroutine() const
+   {
+      return base_type == GLSL_TYPE_SUBROUTINE;
+   }
+   bool contains_subroutine() const;
+
    bool is_anonymous() const
    {
       return !strncmp(name, "#anon", 5);
@@ -679,6 +692,9 @@ private:
    /** Constructor for array types */
    glsl_type(const glsl_type *array, unsigned length);
 
+   /** Constructor for subroutine types */
+   glsl_type(const char *name);
+
    /** Hash table containing the known array types. */
    static struct hash_table *array_types;
 
@@ -688,6 +704,9 @@ private:
    /** Hash table containing the known interface types. */
    static struct hash_table *interface_types;
 
+   /** Hash table containing the known subroutine types. */
+   static struct hash_table *subroutine_types;
+
    static bool record_key_compare(const void *a, const void *b);
    static unsigned record_key_hash(const void *key);
 
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 8fb7ca40cf1..fa1d85a5026 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -260,6 +260,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
    case ir_unop_bit_count:
    case ir_unop_find_msb:
    case ir_unop_find_lsb:
+   case ir_unop_subroutine_to_int:
       this->type = glsl_type::get_instance(GLSL_TYPE_INT,
 					   op0->type->vector_elements, 1);
       break;
@@ -568,6 +569,7 @@ static const char *const operator_strs[] = {
    "frexp_sig",
    "frexp_exp",
    "noise",
+   "subroutine_to_int",
    "interpolate_at_centroid",
    "+",
    "-",
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 98951221b02..8294b8a19bd 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1385,6 +1385,7 @@ enum ir_expression_operation {
 
    ir_unop_noise,
 
+   ir_unop_subroutine_to_int,
    /**
     * Interpolate fs input at centroid
     *
diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp
index e44b05c991c..cd03859cac0 100644
--- a/src/glsl/ir_builder.cpp
+++ b/src/glsl/ir_builder.cpp
@@ -338,6 +338,12 @@ sign(operand a)
    return expr(ir_unop_sign, a);
 }
 
+ir_expression *
+subr_to_int(operand a)
+{
+   return expr(ir_unop_subroutine_to_int, a);
+}
+
 ir_expression*
 equal(operand a, operand b)
 {
diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h
index 87026588113..f76453ffcf0 100644
--- a/src/glsl/ir_builder.h
+++ b/src/glsl/ir_builder.h
@@ -153,6 +153,7 @@ ir_expression *sqrt(operand a);
 ir_expression *log(operand a);
 ir_expression *sign(operand a);
 
+ir_expression *subr_to_int(operand a);
 ir_expression *equal(operand a, operand b);
 ir_expression *nequal(operand a, operand b);
 ir_expression *less(operand a, operand b);
diff --git a/src/glsl/ir_clone.cpp b/src/glsl/ir_clone.cpp
index 914e0e4d540..49834ffbabf 100644
--- a/src/glsl/ir_clone.cpp
+++ b/src/glsl/ir_clone.cpp
@@ -362,6 +362,7 @@ ir_constant::clone(void *mem_ctx, struct hash_table *ht) const
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_INTERFACE:
       assert(!"Should not get here.");
       break;
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index 684bef28035..3f0dea74e27 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -448,6 +448,10 @@ ir_validate::visit_leave(ir_expression *ir)
              ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
+   case ir_unop_subroutine_to_int:
+      assert(ir->operands[0]->type->base_type == GLSL_TYPE_SUBROUTINE);
+      assert(ir->type->base_type == GLSL_TYPE_INT);
+      break;
    case ir_binop_add:
    case ir_binop_sub:
    case ir_binop_mul:
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 6322a2d52cc..d61ae91f4ad 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -89,6 +89,7 @@ copy_constant_to_storage(union gl_constant_value *storage,
       case GLSL_TYPE_ATOMIC_UINT:
       case GLSL_TYPE_INTERFACE:
       case GLSL_TYPE_VOID:
+      case GLSL_TYPE_SUBROUTINE:
       case GLSL_TYPE_ERROR:
 	 /* All other types should have already been filtered by other
 	  * paths in the caller.
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index a9dd77691b2..f6e5e2beda4 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -59,6 +59,8 @@ type_size(const struct glsl_type *type)
          size += type_size(glsl_get_struct_field(type, i));
       }
       return size;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
    case GLSL_TYPE_SAMPLER:
       return 0;
    case GLSL_TYPE_ATOMIC_UINT:
diff --git a/src/glsl/tests/uniform_initializer_utils.cpp b/src/glsl/tests/uniform_initializer_utils.cpp
index b90bdcaed3b..5006387036f 100644
--- a/src/glsl/tests/uniform_initializer_utils.cpp
+++ b/src/glsl/tests/uniform_initializer_utils.cpp
@@ -102,6 +102,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type,
       case GLSL_TYPE_VOID:
       case GLSL_TYPE_ERROR:
       case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_SUBROUTINE:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -134,6 +135,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type,
       case GLSL_TYPE_VOID:
       case GLSL_TYPE_ERROR:
       case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_SUBROUTINE:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -238,6 +240,7 @@ verify_data(gl_constant_value *storage, unsigned storage_array_size,
 	 case GLSL_TYPE_VOID:
 	 case GLSL_TYPE_ERROR:
 	 case GLSL_TYPE_INTERFACE:
+	 case GLSL_TYPE_SUBROUTINE:
 	    ASSERT_TRUE(false);
 	    break;
 	 }
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b0fc6254edb..9553791b07a 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1108,6 +1108,7 @@ type_size(const struct glsl_type *type)
       return size;
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       /* Samplers take up one slot in UNIFORMS[], but they're baked in
        * at link time.
        */
@@ -1488,6 +1489,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          result_src = op[0];
       }
       break;
+   case ir_unop_subroutine_to_int:
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      break;
    case ir_unop_abs:
       emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;

From f73ef824869dbb1f91c32ad563c95ca917f40c12 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 21 Jul 2015 14:52:40 +1000
Subject: [PATCH 0580/1208] glsl: don't eliminate subroutine types.

This stops dead code from removing subroutines types,
we need these for the queries to work properly.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/opt_dead_code.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp
index 04e4d5673d2..e4bf874700c 100644
--- a/src/glsl/opt_dead_code.cpp
+++ b/src/glsl/opt_dead_code.cpp
@@ -126,6 +126,9 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
                if (block_type->interface_packing != GLSL_INTERFACE_PACKING_PACKED)
                   continue;
             }
+
+            if (entry->var->type->is_subroutine())
+               continue;
          }
 
 	 entry->var->remove();

From 670b9e56da588581c90d6c68f0a55ecd9153504d Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:20:06 +1000
Subject: [PATCH 0581/1208] mesa: add inline conversion functions for
 ARB_shader_subroutine (v2)

This handles converting the shader stages to the internal
prefix along with the program resource interfaces.

v2: add tess support

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/shaderobj.h | 101 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index 75f019dbd3d..943044e37cd 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -123,6 +123,107 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
    }
 }
 
+/* 8 bytes + another underscore */
+#define MESA_SUBROUTINE_PREFIX_LEN 9
+static inline const char *
+_mesa_shader_stage_to_subroutine_prefix(gl_shader_stage stage)
+{
+  switch (stage) {
+  case MESA_SHADER_VERTEX:
+    return "__subu_v";
+  case MESA_SHADER_GEOMETRY:
+    return "__subu_g";
+  case MESA_SHADER_FRAGMENT:
+    return "__subu_f";
+  case MESA_SHADER_COMPUTE:
+    return "__subu_c";
+  case MESA_SHADER_TESS_CTRL:
+    return "__subu_t";
+  case MESA_SHADER_TESS_EVAL:
+    return "__subu_e";
+  default:
+    return NULL;
+  }
+}
+
+static inline gl_shader_stage
+_mesa_shader_stage_from_subroutine_uniform(GLenum subuniform)
+{
+   switch (subuniform) {
+   default:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_VERTEX;
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_GEOMETRY;
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_FRAGMENT;
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_COMPUTE;
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_TESS_EVAL;
+   }
+}
+
+static inline gl_shader_stage
+_mesa_shader_stage_from_subroutine(GLenum subroutine)
+{
+   switch (subroutine) {
+   case GL_VERTEX_SUBROUTINE:
+      return MESA_SHADER_VERTEX;
+   case GL_GEOMETRY_SUBROUTINE:
+      return MESA_SHADER_GEOMETRY;
+   case GL_FRAGMENT_SUBROUTINE:
+      return MESA_SHADER_FRAGMENT;
+   case GL_COMPUTE_SUBROUTINE:
+      return MESA_SHADER_COMPUTE;
+   case GL_TESS_CONTROL_SUBROUTINE:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SUBROUTINE:
+      return MESA_SHADER_TESS_EVAL;
+   }
+}
+
+static inline GLenum
+_mesa_shader_stage_to_subroutine(gl_shader_stage stage)
+{
+   switch (stage) {
+   default:
+   case MESA_SHADER_VERTEX:
+      return GL_VERTEX_SUBROUTINE;
+   case MESA_SHADER_GEOMETRY:
+      return GL_GEOMETRY_SUBROUTINE;
+   case MESA_SHADER_FRAGMENT:
+      return GL_FRAGMENT_SUBROUTINE;
+   case MESA_SHADER_COMPUTE:
+      return GL_COMPUTE_SUBROUTINE;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_SUBROUTINE;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_SUBROUTINE;
+   }
+}
+
+static inline GLenum
+_mesa_shader_stage_to_subroutine_uniform(gl_shader_stage stage)
+{
+   switch (stage) {
+   default:
+   case MESA_SHADER_VERTEX:
+      return GL_VERTEX_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_GEOMETRY:
+      return GL_GEOMETRY_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_FRAGMENT:
+      return GL_FRAGMENT_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_COMPUTE:
+      return GL_COMPUTE_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_SUBROUTINE_UNIFORM;
+   }
+}
 
 #ifdef __cplusplus
 }

From d8a250ce5edc3da092ede6d62d433fbb37aa6cf6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 10:23:36 +1000
Subject: [PATCH 0582/1208] mesa: add function to check if shader subroutines
 are enabled.

This checks if core profile and shader subroutine extension
is enabled.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/context.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 7d256b18d47..0f7529ad975 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -343,6 +343,15 @@ _mesa_has_compute_shaders(const struct gl_context *ctx)
       (ctx->API == API_OPENGLES2 && ctx->Version >= 31);
 }
 
+/**
+ * Checks if the context supports shader subroutines.
+ */
+static inline bool
+_mesa_has_shader_subroutine(const struct gl_context *ctx)
+{
+   return ctx->API == API_OPENGL_CORE &&
+      (ctx->Version >= 40 || ctx->Extensions.ARB_shader_subroutine);
+}
 
 /**
  * Checks if the context supports tessellation.

From 30681c3bb80ad78392f1740aa915efa072c837e8 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:22:57 +1000
Subject: [PATCH 0583/1208] glsl/ir: add subroutine information storage to
 ir_function (v1.1)

We need to store two sets of info into the ir_function,
if this is a function definition with a subroutine list
(subroutine_def) or if it a subroutine prototype.

v1.1: add some more documentation.

Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ir.cpp               |  4 ++++
 src/glsl/ir.h                 | 15 +++++++++++++++
 src/glsl/ir_clone.cpp         |  6 ++++++
 src/glsl/ir_print_visitor.cpp |  2 +-
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index fa1d85a5026..3a40926e911 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1855,6 +1855,7 @@ static void
 steal_memory(ir_instruction *ir, void *new_ctx)
 {
    ir_variable *var = ir->as_variable();
+   ir_function *fn = ir->as_function();
    ir_constant *constant = ir->as_constant();
    if (var != NULL && var->constant_value != NULL)
       steal_memory(var->constant_value, ir);
@@ -1862,6 +1863,9 @@ steal_memory(ir_instruction *ir, void *new_ctx)
    if (var != NULL && var->constant_initializer != NULL)
       steal_memory(var->constant_initializer, ir);
 
+   if (fn != NULL && fn->subroutine_types)
+      ralloc_steal(new_ctx, fn->subroutine_types);
+
    /* The components of aggregate constants are not visited by the normal
     * visitor, so steal their values by hand.
     */
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 8294b8a19bd..da867427e1a 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1126,6 +1126,21 @@ public:
     * List of ir_function_signature for each overloaded function with this name.
     */
    struct exec_list signatures;
+
+   /**
+    * is this function a subroutine type declaration
+    * e.g. subroutine void type1(float arg1);
+    */
+   bool is_subroutine;
+
+   /**
+    * is this function associated to a subroutine type
+    * e.g. subroutine (type1, type2) function_name { function_body };
+    * would have num_subroutine_types 2,
+    * and pointers to the type1 and type2 types.
+    */
+   int num_subroutine_types;
+   const struct glsl_type **subroutine_types;
 };
 
 inline const char *ir_function_signature::function_name() const
diff --git a/src/glsl/ir_clone.cpp b/src/glsl/ir_clone.cpp
index 49834ffbabf..a8fac183a8d 100644
--- a/src/glsl/ir_clone.cpp
+++ b/src/glsl/ir_clone.cpp
@@ -267,6 +267,12 @@ ir_function::clone(void *mem_ctx, struct hash_table *ht) const
 {
    ir_function *copy = new(mem_ctx) ir_function(this->name);
 
+   copy->is_subroutine = this->is_subroutine;
+   copy->num_subroutine_types = this->num_subroutine_types;
+   copy->subroutine_types = ralloc_array(mem_ctx, const struct glsl_type *, copy->num_subroutine_types);
+   for (int i = 0; i < copy->num_subroutine_types; i++)
+     copy->subroutine_types[i] = this->subroutine_types[i];
+
    foreach_in_list(const ir_function_signature, sig, &this->signatures) {
       ir_function_signature *sig_copy = sig->clone(mem_ctx, ht);
       copy->add_signature(sig_copy);
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 428fc310373..78475f39b78 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -231,7 +231,7 @@ void ir_print_visitor::visit(ir_function_signature *ir)
 
 void ir_print_visitor::visit(ir_function *ir)
 {
-   fprintf(f, "(function %s\n", ir->name);
+   fprintf(f, "(%s function %s\n", ir->is_subroutine ? "subroutine" : "", ir->name);
    indentation++;
    foreach_in_list(ir_function_signature, sig, &ir->signatures) {
       indent();

From 884df9ef834d6b77226d0dfd778c5317365a2394 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 24 Apr 2015 10:47:03 +1000
Subject: [PATCH 0584/1208] glsl/ir: allow ir_call to handle subroutine calling

This adds a ir_variable which contains the subroutine uniform
and an array rvalue for the deref of that uniform, these
are stored in the ir_call and lowered later.

Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ir.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index da867427e1a..ede8caa6e47 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1712,7 +1712,18 @@ public:
    ir_call(ir_function_signature *callee,
 	   ir_dereference_variable *return_deref,
 	   exec_list *actual_parameters)
-      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee)
+      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee), sub_var(NULL), array_idx(NULL)
+   {
+      assert(callee->return_type != NULL);
+      actual_parameters->move_nodes_to(& this->actual_parameters);
+      this->use_builtin = callee->is_builtin();
+   }
+
+   ir_call(ir_function_signature *callee,
+	   ir_dereference_variable *return_deref,
+	   exec_list *actual_parameters,
+	   ir_variable *var, ir_rvalue *array_idx)
+      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee), sub_var(var), array_idx(array_idx)
    {
       assert(callee->return_type != NULL);
       actual_parameters->move_nodes_to(& this->actual_parameters);
@@ -1760,6 +1771,14 @@ public:
 
    /** Should this call only bind to a built-in function? */
    bool use_builtin;
+
+   /*
+    * ARB_shader_subroutine support -
+    * the subroutine uniform variable and array index
+    * rvalue to be used in the lowering pass later.
+    */
+   ir_variable *sub_var;
+   ir_rvalue *array_idx;
 };
 
 

From 65ac360823ee12ac2d1f3bb6758d352fcd0d9210 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 1 Jun 2015 10:55:47 +1000
Subject: [PATCH 0585/1208] glsl: add ast/parser support for subroutine parsing
 storage (v3.2)

This is the guts of the GLSL parser and AST support for
shader subroutines.

The code creates a subroutine type in the parser, and
uses that there to validate the identifiers. The parser
also distinguishes between subroutine types/function prototypes
/uniforms and subroutine defintions for functions.

Then in the AST conversion it recreates the types, and
stores the subroutine definition info or subroutine info
into the ir_function along with a side lookup table in
the parser state. It also converts subroutine calls into
the enhanced ir_call.

v2: move to handling method calls in
function handling not in field selection.
v3: merge Chris's previous parser patches in here, to
make it clearer what's changed in one place.
v3.1: add more documentation, drop unused include
v3.2: drop is_subroutine_def

Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ast.h                   |  17 +++++
 src/glsl/ast_function.cpp        | 120 +++++++++++++++++++++++++++++--
 src/glsl/ast_to_hir.cpp          |  96 ++++++++++++++++++++++---
 src/glsl/ast_type.cpp            |   7 +-
 src/glsl/glsl_lexer.ll           |   8 +++
 src/glsl/glsl_parser.yy          | 114 ++++++++++++++---------------
 src/glsl/glsl_parser_extras.cpp  |  22 ++++++
 src/glsl/glsl_parser_extras.h    |  19 +++++
 src/glsl/hir_field_selection.cpp |  39 ----------
 9 files changed, 326 insertions(+), 116 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index 3d0e173c737..d8c6cea7832 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -304,6 +304,16 @@ private:
     * Is this function call actually a constructor?
     */
    bool cons;
+   ir_rvalue *
+   handle_method(exec_list *instructions,
+                 struct _mesa_glsl_parse_state *state);
+};
+
+class ast_subroutine_list : public ast_node
+{
+public:
+   virtual void print(void) const;
+   exec_list declarations;
 };
 
 class ast_array_specifier : public ast_node {
@@ -527,6 +537,12 @@ struct ast_type_qualifier {
 	 /* tess control output layout */
 	 unsigned vertices:1;
 	 /** \} */
+
+         /** \name Qualifiers for GL_ARB_shader_subroutine */
+	 /** \{ */
+         unsigned subroutine:1;  /**< Is this marked 'subroutine' */
+         unsigned subroutine_def:1; /**< Is this marked 'subroutine' with a list of types */
+	 /** \} */
       }
       /** \brief Set of flags, accessed by name. */
       q;
@@ -669,6 +685,7 @@ struct ast_type_qualifier {
                            ast_type_qualifier q,
                            ast_node* &node);
 
+   ast_subroutine_list *subroutine_list;
 };
 
 class ast_declarator_list;
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 6749e998e5a..803edf5a14d 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -26,6 +26,7 @@
 #include "glsl_types.h"
 #include "ir.h"
 #include "main/core.h" /* for MIN2 */
+#include "main/shaderobj.h"
 
 static ir_rvalue *
 convert_component(ir_rvalue *src, const glsl_type *desired_type);
@@ -355,6 +356,8 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
 static ir_rvalue *
 generate_call(exec_list *instructions, ir_function_signature *sig,
 	      exec_list *actual_parameters,
+              ir_variable *sub_var,
+	      ir_rvalue *array_idx,
 	      struct _mesa_glsl_parse_state *state)
 {
    void *ctx = state;
@@ -421,7 +424,8 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
 
       deref = new(ctx) ir_dereference_variable(var);
    }
-   ir_call *call = new(ctx) ir_call(sig, deref, actual_parameters);
+
+   ir_call *call = new(ctx) ir_call(sig, deref, actual_parameters, sub_var, array_idx);
    instructions->push_tail(call);
 
    /* Also emit any necessary out-parameter conversions. */
@@ -489,6 +493,40 @@ done:
    return sig;
 }
 
+static ir_function_signature *
+match_subroutine_by_name(const char *name,
+                         exec_list *actual_parameters,
+                         struct _mesa_glsl_parse_state *state,
+                         ir_variable **var_r)
+{
+   void *ctx = state;
+   ir_function_signature *sig = NULL;
+   ir_function *f, *found = NULL;
+   const char *new_name;
+   ir_variable *var;
+   bool is_exact = false;
+
+   new_name = ralloc_asprintf(ctx, "%s_%s", _mesa_shader_stage_to_subroutine_prefix(state->stage), name);
+   var = state->symbols->get_variable(new_name);
+   if (!var)
+      return NULL;
+
+   for (int i = 0; i < state->num_subroutine_types; i++) {
+      f = state->subroutine_types[i];
+      if (strcmp(f->name, var->type->without_array()->name))
+         continue;
+      found = f;
+      break;
+   }
+
+   if (!found)
+      return NULL;
+   *var_r = var;
+   sig = found->matching_signature(state, actual_parameters,
+                                  false, &is_exact);
+   return sig;
+}
+
 static void
 print_function_prototypes(_mesa_glsl_parse_state *state, YYLTYPE *loc,
                           ir_function *f)
@@ -1531,6 +1569,65 @@ process_record_constructor(exec_list *instructions,
                                              &actual_parameters, state);
 }
 
+ir_rvalue *
+ast_function_expression::handle_method(exec_list *instructions,
+                                       struct _mesa_glsl_parse_state *state)
+{
+   const ast_expression *field = subexpressions[0];
+   ir_rvalue *op;
+   ir_rvalue *result;
+   void *ctx = state;
+   /* Handle "method calls" in GLSL 1.20 - namely, array.length() */
+   YYLTYPE loc = get_location();
+   state->check_version(120, 300, &loc, "methods not supported");
+
+   const char *method;
+   method = field->primary_expression.identifier;
+
+   op = field->subexpressions[0]->hir(instructions, state);
+   if (strcmp(method, "length") == 0) {
+      if (!this->expressions.is_empty()) {
+         _mesa_glsl_error(&loc, state, "length method takes no arguments");
+         goto fail;
+      }
+
+      if (op->type->is_array()) {
+         if (op->type->is_unsized_array()) {
+            _mesa_glsl_error(&loc, state, "length called on unsized array");
+            goto fail;
+         }
+
+         result = new(ctx) ir_constant(op->type->array_size());
+      } else if (op->type->is_vector()) {
+         if (state->ARB_shading_language_420pack_enable) {
+            /* .length() returns int. */
+            result = new(ctx) ir_constant((int) op->type->vector_elements);
+         } else {
+            _mesa_glsl_error(&loc, state, "length method on matrix only available"
+                             "with ARB_shading_language_420pack");
+            goto fail;
+         }
+      } else if (op->type->is_matrix()) {
+         if (state->ARB_shading_language_420pack_enable) {
+            /* .length() returns int. */
+            result = new(ctx) ir_constant((int) op->type->matrix_columns);
+         } else {
+            _mesa_glsl_error(&loc, state, "length method on matrix only available"
+                             "with ARB_shading_language_420pack");
+            goto fail;
+         }
+      } else {
+         _mesa_glsl_error(&loc, state, "length called on scalar.");
+         goto fail;
+      }
+   } else {
+         _mesa_glsl_error(&loc, state, "unknown method: `%s'", method);
+         goto fail;
+   }
+   return result;
+fail:
+   return ir_rvalue::error_value(ctx);
+}
 
 ir_rvalue *
 ast_function_expression::hir(exec_list *instructions,
@@ -1543,8 +1640,6 @@ ast_function_expression::hir(exec_list *instructions,
     * 2. methods - Only the .length() method of array types.
     * 3. functions - Calls to regular old functions.
     *
-    * Method calls are actually detected when the ast_field_selection
-    * expression is handled.
     */
    if (is_constructor()) {
       const ast_type_specifier *type = (ast_type_specifier *) subexpressions[0];
@@ -1765,11 +1860,22 @@ ast_function_expression::hir(exec_list *instructions,
 					       &actual_parameters,
 					       ctx);
       }
+   } else if (subexpressions[0]->oper == ast_field_selection) {
+      return handle_method(instructions, state);
    } else {
       const ast_expression *id = subexpressions[0];
-      const char *func_name = id->primary_expression.identifier;
+      const char *func_name;
       YYLTYPE loc = get_location();
       exec_list actual_parameters;
+      ir_variable *sub_var = NULL;
+      ir_rvalue *array_idx = NULL;
+
+      if (id->oper == ast_array_index) {
+         func_name = id->subexpressions[0]->primary_expression.identifier;
+	 array_idx = id->subexpressions[1]->hir(instructions, state);
+      } else {
+         func_name = id->primary_expression.identifier;
+      }
 
       process_parameters(instructions, &actual_parameters, &this->expressions,
 			 state);
@@ -1778,6 +1884,10 @@ ast_function_expression::hir(exec_list *instructions,
 	 match_function_by_name(func_name, &actual_parameters, state);
 
       ir_rvalue *value = NULL;
+      if (sig == NULL) {
+         sig = match_subroutine_by_name(func_name, &actual_parameters, state, &sub_var);
+      }
+
       if (sig == NULL) {
 	 no_matching_function_error(func_name, &loc, &actual_parameters, state);
 	 value = ir_rvalue::error_value(ctx);
@@ -1785,7 +1895,7 @@ ast_function_expression::hir(exec_list *instructions,
 	 /* an error has already been emitted */
 	 value = ir_rvalue::error_value(ctx);
       } else {
-         value = generate_call(instructions, sig, &actual_parameters, state);
+         value = generate_call(instructions, sig, &actual_parameters, sub_var, array_idx, state);
          if (!value) {
             ir_variable *const tmp = new(ctx) ir_variable(glsl_type::void_type,
                                                           "void_var",
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 3b836f8d33c..68f71c6393e 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
 #include "ast.h"
 #include "glsl_types.h"
 #include "program/hash_table.h"
+#include "main/shaderobj.h"
 #include "ir.h"
 #include "ir_builder.h"
 
@@ -2522,6 +2523,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.subroutine && !qual->flags.q.uniform) {
+      _mesa_glsl_error(loc, state,
+                       "`subroutine' may only be applied to uniforms, "
+                       "subroutine type declarations, or function definitions");
+   }
+
    if (qual->flags.q.constant || qual->flags.q.attribute
        || qual->flags.q.uniform
        || (qual->flags.q.varying && (state->stage == MESA_SHADER_FRAGMENT)))
@@ -3597,7 +3604,7 @@ ast_declarator_list::hir(exec_list *instructions,
    foreach_list_typed (ast_declaration, decl, link, &this->declarations) {
       const struct glsl_type *var_type;
       ir_variable *var;
-
+      const char *identifier = decl->identifier;
       /* FINISHME: Emit a warning if a variable declaration shadows a
        * FINISHME: declaration at a higher scope.
        */
@@ -3615,10 +3622,24 @@ ast_declarator_list::hir(exec_list *instructions,
          continue;
       }
 
+      if (this->type->qualifier.flags.q.subroutine) {
+         const glsl_type *t;
+         const char *name;
+
+         t = state->symbols->get_type(this->type->specifier->type_name);
+         if (!t)
+            _mesa_glsl_error(& loc, state,
+                             "invalid type in declaration of `%s'",
+                             decl->identifier);
+         name = ralloc_asprintf(ctx, "%s_%s", _mesa_shader_stage_to_subroutine_prefix(state->stage), decl->identifier);
+
+         identifier = name;
+
+      }
       var_type = process_array_type(&loc, decl_type, decl->array_specifier,
                                     state);
 
-      var = new(ctx) ir_variable(var_type, decl->identifier, ir_var_auto);
+      var = new(ctx) ir_variable(var_type, identifier, ir_var_auto);
 
       /* The 'varying in' and 'varying out' qualifiers can only be used with
        * ARB_geometry_shader4 and EXT_geometry_shader4, which we don't support
@@ -3690,6 +3711,8 @@ ast_declarator_list::hir(exec_list *instructions,
           */
          if (this->type->qualifier.flags.q.attribute) {
             mode = "attribute";
+         } else if (this->type->qualifier.flags.q.subroutine) {
+            mode = "subroutine uniform";
          } else if (this->type->qualifier.flags.q.uniform) {
             mode = "uniform";
          } else if (this->type->qualifier.flags.q.varying) {
@@ -3930,6 +3953,9 @@ ast_declarator_list::hir(exec_list *instructions,
          if (state->stage == MESA_SHADER_TESS_CTRL) {
             handle_tess_ctrl_shader_output_decl(state, loc, var);
          }
+      } else if (var->type->contains_subroutine()) {
+         /* declare subroutine uniforms as hidden */
+         var->data.how_declared = ir_var_hidden;
       }
 
       /* Integer fragment inputs must be qualified with 'flat'.  In GLSL ES,
@@ -4394,6 +4420,7 @@ ast_function::hir(exec_list *instructions,
    ir_function *f = NULL;
    ir_function_signature *sig = NULL;
    exec_list hir_parameters;
+   YYLTYPE loc = this->get_location();
 
    const char *const name = identifier;
 
@@ -4445,6 +4472,17 @@ ast_function::hir(exec_list *instructions,
       return_type = glsl_type::error_type;
    }
 
+   /* ARB_shader_subroutine states:
+    *  "Subroutine declarations cannot be prototyped. It is an error to prepend
+    *   subroutine(...) to a function declaration."
+    */
+   if (this->return_type->qualifier.flags.q.subroutine_def && !is_definition) {
+      YYLTYPE loc = this->get_location();
+      _mesa_glsl_error(&loc, state,
+                       "function declaration `%s' cannot have subroutine prepended",
+                       name);
+   }
+
    /* From page 56 (page 62 of the PDF) of the GLSL 1.30 spec:
     * "No qualifier is allowed on the return type of a function."
     */
@@ -4482,15 +4520,15 @@ ast_function::hir(exec_list *instructions,
    f = state->symbols->get_function(name);
    if (f == NULL) {
       f = new(ctx) ir_function(name);
-      if (!state->symbols->add_function(f)) {
-         /* This function name shadows a non-function use of the same name. */
-         YYLTYPE loc = this->get_location();
-
-         _mesa_glsl_error(&loc, state, "function name `%s' conflicts with "
-                          "non-function", name);
-         return NULL;
+      if (!this->return_type->qualifier.flags.q.subroutine) {
+         if (!state->symbols->add_function(f)) {
+            /* This function name shadows a non-function use of the same name. */
+            YYLTYPE loc = this->get_location();
+            _mesa_glsl_error(&loc, state, "function name `%s' conflicts with "
+                             "non-function", name);
+            return NULL;
+         }
       }
-
       emit_function(state, f);
    }
 
@@ -4577,6 +4615,44 @@ ast_function::hir(exec_list *instructions,
    sig->replace_parameters(&hir_parameters);
    signature = sig;
 
+   if (this->return_type->qualifier.flags.q.subroutine_def) {
+      int idx;
+
+      f->num_subroutine_types = this->return_type->qualifier.subroutine_list->declarations.length();
+      f->subroutine_types = ralloc_array(state, const struct glsl_type *,
+                                         f->num_subroutine_types);
+      idx = 0;
+      foreach_list_typed(ast_declaration, decl, link, &this->return_type->qualifier.subroutine_list->declarations) {
+         const struct glsl_type *type;
+         /* the subroutine type must be already declared */
+         type = state->symbols->get_type(decl->identifier);
+         if (!type) {
+            _mesa_glsl_error(& loc, state, "unknown type '%s' in subroutine function definition", decl->identifier);
+         }
+         f->subroutine_types[idx++] = type;
+      }
+      state->subroutines = (ir_function **)reralloc(state, state->subroutines,
+                                                    ir_function *,
+                                                    state->num_subroutines + 1);
+      state->subroutines[state->num_subroutines] = f;
+      state->num_subroutines++;
+
+   }
+
+   if (this->return_type->qualifier.flags.q.subroutine) {
+      if (!state->symbols->add_type(this->identifier, glsl_type::get_subroutine_instance(this->identifier))) {
+         _mesa_glsl_error(& loc, state, "type '%s' previously defined", this->identifier);
+         return NULL;
+      }
+      state->subroutine_types = (ir_function **)reralloc(state, state->subroutine_types,
+                                                         ir_function *,
+                                                         state->num_subroutine_types + 1);
+      state->subroutine_types[state->num_subroutine_types] = f;
+      state->num_subroutine_types++;
+
+      f->is_subroutine = true;
+   }
+
    /* Function declarations (prototypes) do not have r-values.
     */
    return NULL;
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index d96e6a0e84d..a4671e203e2 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -40,7 +40,12 @@ ast_type_specifier::print(void) const
 bool
 ast_fully_specified_type::has_qualifiers() const
 {
-   return this->qualifier.flags.i != 0;
+   /* 'subroutine' isnt a real qualifier. */
+   ast_type_qualifier subroutine_only;
+   subroutine_only.flags.i = 0;
+   subroutine_only.flags.q.subroutine = 1;
+   subroutine_only.flags.q.subroutine_def = 1;
+   return (this->qualifier.flags.i & ~subroutine_only.flags.i) != 0;
 }
 
 bool ast_type_qualifier::has_interpolation() const
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 1cd1ffb98c6..efa0bb68099 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -595,6 +595,10 @@ subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_ena
 			    return classify_identifier(state, yytext);
 			}
 
+\.			{ struct _mesa_glsl_parse_state *state = yyextra;
+			  state->is_field = true;
+			  return DOT_TOK; }
+
 .			{ return yytext[0]; }
 
 %%
@@ -602,6 +606,10 @@ subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_ena
 int
 classify_identifier(struct _mesa_glsl_parse_state *state, const char *name)
 {
+   if (state->is_field) {
+      state->is_field = false;
+      return FIELD_SELECTION;
+   }
    if (state->symbols->get_variable(name) || state->symbols->get_function(name))
       return IDENTIFIER;
    else if (state->symbols->get_type(name))
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 9d5212c200f..7de31d9b40e 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -121,7 +121,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
    ast_case_statement *case_statement;
    ast_case_statement_list *case_statement_list;
    ast_interface_block *interface_block;
-
+   ast_subroutine_list *subroutine_list;
    struct {
       ast_node *cond;
       ast_expression *rest;
@@ -186,7 +186,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %token PRAGMA_OPTIMIZE_ON PRAGMA_OPTIMIZE_OFF
 %token PRAGMA_INVARIANT_ALL
 %token LAYOUT_TOK
-
+%token DOT_TOK
    /* Reserved words that are not actually used in the grammar.
     */
 %token ASM CLASS UNION ENUM TYPEDEF TEMPLATE THIS PACKED_TOK GOTO
@@ -215,6 +215,8 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %type <type_qualifier> layout_qualifier_id_list layout_qualifier_id
 %type <type_qualifier> interface_block_layout_qualifier
 %type <type_qualifier> memory_qualifier
+%type <type_qualifier> subroutine_qualifier
+%type <subroutine_list> subroutine_type_list
 %type <type_qualifier> interface_qualifier
 %type <type_specifier> type_specifier
 %type <type_specifier> type_specifier_nonarray
@@ -260,10 +262,6 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %type <expression> function_call_generic
 %type <expression> function_call_or_method
 %type <expression> function_call
-%type <expression> method_call_generic
-%type <expression> method_call_header_with_parameters
-%type <expression> method_call_header_no_parameters
-%type <expression> method_call_header
 %type <n> assignment_operator
 %type <n> unary_operator
 %type <expression> function_identifier
@@ -476,7 +474,7 @@ postfix_expression:
    {
       $$ = $1;
    }
-   | postfix_expression '.' any_identifier
+   | postfix_expression DOT_TOK FIELD_SELECTION
    {
       void *ctx = state;
       $$ = new(ctx) ast_expression(ast_field_selection, $1, NULL, NULL);
@@ -507,12 +505,6 @@ function_call:
 
 function_call_or_method:
    function_call_generic
-   | postfix_expression '.' method_call_generic
-   {
-      void *ctx = state;
-      $$ = new(ctx) ast_expression(ast_field_selection, $1, $3, NULL);
-      $$->set_location_range(@1, @3);
-   }
    ;
 
 function_call_generic:
@@ -554,62 +546,17 @@ function_identifier:
       $$ = new(ctx) ast_function_expression($1);
       $$->set_location(@1);
       }
-   | variable_identifier
+   | postfix_expression
    {
       void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
+      $$ = new(ctx) ast_function_expression($1);
       $$->set_location(@1);
       }
-   | FIELD_SELECTION
-   {
-      void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
-      $$->set_location(@1);
-      }
-   ;
-
-method_call_generic:
-   method_call_header_with_parameters ')'
-   | method_call_header_no_parameters ')'
-   ;
-
-method_call_header_no_parameters:
-   method_call_header VOID_TOK
-   | method_call_header
-   ;
-
-method_call_header_with_parameters:
-   method_call_header assignment_expression
-   {
-      $$ = $1;
-      $$->set_location(@1);
-      $$->expressions.push_tail(& $2->link);
-   }
-   | method_call_header_with_parameters ',' assignment_expression
-   {
-      $$ = $1;
-      $$->set_location(@1);
-      $$->expressions.push_tail(& $3->link);
-   }
    ;
 
    // Grammar Note: Constructors look like methods, but lexical
    // analysis recognized most of them as keywords. They are now
    // recognized through "type_specifier".
-method_call_header:
-   variable_identifier '('
-   {
-      void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
-      $$->set_location(@1);
-   }
-   ;
 
    // Grammar Note: No traditional style type casts.
 unary_expression:
@@ -910,7 +857,11 @@ function_header:
       $$->return_type = $1;
       $$->identifier = $2;
 
-      state->symbols->add_function(new(state) ir_function($2));
+      if ($1->qualifier.flags.q.subroutine) {
+         /* add type for IDENTIFIER search */
+         state->symbols->add_type($2, glsl_type::get_subroutine_instance($2));
+      } else
+         state->symbols->add_function(new(state) ir_function($2));
       state->symbols->push_scope();
    }
    ;
@@ -1676,6 +1627,41 @@ interface_block_layout_qualifier:
    }
    ;
 
+subroutine_qualifier:
+   SUBROUTINE
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.subroutine = 1;
+   }
+   | SUBROUTINE '(' subroutine_type_list ')'
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.subroutine_def = 1;
+      $$.subroutine_list = $3;
+   }
+   ;
+
+subroutine_type_list:
+   any_identifier
+   {
+        void *ctx = state;
+        ast_declaration *decl = new(ctx)  ast_declaration($1, NULL, NULL);
+        decl->set_location(@1);
+
+        $$ = new(ctx) ast_subroutine_list();
+        $$->declarations.push_tail(&decl->link);
+   }
+   | subroutine_type_list ',' any_identifier
+   {
+        void *ctx = state;
+        ast_declaration *decl = new(ctx)  ast_declaration($3, NULL, NULL);
+        decl->set_location(@3);
+
+        $$ = $1;
+        $$->declarations.push_tail(&decl->link);
+   }
+   ;
+
 interpolation_qualifier:
    SMOOTH
    {
@@ -1711,6 +1697,7 @@ type_qualifier:
    | interpolation_qualifier
    | layout_qualifier
    | memory_qualifier
+   | subroutine_qualifier
    | precision_qualifier
    {
       memset(&$$, 0, sizeof($$));
@@ -1801,6 +1788,11 @@ type_qualifier:
       $$ = $1;
       $$.merge_qualifier(&@1, state, $2);
    }
+   | subroutine_qualifier type_qualifier
+   {
+      $$ = $1;
+      $$.merge_qualifier(&@1, state, $2);
+   }
    | auxiliary_storage_qualifier type_qualifier
    {
       if ($2.has_auxiliary_storage()) {
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 8bc690df6ff..59a312fc647 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -173,6 +173,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->all_invariant = false;
    this->user_structures = NULL;
    this->num_user_structures = 0;
+   this->num_subroutines = 0;
+   this->subroutines = NULL;
+   this->num_subroutine_types = 0;
+   this->subroutine_types = NULL;
 
    /* supported_versions should be large enough to support the known desktop
     * GLSL versions plus 3 GLES versions (ES 1.00, ES 3.00, and ES 3.10))
@@ -855,6 +859,15 @@ _mesa_ast_set_aggregate_type(const glsl_type *type,
 void
 _mesa_ast_type_qualifier_print(const struct ast_type_qualifier *q)
 {
+   if (q->flags.q.subroutine)
+      printf("subroutine ");
+
+   if (q->flags.q.subroutine_def) {
+      printf("subroutine (");
+      q->subroutine_list->print();
+      printf(")");
+   }
+
    if (q->flags.q.constant)
       printf("const ");
 
@@ -1447,6 +1460,15 @@ ast_struct_specifier::ast_struct_specifier(const char *identifier,
    is_declaration = true;
 }
 
+void ast_subroutine_list::print(void) const
+{
+   foreach_list_typed (ast_node, ast, link, & this->declarations) {
+      if (&ast->link != this->declarations.get_head())
+         printf(", ");
+      ast->print();
+   }
+}
+
 static void
 set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 564820183f5..b65d53db641 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -589,6 +589,25 @@ struct _mesa_glsl_parse_state {
    unsigned atomic_counter_offsets[MAX_COMBINED_ATOMIC_BUFFERS];
 
    bool allow_extension_directive_midshader;
+
+   /**
+    * Known subroutine type declarations.
+    */
+   int num_subroutine_types;
+   ir_function **subroutine_types;
+
+   /**
+    * Functions that are associated with
+    * subroutine types.
+    */
+   int num_subroutines;
+   ir_function **subroutines;
+
+   /**
+    * field selection temporary parser storage -
+    * did the parser just parse a dot.
+    */
+   bool is_field;
 };
 
 # define YYLLOC_DEFAULT(Current, Rhs, N)			\
diff --git a/src/glsl/hir_field_selection.cpp b/src/glsl/hir_field_selection.cpp
index 0fa976811e6..337095b95b8 100644
--- a/src/glsl/hir_field_selection.cpp
+++ b/src/glsl/hir_field_selection.cpp
@@ -56,45 +56,6 @@ _mesa_ast_field_selection_to_hir(const ast_expression *expr,
 			  "structure",
 			  expr->primary_expression.identifier);
       }
-   } else if (expr->subexpressions[1] != NULL) {
-      /* Handle "method calls" in GLSL 1.20 - namely, array.length() */
-      state->check_version(120, 300, &loc, "methods not supported");
-
-      ast_expression *call = expr->subexpressions[1];
-      assert(call->oper == ast_function_call);
-
-      const char *method;
-      method = call->subexpressions[0]->primary_expression.identifier;
-
-      if (strcmp(method, "length") == 0) {
-         if (!call->expressions.is_empty())
-            _mesa_glsl_error(&loc, state, "length method takes no arguments");
-
-         if (op->type->is_array()) {
-            if (op->type->is_unsized_array())
-               _mesa_glsl_error(&loc, state, "length called on unsized array");
-
-            result = new(ctx) ir_constant(op->type->array_size());
-         } else if (op->type->is_vector()) {
-            if (state->ARB_shading_language_420pack_enable) {
-               /* .length() returns int. */
-               result = new(ctx) ir_constant((int) op->type->vector_elements);
-            } else {
-               _mesa_glsl_error(&loc, state, "length method on matrix only available"
-                                             "with ARB_shading_language_420pack");
-            }
-         } else if (op->type->is_matrix()) {
-            if (state->ARB_shading_language_420pack_enable) {
-               /* .length() returns int. */
-               result = new(ctx) ir_constant((int) op->type->matrix_columns);
-            } else {
-               _mesa_glsl_error(&loc, state, "length method on matrix only available"
-                                             "with ARB_shading_language_420pack");
-            }
-         }
-      } else {
-	 _mesa_glsl_error(&loc, state, "unknown method: `%s'", method);
-      }
    } else if (op->type->is_vector() ||
               (state->ARB_shading_language_420pack_enable &&
                op->type->is_scalar())) {

From 7dd429e8f74302d44af00d051e59911439152369 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Apr 2015 13:34:14 +1000
Subject: [PATCH 0586/1208] glsl/ir: add subroutine lowering pass (v2.3)

This lowers the enhanced ir_call using the lookaside table
of subroutines into an if ladder. This initially was done
at the AST level but it caused some ordering issues so a separate
pass was required.

v2: clone return value derefs.
v2.1: update for subroutine->int convert.
v2.2: add a clone for the array index

Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/Makefile.sources       |   1 +
 src/glsl/glsl_parser_extras.cpp |   1 +
 src/glsl/ir_optimization.h      |   2 +
 src/glsl/lower_subroutine.cpp   | 109 ++++++++++++++++++++++++++++++++
 4 files changed, 113 insertions(+)
 create mode 100644 src/glsl/lower_subroutine.cpp

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index b3b84d69dcd..93f4e48d615 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -154,6 +154,7 @@ LIBGLSL_FILES = \
 	lower_packed_varyings.cpp \
 	lower_named_interface_blocks.cpp \
 	lower_packing_builtins.cpp \
+	lower_subroutine.cpp \
 	lower_tess_level.cpp \
 	lower_texture_projection.cpp \
 	lower_variable_index_to_cond_assign.cpp \
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 59a312fc647..46896d77999 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -1617,6 +1617,7 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
       struct gl_shader_compiler_options *options =
          &ctx->Const.ShaderCompilerOptions[shader->Stage];
 
+      lower_subroutine(shader->ir, state);
       /* Do some optimization at compile time to reduce shader IR size
        * and reduce later work if the same shader is linked multiple times
        */
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 766b723df0b..eef107e5249 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -137,6 +137,8 @@ bool lower_tess_level(gl_shader *shader);
 
 bool lower_vertex_id(gl_shader *shader);
 
+bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
+
 ir_rvalue *
 compare_index_block(exec_list *instructions, ir_variable *index,
 		    unsigned base, unsigned components, void *mem_ctx);
diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
new file mode 100644
index 00000000000..e45ccfea6d9
--- /dev/null
+++ b/src/glsl/lower_subroutine.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2015 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file lower_subroutine.cpp
+ *
+ * lowers subroutines to an if ladder.
+ */
+
+#include "glsl_types.h"
+#include "glsl_parser_extras.h"
+#include "ir.h"
+#include "ir_builder.h"
+
+using namespace ir_builder;
+namespace {
+
+class lower_subroutine_visitor : public ir_hierarchical_visitor {
+public:
+   lower_subroutine_visitor()
+   {
+      this->progress = false;
+   }
+
+   ir_visitor_status visit_leave(ir_call *);
+   bool progress;
+   struct _mesa_glsl_parse_state *state;
+};
+
+}
+
+bool
+lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state)
+{
+   lower_subroutine_visitor v;
+   v.state = state;
+   visit_list_elements(&v, instructions);
+   return v.progress;
+}
+
+ir_visitor_status
+lower_subroutine_visitor::visit_leave(ir_call *ir)
+{
+   if (!ir->sub_var)
+      return visit_continue;
+
+   void *mem_ctx = ralloc_parent(ir);
+   ir_if *last_branch = NULL;
+   ir_dereference_variable *return_deref = ir->return_deref;
+
+   for (int s = this->state->num_subroutines - 1; s >= 0; s--) {
+      ir_rvalue *var;
+      ir_constant *lc = new(mem_ctx)ir_constant(s);
+      ir_function *fn = this->state->subroutines[s];
+      bool is_compat = false;
+
+      for (int i = 0; i < fn->num_subroutine_types; i++) {
+         if (ir->sub_var->type->without_array() == fn->subroutine_types[i]) {
+            is_compat = true;
+            break;
+         }
+      }
+      if (is_compat == false)
+         continue;
+
+      if (ir->array_idx != NULL)
+         var = new(mem_ctx) ir_dereference_array(ir->sub_var, ir->array_idx->clone(mem_ctx, NULL));
+      else
+         var = new(mem_ctx) ir_dereference_variable(ir->sub_var);
+
+      ir_function_signature *sub_sig =
+         fn->exact_matching_signature(this->state,
+                                      &ir->actual_parameters);
+
+      ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters);
+      if (!last_branch)
+         last_branch = if_tree(equal(subr_to_int(var), lc), new_call);
+      else
+         last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch);
+
+      if (s > 0)
+        return_deref = return_deref->clone(mem_ctx, NULL);
+   }
+   if (last_branch)
+      ir->insert_before(last_branch);
+   ir->remove();
+
+   return visit_continue;
+}

From 44ea8b9b8edc5f59da546683fe64129a1c1be449 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 21 Jul 2015 14:59:01 +1000
Subject: [PATCH 0587/1208] mesa/mtypes: add gl_subroutine_function and uniform
 storage to shader (v2)

This adds the necessary storage for subroutine info to gl_shader.

v2: add comments, rename one member
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/mtypes.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index c5673e268db..a252bf46606 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2414,6 +2414,15 @@ struct gl_ati_fragment_shader_state
    struct ati_fragment_shader *Current;
 };
 
+/**
+ *  Shader subroutine function definition
+ */
+struct gl_subroutine_function
+{
+   char *name;
+   int num_compat_types;
+   const struct glsl_type **types;
+};
 
 /**
  * A GLSL vertex or fragment shader object.
@@ -2600,6 +2609,25 @@ struct gl_shader
        */
       unsigned LocalSize[3];
    } Comp;
+
+   /**
+     * Number of types for subroutine uniforms.
+     */
+   GLuint NumSubroutineUniformTypes;
+
+   /**
+     * Subroutine uniform remap table
+     * based on the program level uniform remap table.
+     */
+   GLuint NumSubroutineUniformRemapTable;
+   struct gl_uniform_storage **SubroutineUniformRemapTable;
+
+   /**
+    * Num of subroutine functions for this stage
+    * and storage for them.
+    */
+   GLuint NumSubroutineFunctions;
+   struct gl_subroutine_function *SubroutineFunctions;
 };
 
 

From 60266863d80bb2af94fa5c189ccd23ee20607ea9 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:27:36 +1000
Subject: [PATCH 0588/1208] glsl: add uniform and program resource support (v2)

This adds linker support for subroutine uniforms, they
have some subtle differences from real uniforms, we also hide
them and they are given internal uniform names.

This also adds the subroutine locations and subroutine uniforms
to the program resource tracking for later use.

v1.1: drop is_subroutine_def

v2: handle explicit location properly, ARB_explicit_location
has a lot of language for subroutine shaders.
Calculate a link time the number of compatible subroutines
for a uniform, to make program resource easier later.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ir_uniform.h      |   8 ++
 src/glsl/link_uniforms.cpp | 103 ++++++++++++++++++++-
 src/glsl/linker.cpp        | 182 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 286 insertions(+), 7 deletions(-)

diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h
index e1b80147788..0b6f7201a20 100644
--- a/src/glsl/ir_uniform.h
+++ b/src/glsl/ir_uniform.h
@@ -114,6 +114,8 @@ struct gl_uniform_storage {
 
    struct gl_opaque_uniform_index image[MESA_SHADER_STAGES];
 
+   struct gl_opaque_uniform_index subroutine[MESA_SHADER_STAGES];
+
    /**
     * Storage used by the driver for the uniform
     */
@@ -173,9 +175,15 @@ struct gl_uniform_storage {
    /**
     * The 'base location' for this uniform in the uniform remap table. For
     * arrays this is the first element in the array.
+    * for subroutines this is in shader subroutine uniform remap table.
     */
    unsigned remap_location;
 
+   /**
+    * The number of compatible subroutines with this subroutine uniform.
+    */
+   unsigned num_compatible_subroutines;
+
    /**
     * This is a compiler-generated uniform that should not be advertised
     * via the API.
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index e786ddcaa90..254086dc050 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -47,9 +47,10 @@
 static unsigned
 values_for_type(const glsl_type *type)
 {
-   if (type->is_sampler()) {
+   if (type->is_sampler() || type->is_subroutine()) {
       return 1;
-   } else if (type->is_array() && type->fields.array->is_sampler()) {
+   } else if (type->is_array() && (type->fields.array->is_sampler() ||
+                                   type->fields.array->is_subroutine())) {
       return type->array_size();
    } else {
       return type->component_slots();
@@ -284,6 +285,7 @@ public:
    count_uniform_size(struct string_to_uint_map *map)
       : num_active_uniforms(0), num_values(0), num_shader_samplers(0),
         num_shader_images(0), num_shader_uniform_components(0),
+        num_shader_subroutines(0),
         is_ubo_var(false), map(map)
    {
       /* empty */
@@ -294,6 +296,7 @@ public:
       this->num_shader_samplers = 0;
       this->num_shader_images = 0;
       this->num_shader_uniform_components = 0;
+      this->num_shader_subroutines = 0;
    }
 
    void process(ir_variable *var)
@@ -331,6 +334,11 @@ public:
     */
    unsigned num_shader_uniform_components;
 
+   /**
+    * Number of subroutine uniforms used
+    */
+   unsigned num_shader_subroutines;
+
    bool is_ubo_var;
 
 private:
@@ -348,7 +356,9 @@ private:
        * count it for each shader target.
        */
       const unsigned values = values_for_type(type);
-      if (type->contains_sampler()) {
+      if (type->contains_subroutine()) {
+         this->num_shader_subroutines += values;
+      } else if (type->contains_sampler()) {
          this->num_shader_samplers += values;
       } else if (type->contains_image()) {
          this->num_shader_images += values;
@@ -421,6 +431,7 @@ public:
       this->shader_shadow_samplers = 0;
       this->next_sampler = 0;
       this->next_image = 0;
+      this->next_subroutine = 0;
       memset(this->targets, 0, sizeof(this->targets));
    }
 
@@ -535,6 +546,24 @@ private:
       }
    }
 
+   void handle_subroutines(const glsl_type *base_type,
+                           struct gl_uniform_storage *uniform)
+   {
+      if (base_type->is_subroutine()) {
+         uniform->subroutine[shader_type].index = this->next_subroutine;
+         uniform->subroutine[shader_type].active = true;
+
+         /* Increment the subroutine index by 1 for non-arrays and by the
+          * number of array elements for arrays.
+          */
+         this->next_subroutine += MAX2(1, uniform->array_elements);
+
+      } else {
+         uniform->subroutine[shader_type].index = ~0;
+         uniform->subroutine[shader_type].active = false;
+      }
+   }
+
    virtual void visit_field(const glsl_type *type, const char *name,
                             bool row_major)
    {
@@ -588,6 +617,7 @@ private:
       /* This assigns uniform indices to sampler and image uniforms. */
       handle_samplers(base_type, &this->uniforms[id]);
       handle_images(base_type, &this->uniforms[id]);
+      handle_subroutines(base_type, &this->uniforms[id]);
 
       /* If there is already storage associated with this uniform or if the
        * uniform is set as builtin, it means that it was set while processing
@@ -672,6 +702,7 @@ private:
    struct gl_uniform_storage *uniforms;
    unsigned next_sampler;
    unsigned next_image;
+   unsigned next_subroutine;
 
 public:
    union gl_constant_value *values;
@@ -954,8 +985,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_samplers = uniform_size.num_shader_samplers;
       sh->NumImages = uniform_size.num_shader_images;
       sh->num_uniform_components = uniform_size.num_shader_uniform_components;
-
       sh->num_combined_uniform_components = sh->num_uniform_components;
+
       for (unsigned i = 0; i < sh->NumUniformBlocks; i++) {
 	 sh->num_combined_uniform_components +=
 	    sh->UniformBlocks[i].UniformBufferSize / 4;
@@ -1008,6 +1039,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    /* Reserve all the explicit locations of the active uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
+      if (uniforms[i].type->is_subroutine())
+         continue;
+
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC) {
          /* How many new entries for this uniform? */
          const unsigned entries = MAX2(1, uniforms[i].array_elements);
@@ -1025,6 +1059,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    /* Reserve locations for rest of the uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
 
+      if (uniforms[i].type->is_subroutine())
+         continue;
       /* Built-in uniforms should not get any location. */
       if (uniforms[i].builtin)
          continue;
@@ -1053,6 +1089,65 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       prog->NumUniformRemapTable += entries;
    }
 
+   /* Reserve all the explicit locations of the active subroutine uniforms. */
+   for (unsigned i = 0; i < num_uniforms; i++) {
+      if (!uniforms[i].type->is_subroutine())
+         continue;
+
+      if (uniforms[i].remap_location == UNMAPPED_UNIFORM_LOC)
+         continue;
+
+      for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
+         struct gl_shader *sh = prog->_LinkedShaders[j];
+         if (!sh)
+            continue;
+
+         if (!uniforms[i].subroutine[j].active)
+            continue;
+
+         /* How many new entries for this uniform? */
+         const unsigned entries = MAX2(1, uniforms[i].array_elements);
+
+         /* Set remap table entries point to correct gl_uniform_storage. */
+         for (unsigned k = 0; k < entries; k++) {
+            unsigned element_loc = uniforms[i].remap_location + k;
+            assert(sh->SubroutineUniformRemapTable[element_loc] ==
+                   INACTIVE_UNIFORM_EXPLICIT_LOCATION);
+            sh->SubroutineUniformRemapTable[element_loc] = &uniforms[i];
+         }
+      }
+   }
+
+   /* reserve subroutine locations */
+   for (unsigned i = 0; i < num_uniforms; i++) {
+
+      if (!uniforms[i].type->is_subroutine())
+         continue;
+      const unsigned entries = MAX2(1, uniforms[i].array_elements);
+
+      if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC)
+         continue;
+      for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
+         struct gl_shader *sh = prog->_LinkedShaders[j];
+         if (!sh)
+            continue;
+
+         if (!uniforms[i].subroutine[j].active)
+            continue;
+
+         sh->SubroutineUniformRemapTable =
+            reralloc(sh,
+                     sh->SubroutineUniformRemapTable,
+                     gl_uniform_storage *,
+                     sh->NumSubroutineUniformRemapTable + entries);
+
+         for (unsigned k = 0; k < entries; k++)
+            sh->SubroutineUniformRemapTable[sh->NumSubroutineUniformRemapTable + k] = &uniforms[i];
+         uniforms[i].remap_location = sh->NumSubroutineUniformRemapTable;
+         sh->NumSubroutineUniformRemapTable += entries;
+      }
+   }
+
 #ifndef NDEBUG
    for (unsigned i = 0; i < num_uniforms; i++) {
       assert(uniforms[i].storage != NULL || uniforms[i].builtin);
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index d0445f16032..8f2c8ee9a05 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -933,6 +933,10 @@ cross_validate_globals(struct gl_shader_program *prog,
 	 if (uniforms_only && (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage))
 	    continue;
 
+         /* don't cross validate subroutine uniforms */
+         if (var->type->contains_subroutine())
+            continue;
+
 	 /* Don't cross validate temporaries that are at global scope.  These
 	  * will eventually get pulled into the shaders 'main'.
 	  */
@@ -2196,8 +2200,11 @@ update_array_sizes(struct gl_shader_program *prog)
           * Atomic counters are supposed to get deterministic
           * locations assigned based on the declaration ordering and
           * sizes, array compaction would mess that up.
+          *
+          * Subroutine uniforms are not removed.
 	  */
-	 if (var->is_in_buffer_block() || var->type->contains_atomic())
+	 if (var->is_in_buffer_block() || var->type->contains_atomic() ||
+	     var->type->contains_subroutine())
 	    continue;
 
 	 unsigned int size = var->data.max_array_access;
@@ -2788,6 +2795,49 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
    }
 }
 
+static void
+link_calculate_subroutine_compat(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader *sh = prog->_LinkedShaders[i];
+      int count;
+      if (!sh)
+         continue;
+
+      for (unsigned j = 0; j < sh->NumSubroutineUniformRemapTable; j++) {
+         struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[j];
+
+         if (!uni)
+            continue;
+
+         count = 0;
+         for (unsigned f = 0; f < sh->NumSubroutineFunctions; f++) {
+            struct gl_subroutine_function *fn = &sh->SubroutineFunctions[f];
+            for (int k = 0; k < fn->num_compat_types; k++) {
+               if (fn->types[k] == uni->type) {
+                  count++;
+                  break;
+               }
+            }
+         }
+         uni->num_compatible_subroutines = count;
+      }
+   }
+}
+
+static void
+check_subroutine_resources(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader *sh = prog->_LinkedShaders[i];
+
+      if (sh) {
+         if (sh->NumSubroutineUniformRemapTable > MAX_SUBROUTINE_UNIFORM_LOCATIONS)
+            linker_error(prog, "Too many %s shader subroutine uniforms\n",
+                         _mesa_shader_stage_to_string(i));
+      }
+   }
+}
 /**
  * Validate shader image resources.
  */
@@ -2896,6 +2946,59 @@ reserve_explicit_locations(struct gl_shader_program *prog,
    return true;
 }
 
+static bool
+reserve_subroutine_explicit_locations(struct gl_shader_program *prog,
+                                      struct gl_shader *sh,
+                                      ir_variable *var)
+{
+   unsigned slots = var->type->uniform_locations();
+   unsigned max_loc = var->data.location + slots - 1;
+
+   /* Resize remap table if locations do not fit in the current one. */
+   if (max_loc + 1 > sh->NumSubroutineUniformRemapTable) {
+      sh->SubroutineUniformRemapTable =
+         reralloc(sh, sh->SubroutineUniformRemapTable,
+                  gl_uniform_storage *,
+                  max_loc + 1);
+
+      if (!sh->SubroutineUniformRemapTable) {
+         linker_error(prog, "Out of memory during linking.\n");
+         return false;
+      }
+
+      /* Initialize allocated space. */
+      for (unsigned i = sh->NumSubroutineUniformRemapTable; i < max_loc + 1; i++)
+         sh->SubroutineUniformRemapTable[i] = NULL;
+
+      sh->NumSubroutineUniformRemapTable = max_loc + 1;
+   }
+
+   for (unsigned i = 0; i < slots; i++) {
+      unsigned loc = var->data.location + i;
+
+      /* Check if location is already used. */
+      if (sh->SubroutineUniformRemapTable[loc] == INACTIVE_UNIFORM_EXPLICIT_LOCATION) {
+
+         /* ARB_explicit_uniform_location specification states:
+          *     "No two subroutine uniform variables can have the same location
+          *     in the same shader stage, otherwise a compiler or linker error
+          *     will be generated."
+          */
+         linker_error(prog,
+                      "location qualifier for uniform %s overlaps "
+                      "previously used location\n",
+                      var->name);
+         return false;
+      }
+
+      /* Initialize location as inactive before optimization
+       * rounds and location assignment.
+       */
+      sh->SubroutineUniformRemapTable[loc] = INACTIVE_UNIFORM_EXPLICIT_LOCATION;
+   }
+
+   return true;
+}
 /**
  * Check and reserve all explicit uniform locations, called before
  * any optimizations happen to handle also inactive uniforms and
@@ -2928,7 +3031,12 @@ check_explicit_uniform_locations(struct gl_context *ctx,
          ir_variable *var = node->as_variable();
          if (var && (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage) &&
              var->data.explicit_location) {
-            if (!reserve_explicit_locations(prog, uniform_map, var)) {
+            bool ret;
+            if (var->type->is_subroutine())
+               ret = reserve_subroutine_explicit_locations(prog, sh, var);
+            else
+               ret = reserve_explicit_locations(prog, uniform_map, var);
+            if (!ret) {
                delete uniform_map;
                return;
             }
@@ -3143,10 +3251,39 @@ build_program_resource_list(struct gl_context *ctx,
          return;
    }
 
+   for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
+      GLenum type;
+      if (!shProg->UniformStorage[i].hidden)
+         continue;
+
+      for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) {
+         if (!shProg->UniformStorage[i].subroutine[j].active)
+            continue;
+
+         type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j);
+         /* add shader subroutines */
+         if (!add_program_resource(shProg, type, &shProg->UniformStorage[i], 0))
+            return;
+      }
+   }
+
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader *sh = shProg->_LinkedShaders[i];
+      GLuint type;
+
+      if (!sh)
+         continue;
+
+      type = _mesa_shader_stage_to_subroutine((gl_shader_stage)i);
+      for (unsigned j = 0; j < sh->NumSubroutineFunctions; j++) {
+         if (!add_program_resource(shProg, type, &sh->SubroutineFunctions[j], 0))
+            return;
+      }
+   }
+
    /* TODO - following extensions will require more resource types:
     *
     *    GL_ARB_shader_storage_buffer_object
-    *    GL_ARB_shader_subroutine
     */
 }
 
@@ -3184,6 +3321,41 @@ validate_sampler_array_indexing(struct gl_context *ctx,
    return true;
 }
 
+void
+link_assign_subroutine_types(struct gl_context *ctx,
+                             struct gl_shader_program *prog)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      gl_shader *sh = prog->_LinkedShaders[i];
+
+      if (sh == NULL)
+         continue;
+
+      foreach_in_list(ir_instruction, node, sh->ir) {
+         ir_function *fn = node->as_function();
+         if (!fn)
+            continue;
+
+         if (fn->is_subroutine)
+            sh->NumSubroutineUniformTypes++;
+
+         if (!fn->num_subroutine_types)
+            continue;
+
+         sh->SubroutineFunctions = reralloc(sh, sh->SubroutineFunctions,
+                                            struct gl_subroutine_function,
+                                            sh->NumSubroutineFunctions + 1);
+         sh->SubroutineFunctions[sh->NumSubroutineFunctions].name = ralloc_strdup(sh, fn->name);
+         sh->SubroutineFunctions[sh->NumSubroutineFunctions].num_compat_types = fn->num_subroutine_types;
+         sh->SubroutineFunctions[sh->NumSubroutineFunctions].types =
+            ralloc_array(sh, const struct glsl_type *,
+                         fn->num_subroutine_types);
+         for (int j = 0; j < fn->num_subroutine_types; j++)
+            sh->SubroutineFunctions[sh->NumSubroutineFunctions].types[j] = fn->subroutine_types[j];
+         sh->NumSubroutineFunctions++;
+      }
+   }
+}
 
 void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
@@ -3373,6 +3545,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    }
 
    check_explicit_uniform_locations(ctx, prog);
+   link_assign_subroutine_types(ctx, prog);
+
    if (!prog->LinkStatus)
       goto done;
 
@@ -3630,7 +3804,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    link_assign_atomic_counter_resources(ctx, prog);
    store_fragdepth_layout(prog);
 
+   link_calculate_subroutine_compat(ctx, prog);
    check_resources(ctx, prog);
+   check_subroutine_resources(ctx, prog);
    check_image_resources(ctx, prog);
    link_check_atomic_counter_resources(ctx, prog);
 

From 0a18f160159b93c57943e5cb4d9d9a78a5b72996 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:27:58 +1000
Subject: [PATCH 0589/1208] program_resource: add subroutine support (v3.1)

This fleshes out the ARB_program_query support for the
APIs that ARB_shader_subroutine introduces, leaving
some TODOs for later addition.

v2: reworked for lots of the ARB_program_interface_query
entry points and tests
v3: use common function to test for subroutine support
v3.1: add tess, fix missing breaks

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/program_resource.c | 100 +++++++++++++++++++++++-------
 src/mesa/main/shader_query.cpp   | 101 +++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+), 22 deletions(-)

diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index e77bb03e9a1..641ef22c1a6 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -28,10 +28,11 @@
 #include "main/mtypes.h"
 #include "main/shaderapi.h"
 #include "main/shaderobj.h"
+#include "main/context.h"
 #include "program_resource.h"
-
+#include "ir_uniform.h"
 static bool
-supported_interface_enum(GLenum iface)
+supported_interface_enum(struct gl_context *ctx, GLenum iface)
 {
    switch (iface) {
    case GL_UNIFORM:
@@ -42,17 +43,21 @@ supported_interface_enum(GLenum iface)
    case GL_ATOMIC_COUNTER_BUFFER:
       return true;
    case GL_VERTEX_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      return _mesa_has_shader_subroutine(ctx);
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      return _mesa_has_geometry_shaders(ctx) && _mesa_has_shader_subroutine(ctx);
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return _mesa_has_compute_shaders(ctx) && _mesa_has_shader_subroutine(ctx);
    case GL_TESS_CONTROL_SUBROUTINE:
    case GL_TESS_EVALUATION_SUBROUTINE:
-   case GL_GEOMETRY_SUBROUTINE:
-   case GL_FRAGMENT_SUBROUTINE:
-   case GL_COMPUTE_SUBROUTINE:
-   case GL_VERTEX_SUBROUTINE_UNIFORM:
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
-   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
-   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return _mesa_has_tessellation(ctx) && _mesa_has_shader_subroutine(ctx);
    case GL_BUFFER_VARIABLE:
    case GL_SHADER_STORAGE_BLOCK:
    default:
@@ -79,7 +84,7 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
    }
 
    /* Validate interface. */
-   if (!supported_interface_enum(programInterface)) {
+   if (!supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s)",
                   _mesa_enum_to_string(programInterface));
       return;
@@ -143,6 +148,31 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
       };
       break;
    case GL_MAX_NUM_COMPATIBLE_SUBROUTINES:
+      switch (programInterface) {
+      case GL_VERTEX_SUBROUTINE_UNIFORM:
+      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: {
+         for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+            if (shProg->ProgramResourceList[i].Type == programInterface) {
+               struct gl_uniform_storage *uni =
+                  (struct gl_uniform_storage *)
+                  shProg->ProgramResourceList[i].Data;
+               *params = MAX2(*params, uni->num_compatible_subroutines);
+            }
+         }
+         break;
+      }
+
+      default:
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetProgramInterfaceiv(%s pname %s)",
+                     _mesa_enum_to_string(programInterface),
+                     _mesa_enum_to_string(pname));
+      }
+      break;
    default:
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glGetProgramInterfaceiv(pname %s)",
@@ -206,6 +236,11 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    if (!shProg || !name)
       return GL_INVALID_INDEX;
 
+   if (!supported_interface_enum(ctx, programInterface)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
+                  _mesa_enum_to_string(programInterface));
+      return GL_INVALID_INDEX;
+   }
    /*
     * For the interface TRANSFORM_FEEDBACK_VARYING, the value INVALID_INDEX
     * should be returned when querying the index assigned to the special names
@@ -217,6 +252,18 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
       return GL_INVALID_INDEX;
 
    switch (programInterface) {
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_VERTEX_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
    case GL_UNIFORM:
@@ -260,7 +307,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
       return;
 
    if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
-       !supported_interface_enum(programInterface)) {
+       !supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
                   _mesa_enum_to_string(programInterface));
       return;
@@ -366,24 +413,33 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
    case GL_PROGRAM_OUTPUT:
       break;
 
-   /* For reference valid cases requiring additional extension support:
-    * GL_ARB_shader_subroutine
-    * GL_ARB_tessellation_shader
-    * GL_ARB_compute_shader
-    */
    case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_geometry_shaders(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_compute_shaders(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
-   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
-   case GL_COMPUTE_SUBROUTINE_UNIFORM:
-
+      if (!_mesa_has_tessellation(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
    default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)",
-                  _mesa_enum_to_string(programInterface), name);
+         goto fail;
    }
 
    return _mesa_program_resource_location(shProg, programInterface, name);
+fail:
+   _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)",
+               _mesa_enum_to_string(programInterface), name);
+   return -1;
 }
 
 /**
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 59b9396f19b..3e52a098a67 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -61,6 +61,7 @@ DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
 DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
 _mesa_BindAttribLocation(GLhandleARB program, GLuint index,
@@ -497,6 +498,20 @@ _mesa_program_resource_name(struct gl_program_resource *res)
       return RESOURCE_VAR(res)->name;
    case GL_UNIFORM:
       return RESOURCE_UNI(res)->name;
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      return RESOURCE_UNI(res)->name + MESA_SUBROUTINE_PREFIX_LEN;
+   case GL_VERTEX_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE:
+      return RESOURCE_SUB(res)->name;
    default:
       assert(!"support for resource type not implemented");
    }
@@ -515,7 +530,19 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->data.max_array_access;
    case GL_UNIFORM:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
       return RESOURCE_UNI(res)->array_elements;
+   case GL_VERTEX_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE:
    case GL_ATOMIC_COUNTER_BUFFER:
    case GL_UNIFORM_BLOCK:
       return 0;
@@ -571,6 +598,18 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
       case GL_TRANSFORM_FEEDBACK_VARYING:
       case GL_UNIFORM_BLOCK:
       case GL_UNIFORM:
+      case GL_VERTEX_SUBROUTINE_UNIFORM:
+      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      case GL_VERTEX_SUBROUTINE:
+      case GL_GEOMETRY_SUBROUTINE:
+      case GL_FRAGMENT_SUBROUTINE:
+      case GL_COMPUTE_SUBROUTINE:
+      case GL_TESS_CONTROL_SUBROUTINE:
+      case GL_TESS_EVALUATION_SUBROUTINE:
          if (strncmp(rname, name, baselen) == 0) {
             /* Basename match, check if array or struct. */
             if (name[baselen] == '\0' ||
@@ -651,6 +690,18 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
       case GL_UNIFORM:
+      case GL_VERTEX_SUBROUTINE_UNIFORM:
+      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      case GL_VERTEX_SUBROUTINE:
+      case GL_GEOMETRY_SUBROUTINE:
+      case GL_FRAGMENT_SUBROUTINE:
+      case GL_COMPUTE_SUBROUTINE:
+      case GL_TESS_CONTROL_SUBROUTINE:
+      case GL_TESS_EVALUATION_SUBROUTINE:
          if (++idx == (int) index)
             return res;
          break;
@@ -740,6 +791,8 @@ program_resource_location(struct gl_shader_program *shProg,
 {
    unsigned index, offset;
    int array_index = -1;
+   long offset_ret;
+   const GLchar *base_name_end;
 
    if (res->Type == GL_PROGRAM_INPUT || res->Type == GL_PROGRAM_OUTPUT) {
       array_index = array_index_of_resource(res, name);
@@ -780,6 +833,14 @@ program_resource_location(struct gl_shader_program *shProg,
       /* location in remap table + array element offset */
       return RESOURCE_UNI(res)->remap_location + offset;
 
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      offset_ret = parse_program_resource_name(name, &base_name_end);
+      return RESOURCE_UNI(res)->remap_location + ((offset_ret != -1) ? offset_ret : 0);
    default:
       return -1;
    }
@@ -1048,6 +1109,46 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          goto invalid_operation;
       *val = RESOURCE_VAR(res)->data.index;
       return 1;
+
+   case GL_NUM_COMPATIBLE_SUBROUTINES:
+      if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM &&
+          res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM &&
+          res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM &&
+          res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM)
+         goto invalid_operation;
+      *val = RESOURCE_UNI(res)->num_compatible_subroutines;
+      return 1;
+   case GL_COMPATIBLE_SUBROUTINES: {
+      const struct gl_uniform_storage *uni;
+      struct gl_shader *sh;
+      unsigned count, i;
+      int j;
+
+      if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM &&
+          res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM &&
+          res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM &&
+          res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM)
+         goto invalid_operation;
+      uni = RESOURCE_UNI(res);
+
+      sh = shProg->_LinkedShaders[_mesa_shader_stage_from_subroutine_uniform(res->Type)];
+      count = 0;
+      for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+         struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+         for (j = 0; j < fn->num_compat_types; j++) {
+            if (fn->types[j] == uni->type) {
+               val[count++] = i;
+               break;
+            }
+         }
+      }
+      return count;
+   }
+   /* GL_ARB_tessellation_shader */
    case GL_IS_PER_PATCH:
       switch (res->Type) {
       case GL_PROGRAM_INPUT:

From 3f4f3e2d4877e1e2bda064cc323fb7b3667e12fe Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:28:40 +1000
Subject: [PATCH 0590/1208] program: add subroutine uniform support (v1.1)

Add support for the subroutine uniform type ir->mesa.cpp

v1.1: add subroutine to int to switch

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/program/ir_to_mesa.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index cadbf49ca65..8f58f3edf98 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -534,6 +534,7 @@ type_size(const struct glsl_type *type)
       return size;
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       /* Samplers take up one slot in UNIFORMS[], but they're baked in
        * at link time.
        */
@@ -1342,6 +1343,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
    case ir_unop_dFdx_fine:
    case ir_unop_dFdy_coarse:
    case ir_unop_dFdy_fine:
+   case ir_unop_subroutine_to_int:
       assert(!"not supported");
       break;
 
@@ -2451,6 +2453,7 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
 	    break;
 	 case GLSL_TYPE_SAMPLER:
 	 case GLSL_TYPE_IMAGE:
+         case GLSL_TYPE_SUBROUTINE:
 	    format = uniform_native;
 	    columns = 1;
 	    break;

From 6f57fda494a6b4ccf30cab000ca28154fbabcb78 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:29:12 +1000
Subject: [PATCH 0591/1208] mesa: fill out the ARB_shader_subroutine APIs

This fleshes out the APIs, using the program resource
APIs where they should match.

It also sets the default values to valid subroutines.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/shaderapi.c | 451 +++++++++++++++++++++++++++++++++++++-
 src/mesa/main/shaderapi.h |   3 +
 2 files changed, 451 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 495cb46808c..34d280a6fba 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1171,6 +1171,7 @@ _mesa_use_program(struct gl_context *ctx, struct gl_shader_program *shProg)
       use_shader_program(ctx, i, shProg, &ctx->Shader);
    _mesa_active_program(ctx, shProg, "glUseProgram");
 
+   _mesa_shader_program_init_subroutine_defaults(shProg);
    if (ctx->Driver.UseProgram)
       ctx->Driver.UseProgram(ctx, shProg);
 }
@@ -2165,15 +2166,75 @@ GLint GLAPIENTRY
 _mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
                                    const GLchar *name)
 {
-   return -1;
-}
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetSubroutineUniformLocation";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
 
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return -1;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+   return _mesa_program_resource_location(shProg, resource_type, name);
+}
 
 GLuint GLAPIENTRY
 _mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
                          const GLchar *name)
 {
-   return GL_INVALID_INDEX;
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetSubroutineIndex";
+   struct gl_shader_program *shProg;
+   struct gl_program_resource *res;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return -1;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return -1;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine(stage);
+   res = _mesa_program_resource_find_name(shProg, resource_type, name);
+   if (!res) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+     return -1;
+   }
+
+   return _mesa_program_resource_index(shProg, res);
 }
 
 
@@ -2181,6 +2242,82 @@ GLvoid GLAPIENTRY
 _mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
                                    GLuint index, GLenum pname, GLint *values)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineUniformiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+   struct gl_program_resource *res;
+   const struct gl_uniform_storage *uni;
+   GLenum resource_type;
+   int count, i, j;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   switch (pname) {
+   case GL_NUM_COMPATIBLE_SUBROUTINES: {
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         values[0] = uni->num_compatible_subroutines;
+      }
+      break;
+   }
+   case GL_COMPATIBLE_SUBROUTINES: {
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         count = 0;
+         for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+            struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+            for (j = 0; j < fn->num_compat_types; j++) {
+               if (fn->types[j] == uni->type) {
+                  values[count++] = i;
+                  break;
+               }
+            }
+         }
+      }
+      break;
+   }
+   case GL_UNIFORM_SIZE:
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         values[0] = uni->array_elements ? uni->array_elements : 1;
+      }
+      break;
+   case GL_UNIFORM_NAME_LENGTH:
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         values[0] = strlen(_mesa_program_resource_name(res)) + 1 + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);;
+      }
+      break;
+   default:
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
 }
 
 
@@ -2189,6 +2326,37 @@ _mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
                                      GLuint index, GLsizei bufsize,
                                      GLsizei *length, GLchar *name)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineUniformName";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+   /* get program resource name */
+   _mesa_get_program_resource_name(shProg, resource_type,
+                                   index, bufsize,
+                                   length, name, api_name);
 }
 
 
@@ -2197,6 +2365,35 @@ _mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
                               GLuint index, GLsizei bufsize,
                               GLsizei *length, GLchar *name)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineName";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+   resource_type = _mesa_shader_stage_to_subroutine(stage);
+   _mesa_get_program_resource_name(shProg, resource_type,
+                                   index, bufsize,
+                                   length, name, api_name);
 }
 
 
@@ -2204,6 +2401,80 @@ GLvoid GLAPIENTRY
 _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
                             const GLuint *indices)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glUniformSubroutinesuiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+   int i;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   shProg = ctx->_Shader->CurrentProgram[stage];
+   if (!shProg) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (count != sh->NumSubroutineUniformRemapTable) {
+      _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+      return;
+   }
+
+   i = 0;
+   do {
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count = uni->array_elements ? uni->array_elements : 1;
+      int j, k;
+
+      for (j = i; j < i + uni_count; j++) {
+         struct gl_subroutine_function *subfn;
+         if (indices[j] >= sh->NumSubroutineFunctions) {
+            _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+            return;
+         }
+
+         subfn = &sh->SubroutineFunctions[indices[j]];
+         for (k = 0; k < subfn->num_compat_types; k++) {
+            if (subfn->types[k] == uni->type)
+               break;
+         }
+         if (k == subfn->num_compat_types) {
+            _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+            return;
+         }
+      }
+      i += uni_count;
+   } while(i < count);
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   i = 0;
+   do {
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count = uni->array_elements ? uni->array_elements : 1;
+
+      memcpy(&uni->storage[0], &indices[i],
+             sizeof(GLuint) * uni_count);
+
+      uni->initialized = true;
+      _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
+      i += uni_count;
+   } while(i < count);
 }
 
 
@@ -2211,6 +2482,46 @@ GLvoid GLAPIENTRY
 _mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
                               GLuint *params)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetUniformSubroutineuiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   shProg = ctx->_Shader->CurrentProgram[stage];
+   if (!shProg) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (location >= sh->NumSubroutineUniformRemapTable) {
+      _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+      return;
+   }
+
+   {
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[location];
+      int offset = location - uni->subroutine[stage].index;
+      memcpy(params, &uni->storage[offset],
+	     sizeof(GLuint));
+   }
 }
 
 
@@ -2218,4 +2529,138 @@ GLvoid GLAPIENTRY
 _mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
                         GLenum pname, GLint *values)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetProgramStageiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg)
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      return;
+   }
+
+   switch (pname) {
+   case GL_ACTIVE_SUBROUTINES:
+      values[0] = sh->NumSubroutineFunctions;
+      break;
+   case GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS:
+      values[0] = sh->NumSubroutineUniformRemapTable;
+      break;
+   case GL_ACTIVE_SUBROUTINE_UNIFORMS:
+      values[0] = sh->NumSubroutineUniformTypes;
+      break;
+   case GL_ACTIVE_SUBROUTINE_MAX_LENGTH:
+   {
+      unsigned i;
+      GLint max_len = 0;
+      GLenum resource_type;
+      struct gl_program_resource *res;
+
+      resource_type = _mesa_shader_stage_to_subroutine(stage);
+      for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+         res = _mesa_program_resource_find_index(shProg, resource_type, i);
+         if (res) {
+            const GLint len = strlen(_mesa_program_resource_name(res)) + 1;
+            if (len > max_len)
+               max_len = len;
+         }
+      }
+      values[0] = max_len;
+      break;
+   }
+   case GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH:
+   {
+      unsigned i;
+      GLint max_len = 0;
+      GLenum resource_type;
+      struct gl_program_resource *res;
+
+      resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+      for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) {
+         res = _mesa_program_resource_find_index(shProg, resource_type, i);
+         if (res) {
+            const GLint len = strlen(_mesa_program_resource_name(res)) + 1+  ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);
+
+            if (len > max_len)
+               max_len = len;
+         }
+      }
+      values[0] = max_len;
+      break;
+   }
+   default:
+      _mesa_error(ctx, GL_INVALID_ENUM, api_name);
+      values[0] = -1;
+      break;
+   }
+}
+
+static int
+find_compat_subroutine(struct gl_shader *sh, const struct glsl_type *type)
+{
+   int i, j;
+
+   for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+      struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+      for (j = 0; j < fn->num_compat_types; j++) {
+         if (fn->types[j] == type)
+            return i;
+      }
+   }
+   return 0;
+}
+
+static void
+_mesa_shader_init_subroutine_defaults(struct gl_shader *sh)
+{
+   int i, j;
+
+   for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) {
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count;
+      int val;
+
+      if (!uni)
+         continue;
+      uni_count = uni->array_elements ? uni->array_elements : 1;
+      val = find_compat_subroutine(sh, uni->type);
+
+      for (j = 0; j < uni_count; j++)
+         memcpy(&uni->storage[j], &val, sizeof(int));
+      uni->initialized = true;
+      _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
+   }
+}
+
+void
+_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg)
+{
+   int i;
+
+   if (!shProg)
+      return;
+
+   for (i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (!shProg->_LinkedShaders[i])
+         continue;
+
+      _mesa_shader_init_subroutine_defaults(shProg->_LinkedShaders[i]);
+   }
 }
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index c2f04d875a7..c081f749f93 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -272,6 +272,9 @@ extern void GLAPIENTRY
 _mesa_PatchParameterfv(GLenum pname, const GLfloat *values);
 
 /* GL_ARB_shader_subroutine */
+void
+_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg);
+
 extern GLint GLAPIENTRY
 _mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
                                    const GLchar *name);

From a922c279930ec1ab34506ca2e24d8a62a297ea33 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:29:42 +1000
Subject: [PATCH 0592/1208] st/mesa: add subroutine bits (v1.1)

Just add support for the subroutine type to the
glsl->tgsi convertor.

v1.1: add subroutine to int support.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 9553791b07a..905d661a437 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -815,7 +815,7 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
    case TGSI_OPCODE_##c: \
       if (type == GLSL_TYPE_DOUBLE) \
          op = TGSI_OPCODE_##d; \
-      else if (type == GLSL_TYPE_INT)       \
+      else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE)       \
          op = TGSI_OPCODE_##i; \
       else if (type == GLSL_TYPE_UINT) \
          op = TGSI_OPCODE_##u; \

From c3fad009c54fb526d236fd10f4377ce7fbb54459 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Apr 2015 10:30:53 +1000
Subject: [PATCH 0593/1208] st/mesa: enable shader subroutine

since this touches drivers, only enable it on gallium
for now for drivers reporting GLSL 1.30 or above.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index dc5cdfd1fe7..13d636fede0 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -710,6 +710,7 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->OES_depth_texture_cube_map = GL_TRUE;
       extensions->ARB_shading_language_420pack = GL_TRUE;
       extensions->ARB_texture_query_levels = GL_TRUE;
+      extensions->ARB_shader_subroutine = GL_TRUE;
 
       if (!options->disable_shader_bit_encoding) {
          extensions->ARB_shader_bit_encoding = GL_TRUE;

From 65d84daf29adb0da779e9b49291cb4e26f021e1e Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 22 Jul 2015 11:04:52 +1000
Subject: [PATCH 0594/1208] docs/GL3.txt: update ARB_shader_subroutine status.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt              | 2 +-
 docs/relnotes/10.7.0.html | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 197b676f64b..2cf5c316468 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -111,7 +111,7 @@ GL 4.0, GLSL 4.00:
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_subroutine                             started (Dave)
+  GL_ARB_shader_subroutine                             DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_tessellation_shader                           DONE (radeonsi)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index c7d0d2480ff..1464dc4f092 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -50,6 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
+<li>GL_ARB_shader_subroutine on core profile gallium drivers</li>
 <li>GL_ARB_tessellation_shader on radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>

From f97c14f9e4ff5ae2b7313eb0098f99816fead71d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 15:40:14 -0400
Subject: [PATCH 0595/1208] nvc0: preliminary tess support

Uncomment the various functionality that was already there and add in
obvious missing bits that parallel vp/gp/fp functionality.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir.cpp       |  4 +-
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  5 --
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     |  2 +
 .../drivers/nouveau/nvc0/nvc0_context.h       |  4 +-
 .../drivers/nouveau/nvc0/nvc0_program.c       | 29 ++++-------
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  6 +--
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 52 +++++++++++++++++++
 src/gallium/drivers/nouveau/nvc0/nvc0_tex.c   | 32 +++++-------
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   |  3 +-
 .../drivers/nouveau/nvc0/nvc0_vbo_translate.c |  3 +-
 10 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index ca3c806e92f..cce60550ae5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1153,8 +1153,8 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
 
    switch (info->type) {
    PROG_TYPE_CASE(VERTEX, VERTEX);
-// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL);
-// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL);
+   PROG_TYPE_CASE(TESS_CTRL, TESSELLATION_CONTROL);
+   PROG_TYPE_CASE(TESS_EVAL, TESSELLATION_EVAL);
    PROG_TYPE_CASE(GEOMETRY, GEOMETRY);
    PROG_TYPE_CASE(FRAGMENT, FRAGMENT);
    PROG_TYPE_CASE(COMPUTE, COMPUTE);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index e6c4d668b38..bacc80d001d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -74,11 +74,6 @@ struct nv50_ir_varying
 #define NV50_SEMANTIC_TESSCOORD     (TGSI_SEMANTIC_COUNT + 8)
 #define NV50_SEMANTIC_COUNT         (TGSI_SEMANTIC_COUNT + 10)
 
-#define NV50_TESS_PART_FRACT_ODD  0
-#define NV50_TESS_PART_FRACT_EVEN 1
-#define NV50_TESS_PART_POW2       2
-#define NV50_TESS_PART_INTEGER    3
-
 #define NV50_PRIM_PATCHES PIPE_PRIM_MAX
 
 struct nv50_ir_prog_symbol
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index dca30943385..2fcf41d6913 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -372,6 +372,8 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_SAMPLEPOS:  return nv50_ir::SV_SAMPLE_POS;
    case TGSI_SEMANTIC_SAMPLEMASK: return nv50_ir::SV_SAMPLE_MASK;
    case TGSI_SEMANTIC_INVOCATIONID: return nv50_ir::SV_INVOCATION_ID;
+   case TGSI_SEMANTIC_TESSCOORD:  return nv50_ir::SV_TESS_COORD;
+   case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index bfa6504770c..92f33ee279d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -195,8 +195,8 @@ nvc0_shader_stage(unsigned pipe)
 {
    switch (pipe) {
    case PIPE_SHADER_VERTEX: return 0;
-/* case PIPE_SHADER_TESSELLATION_CONTROL: return 1; */
-/* case PIPE_SHADER_TESSELLATION_EVALUATION: return 2; */
+   case PIPE_SHADER_TESS_CTRL: return 1;
+   case PIPE_SHADER_TESS_EVAL: return 2;
    case PIPE_SHADER_GEOMETRY: return 3;
    case PIPE_SHADER_FRAGMENT: return 4;
    case PIPE_SHADER_COMPUTE: return 5;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 2410b832335..dbedb35c374 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -34,7 +34,8 @@ static uint32_t
 nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
 {
    switch (sn) {
-   case NV50_SEMANTIC_TESSFACTOR:   return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
    case TGSI_SEMANTIC_PRIMID:       return 0x060;
    case TGSI_SEMANTIC_LAYER:        return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
@@ -48,7 +49,7 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
    case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
    case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
    case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
-   case NV50_SEMANTIC_TESSCOORD:    return 0x2f0;
+   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
@@ -63,7 +64,8 @@ static uint32_t
 nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
 {
    switch (sn) {
-   case NV50_SEMANTIC_TESSFACTOR:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
    case TGSI_SEMANTIC_PRIMID:        return 0x060;
    case TGSI_SEMANTIC_LAYER:         return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
@@ -277,7 +279,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
    return nvc0_vtgp_gen_header(vp, info);
 }
 
-#if defined(PIPE_SHADER_HULL) || defined(PIPE_SHADER_DOMAIN)
 static void
 nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
 {
@@ -305,14 +306,13 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 
    switch (info->prop.tp.partitioning) {
-   case PIPE_TESS_PART_INTEGER:
-   case PIPE_TESS_PART_POW2:
+   case PIPE_TESS_SPACING_EQUAL:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
       break;
-   case PIPE_TESS_PART_FRACT_ODD:
+   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
       break;
-   case PIPE_TESS_PART_FRACT_EVEN:
+   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
       break;
    default:
@@ -320,9 +320,7 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    }
 }
-#endif
 
-#ifdef PIPE_SHADER_HULL
 static int
 nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
 {
@@ -346,9 +344,7 @@ nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
 
    return 0;
 }
-#endif
 
-#ifdef PIPE_SHADER_DOMAIN
 static int
 nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
 {
@@ -365,7 +361,6 @@ nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
 
    return 0;
 }
-#endif
 
 static int
 nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
@@ -598,16 +593,12 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    case PIPE_SHADER_VERTEX:
       ret = nvc0_vp_gen_header(prog, info);
       break;
-#ifdef PIPE_SHADER_HULL
-   case PIPE_SHADER_HULL:
+   case PIPE_SHADER_TESS_CTRL:
       ret = nvc0_tcp_gen_header(prog, info);
       break;
-#endif
-#ifdef PIPE_SHADER_DOMAIN
-   case PIPE_SHADER_DOMAIN:
+   case PIPE_SHADER_TESS_EVAL:
       ret = nvc0_tep_gen_header(prog, info);
       break;
-#endif
    case PIPE_SHADER_GEOMETRY:
       ret = nvc0_gp_gen_header(prog, info);
       break;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 5f12281dd6c..b414caaa5c8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -228,10 +228,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 
    switch (shader) {
    case PIPE_SHADER_VERTEX:
-      /*
-   case PIPE_SHADER_TESSELLATION_CONTROL:
-   case PIPE_SHADER_TESSELLATION_EVALUATION:
-      */
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
       break;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 8b3da47c76c..c39939c20b0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -508,6 +508,14 @@ nvc0_bind_sampler_states(struct pipe_context *pipe, unsigned shader,
       assert(start == 0);
       nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s);
       break;
+   case PIPE_SHADER_TESS_CTRL:
+      assert(start == 0);
+      nvc0_stage_sampler_states_bind(nvc0_context(pipe), 1, nr, s);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      assert(start == 0);
+      nvc0_stage_sampler_states_bind(nvc0_context(pipe), 2, nr, s);
+      break;
    case PIPE_SHADER_GEOMETRY:
       assert(start == 0);
       nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s);
@@ -633,6 +641,12 @@ nvc0_set_sampler_views(struct pipe_context *pipe, unsigned shader,
    case PIPE_SHADER_VERTEX:
       nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views);
       break;
+   case PIPE_SHADER_TESS_CTRL:
+      nvc0_stage_set_sampler_views(nvc0_context(pipe), 1, nr, views);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      nvc0_stage_set_sampler_views(nvc0_context(pipe), 2, nr, views);
+      break;
    case PIPE_SHADER_GEOMETRY:
       nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views);
       break;
@@ -733,6 +747,38 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nvc0->dirty |= NVC0_NEW_GMTYPROG;
 }
 
+static void *
+nvc0_tcp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_CTRL);
+}
+
+static void
+nvc0_tcp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+    struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+    nvc0->tctlprog = hwcso;
+    nvc0->dirty |= NVC0_NEW_TCTLPROG;
+}
+
+static void *
+nvc0_tep_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_EVAL);
+}
+
+static void
+nvc0_tep_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+    struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+    nvc0->tevlprog = hwcso;
+    nvc0->dirty |= NVC0_NEW_TEVLPROG;
+}
+
 static void *
 nvc0_cp_state_create(struct pipe_context *pipe,
                      const struct pipe_compute_state *cso)
@@ -1220,12 +1266,18 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->create_vs_state = nvc0_vp_state_create;
    pipe->create_fs_state = nvc0_fp_state_create;
    pipe->create_gs_state = nvc0_gp_state_create;
+   pipe->create_tcs_state = nvc0_tcp_state_create;
+   pipe->create_tes_state = nvc0_tep_state_create;
    pipe->bind_vs_state = nvc0_vp_state_bind;
    pipe->bind_fs_state = nvc0_fp_state_bind;
    pipe->bind_gs_state = nvc0_gp_state_bind;
+   pipe->bind_tcs_state = nvc0_tcp_state_bind;
+   pipe->bind_tes_state = nvc0_tep_state_bind;
    pipe->delete_vs_state = nvc0_sp_state_delete;
    pipe->delete_fs_state = nvc0_sp_state_delete;
    pipe->delete_gs_state = nvc0_sp_state_delete;
+   pipe->delete_tcs_state = nvc0_sp_state_delete;
+   pipe->delete_tes_state = nvc0_sp_state_delete;
 
    pipe->create_compute_state = nvc0_cp_state_create;
    pipe->bind_compute_state = nvc0_cp_state_bind;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index dc790f4d4c2..d19082e0e15 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -356,16 +356,14 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
 
 void nvc0_validate_textures(struct nvc0_context *nvc0)
 {
-   bool need_flush;
+   bool need_flush = false;
+   int i;
 
-   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
-      need_flush  = nve4_validate_tic(nvc0, 0);
-      need_flush |= nve4_validate_tic(nvc0, 3);
-      need_flush |= nve4_validate_tic(nvc0, 4);
-   } else {
-      need_flush  = nvc0_validate_tic(nvc0, 0);
-      need_flush |= nvc0_validate_tic(nvc0, 3);
-      need_flush |= nvc0_validate_tic(nvc0, 4);
+   for (i = 0; i < 5; i++) {
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+         need_flush |= nve4_validate_tic(nvc0, i);
+      else
+         need_flush |= nvc0_validate_tic(nvc0, i);
    }
 
    if (need_flush) {
@@ -466,16 +464,14 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 
 void nvc0_validate_samplers(struct nvc0_context *nvc0)
 {
-   bool need_flush;
+   bool need_flush = false;
+   int i;
 
-   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
-      need_flush  = nve4_validate_tsc(nvc0, 0);
-      need_flush |= nve4_validate_tsc(nvc0, 3);
-      need_flush |= nve4_validate_tsc(nvc0, 4);
-   } else {
-      need_flush  = nvc0_validate_tsc(nvc0, 0);
-      need_flush |= nvc0_validate_tsc(nvc0, 3);
-      need_flush |= nvc0_validate_tsc(nvc0, 4);
+   for (i = 0; i < 5; i++) {
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+         need_flush |= nve4_validate_tsc(nvc0, i);
+      else
+         need_flush |= nvc0_validate_tsc(nvc0, i);
    }
 
    if (need_flush) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index d8c19f111c1..a65fbab5b60 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -547,8 +547,7 @@ nvc0_prim_gl(unsigned prim)
    NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
-   /*
-   NVC0_PRIM_GL_CASE(PATCHES); */
+   NVC0_PRIM_GL_CASE(PATCHES);
    default:
       return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index 685ada9c61b..8b23a4887da 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -427,8 +427,7 @@ nvc0_prim_gl(unsigned prim)
    NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
-   /*
-   NVC0_PRIM_GL_CASE(PATCHES); */
+   NVC0_PRIM_GL_CASE(PATCHES);
    default:
       return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
    }

From b9ea557fd04da9eb199388c14d64862d18118de3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 21 Jun 2015 13:35:53 -0400
Subject: [PATCH 0596/1208] nvc0: support MAX_SHADER_PATCH_VARYINGS

---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index b414caaa5c8..eafcbecd090 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -120,6 +120,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50;
    case PIPE_CAP_ENDIANNESS:
       return PIPE_ENDIAN_LITTLE;
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+      return 30;
 
    /* supported caps */
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -195,7 +197,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:

From d1ffdebce6d03497fa6c2e4c8eb754e9075e29f4 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 May 2015 00:40:20 -0400
Subject: [PATCH 0597/1208] nvc0: add support for setting patch vertices at
 draw time

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c       | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.h       | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c | 3 ---
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c          | 6 ++++++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index eafcbecd090..5c382b8bdae 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -1021,6 +1021,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATA (push, 0x20);
    BEGIN_NVC0(push, NVC0_3D(SP_SELECT(0)), 1);
    PUSH_DATA (push, 0x00);
+   screen->save_state.patch_vertices = 3;
 
    BEGIN_NVC0(push, NVC0_3D(POINT_COORD_REPLACE), 1);
    PUSH_DATA (push, 0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 8ecfe66d92f..d8826ae0c0d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -38,6 +38,7 @@ struct nvc0_graph_state {
    uint32_t constant_elts;
    int32_t index_bias;
    uint16_t scissor;
+   uint8_t patch_vertices;
    uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
    uint8_t num_vtxbufs;
    uint8_t num_vtxelts;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index d7eeb56d2ca..8aa127adc0a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -147,9 +147,6 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
       PUSH_DATA (push, tp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
       PUSH_DATA (push, tp->num_gprs);
-
-      if (tp->tp.input_patch_size <= 32)
-         IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size);
    } else {
       BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
       PUSH_DATA (push, 0x20);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index a65fbab5b60..6f9e7906713 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -888,6 +888,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       }
    }
 
+   if (info->mode == PIPE_PRIM_PATCHES &&
+       nvc0->state.patch_vertices != info->vertices_per_patch) {
+      nvc0->state.patch_vertices = info->vertices_per_patch;
+      IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices);
+   }
+
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 

From c8e5337a9a240befcc953695c8822b0749c7a042 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Jul 2014 15:50:43 -0400
Subject: [PATCH 0598/1208] nvc0: add handling for set_tess_state callback

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nvc0_context.h       |  4 ++++
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 19 +++++++++++++++++++
 .../nouveau/nvc0/nvc0_state_validate.c        | 11 +++++++++++
 3 files changed, 34 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 92f33ee279d..f4499423a10 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -54,6 +54,7 @@
 #define NVC0_NEW_IDXBUF       (1 << 22)
 #define NVC0_NEW_SURFACES     (1 << 23)
 #define NVC0_NEW_MIN_SAMPLES  (1 << 24)
+#define NVC0_NEW_TESSFACTOR   (1 << 25)
 
 #define NVC0_NEW_CP_PROGRAM   (1 << 0)
 #define NVC0_NEW_CP_SURFACES  (1 << 1)
@@ -164,6 +165,9 @@ struct nvc0_context {
    unsigned sample_mask;
    unsigned min_samples;
 
+   float default_tess_outer[4];
+   float default_tess_inner[2];
+
    bool vbo_push_hint;
 
    uint8_t tfbbuf_dirty;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index c39939c20b0..693b14fa15f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -979,6 +979,18 @@ nvc0_set_viewport_states(struct pipe_context *pipe,
 
 }
 
+static void
+nvc0_set_tess_state(struct pipe_context *pipe,
+                    const float default_tess_outer[4],
+                    const float default_tess_inner[2])
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   memcpy(nvc0->default_tess_outer, default_tess_outer, 4 * sizeof(float));
+   memcpy(nvc0->default_tess_inner, default_tess_inner, 2 * sizeof(float));
+   nvc0->dirty |= NVC0_NEW_TESSFACTOR;
+}
+
 static void
 nvc0_set_vertex_buffers(struct pipe_context *pipe,
                         unsigned start_slot, unsigned count,
@@ -1293,6 +1305,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->set_polygon_stipple = nvc0_set_polygon_stipple;
    pipe->set_scissor_states = nvc0_set_scissor_states;
    pipe->set_viewport_states = nvc0_set_viewport_states;
+   pipe->set_tess_state = nvc0_set_tess_state;
 
    pipe->create_vertex_elements_state = nvc0_vertex_state_create;
    pipe->delete_vertex_elements_state = nvc0_vertex_state_delete;
@@ -1311,4 +1324,10 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
 
    nvc0->sample_mask = ~0;
    nvc0->min_samples = 1;
+   nvc0->default_tess_outer[0] =
+   nvc0->default_tess_outer[1] =
+   nvc0->default_tess_outer[2] =
+   nvc0->default_tess_outer[3] = 1.0;
+   nvc0->default_tess_inner[0] =
+   nvc0->default_tess_inner[1] = 1.0;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 82004e7ca31..945e2a6c9bd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -554,6 +554,16 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0)
    }
 }
 
+static void
+nvc0_validate_tess_state(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   BEGIN_NVC0(push, NVC0_3D(TESS_LEVEL_OUTER(0)), 6);
+   PUSH_DATAp(push, nvc0->default_tess_outer, 4);
+   PUSH_DATAp(push, nvc0->default_tess_inner, 2);
+}
+
 static void
 nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
 {
@@ -612,6 +622,7 @@ static struct state_validate {
     { nvc0_vertprog_validate,      NVC0_NEW_VERTPROG },
     { nvc0_tctlprog_validate,      NVC0_NEW_TCTLPROG },
     { nvc0_tevlprog_validate,      NVC0_NEW_TEVLPROG },
+    { nvc0_validate_tess_state,    NVC0_NEW_TESSFACTOR },
     { nvc0_gmtyprog_validate,      NVC0_NEW_GMTYPROG },
     { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |

From 4b2a58a523715c28c96267286054baf511e15303 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 12 Jul 2014 22:08:44 -0400
Subject: [PATCH 0599/1208] nvc0: TESSCOORD comes in as a sysval, not an input

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  2 --
 .../drivers/nouveau/nvc0/nvc0_program.c       | 19 ++++++++++---------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index bacc80d001d..cd00ddc1c86 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -70,8 +70,6 @@ struct nv50_ir_varying
 #endif
 
 #define NV50_SEMANTIC_CLIPDISTANCE  (TGSI_SEMANTIC_COUNT + 0)
-#define NV50_SEMANTIC_TESSFACTOR    (TGSI_SEMANTIC_COUNT + 7)
-#define NV50_SEMANTIC_TESSCOORD     (TGSI_SEMANTIC_COUNT + 8)
 #define NV50_SEMANTIC_COUNT         (TGSI_SEMANTIC_COUNT + 10)
 
 #define NV50_PRIM_PATCHES PIPE_PRIM_MAX
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index dbedb35c374..014e6815dbc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -123,9 +123,6 @@ nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
       if (info->in[i].patch && offset >= 0x20)
          offset = 0x20 + info->in[i].si * 0x10;
 
-      if (info->in[i].sn == NV50_SEMANTIC_TESSCOORD)
-         info->in[i].mask &= 3;
-
       for (c = 0; c < 4; ++c)
          info->in[i].slot[c] = (offset + c * 0x4) / 4;
    }
@@ -218,12 +215,8 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
          continue;
       for (c = 0; c < 4; ++c) {
          a = info->in[i].slot[c];
-         if (info->in[i].mask & (1 << c)) {
-            if (info->in[i].sn != NV50_SEMANTIC_TESSCOORD)
-               vp->hdr[5 + a / 32] |= 1 << (a % 32);
-            else
-               nvc0_vtgp_hdr_update_oread(vp, info->in[i].slot[c]);
-         }
+         if (info->in[i].mask & (1 << c))
+            vp->hdr[5 + a / 32] |= 1 << (a % 32);
       }
    }
 
@@ -252,6 +245,14 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
       case TGSI_SEMANTIC_VERTEXID:
          vp->hdr[10] |= 1 << 31;
          break;
+      case TGSI_SEMANTIC_TESSCOORD:
+         /* We don't have the mask, nor the slots populated. While this could
+          * be achieved, the vast majority of the time if either of the coords
+          * are read, then both will be read.
+          */
+         nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
+         nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
+         break;
       default:
          break;
       }

From 59438a4d0e8ffd7cc4c741d00eff0c87d9813b5f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Jul 2014 12:17:46 -0400
Subject: [PATCH 0600/1208] nvc0/ir: mark varyings as per-patch based on
 semantic name

Also add proper handling for PATCH semantics

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 14 ++++++++++++++
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c    |  6 ++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 2fcf41d6913..2b7e9ebbcca 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1036,6 +1036,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                if (decl->Interp.Location || info->io.sampleInterp)
                   info->in[i].centroid = 1;
             }
+
+            if (sn == TGSI_SEMANTIC_PATCH ||
+                sn == TGSI_SEMANTIC_TESSOUTER ||
+                sn == TGSI_SEMANTIC_TESSINNER)
+               info->in[i].patch = 1;
+            if (sn == TGSI_SEMANTIC_PATCH)
+               info->numPatchConstants = MAX2(info->numPatchConstants, si + 1);
          }
       }
       break;
@@ -1070,6 +1077,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          case TGSI_SEMANTIC_VIEWPORT_INDEX:
             info->io.viewportId = i;
             break;
+         case TGSI_SEMANTIC_PATCH:
+            info->numPatchConstants = MAX2(info->numPatchConstants, si + 1);
+            /* fallthrough */
+         case TGSI_SEMANTIC_TESSOUTER:
+         case TGSI_SEMANTIC_TESSINNER:
+            info->out[i].patch = 1;
+            break;
          default:
             break;
          }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 014e6815dbc..80edfb19e6e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -36,6 +36,7 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
    switch (sn) {
    case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
    case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
    case TGSI_SEMANTIC_PRIMID:       return 0x060;
    case TGSI_SEMANTIC_LAYER:        return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
@@ -66,6 +67,7 @@ nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
    switch (sn) {
    case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
    case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
    case TGSI_SEMANTIC_PRIMID:        return 0x060;
    case TGSI_SEMANTIC_LAYER:         return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
@@ -120,8 +122,6 @@ nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
    for (i = 0; i < info->numInputs; ++i) {
       offset = nvc0_shader_input_address(info->in[i].sn,
                                          info->in[i].si, ubase);
-      if (info->in[i].patch && offset >= 0x20)
-         offset = 0x20 + info->in[i].si * 0x10;
 
       for (c = 0; c < 4; ++c)
          info->in[i].slot[c] = (offset + c * 0x4) / 4;
@@ -163,8 +163,6 @@ nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
    for (i = 0; i < info->numOutputs; ++i) {
       offset = nvc0_shader_output_address(info->out[i].sn,
                                           info->out[i].si, ubase);
-      if (info->out[i].patch && offset >= 0x20)
-         offset = 0x20 + info->out[i].si * 0x10;
 
       for (c = 0; c < 4; ++c)
          info->out[i].slot[c] = (offset + c * 0x4) / 4;

From c2350fb3dbb1b8d348125e22758da266c15bc198 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Jul 2014 13:12:38 -0400
Subject: [PATCH 0601/1208] nvc0/ir: populate info structure based on new tess
 properties

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp      | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 2b7e9ebbcca..c5cefc271cc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -948,6 +948,24 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
    case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
       info->io.genUserClip = -1;
       break;
+   case TGSI_PROPERTY_TCS_VERTICES_OUT:
+      info->prop.tp.outputPatchSize = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_PRIM_MODE:
+      info->prop.tp.domain = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_SPACING:
+      info->prop.tp.partitioning = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_VERTEX_ORDER_CW:
+      info->prop.tp.winding = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_POINT_MODE:
+      if (prop->u[0].Data)
+         info->prop.tp.outputPrim = PIPE_PRIM_POINTS;
+      else
+         info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */
+      break;
    default:
       INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
       break;

From 71744c069264d32e6eb9d095350300b42633a1f8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Jul 2014 13:36:37 -0400
Subject: [PATCH 0602/1208] nvc0/ir: set perPatch flag on load/stores to
 per-patch varyings

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index c5cefc271cc..f657e8d6a7b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1575,6 +1575,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
    const int idx2d = src.is2D() ? src.getIndex(1) : 0;
    const int idx = src.getIndex(0);
    const int swz = src.getSwizzle(c);
+   Instruction *ld;
 
    switch (src.getFile()) {
    case TGSI_FILE_IMMEDIATE:
@@ -1602,7 +1603,9 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          if (ptr)
             return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
       }
-      return mkLoadv(TYPE_U32, srcToSym(src, c), shiftAddress(ptr));
+      ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr));
+      ld->perPatch = info->in[idx].patch;
+      return ld->getDef(0);
    case TGSI_FILE_OUTPUT:
       assert(!"load from output file");
       return NULL;
@@ -1678,7 +1681,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
              viewport != NULL)
             mkOp1(OP_MOV, TYPE_U32, viewport, val);
          else
-            mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val);
+            mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val)->perPatch =
+               info->out[idx].patch;
       }
    } else
    if (f == TGSI_FILE_TEMPORARY ||

From e3e2df01bf855f3b435e03224a762649081c6558 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Jul 2014 16:23:16 -0400
Subject: [PATCH 0603/1208] nvc0/ir: add support for reading outputs in tess
 control shaders

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp       | 17 +++++++++++++++--
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp   |  3 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index f657e8d6a7b..8dc7899e6c4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1203,6 +1203,16 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
+      } else
+      if (src.getFile() == TGSI_FILE_OUTPUT) {
+         if (src.isIndirect(0)) {
+            // We don't know which one is accessed, just mark everything for
+            // reading. This is an extremely unlikely occurrence.
+            for (unsigned i = 0; i < info->numOutputs; ++i)
+               info->out[i].oread = 1;
+         } else {
+            info->out[src.getIndex(0)].oread = 1;
+         }
       }
       if (src.getFile() != TGSI_FILE_INPUT)
          continue;
@@ -1521,6 +1531,7 @@ Converter::fetchSrc(int s, int c)
 
    if (src.is2D()) {
       switch (src.getFile()) {
+      case TGSI_FILE_OUTPUT:
       case TGSI_FILE_INPUT:
          dimRel = getVertexBase(s);
          break;
@@ -1607,8 +1618,10 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
       ld->perPatch = info->in[idx].patch;
       return ld->getDef(0);
    case TGSI_FILE_OUTPUT:
-      assert(!"load from output file");
-      return NULL;
+      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+      ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr));
+      ld->perPatch = info->out[idx].patch;
+      return ld->getDef(0);
    case TGSI_FILE_SYSTEM_VALUE:
       assert(!ptr);
       return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e71fa113d99..56dbb5134d1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1815,6 +1815,9 @@ NVC0LoweringPass::visit(Instruction *i)
             i->setIndirect(0, 0, ptr);
             i->subOp = NV50_IR_SUBOP_LDC_IS;
          }
+      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+         i->op = OP_VFETCH;
       }
       break;
    case OP_ATOM:

From 7cf2bffe8254de6808202d866598ec4c9afe1a51 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 21 Feb 2015 03:12:54 -0500
Subject: [PATCH 0604/1208] nvc0/ir: patch vertex count is stored in the upper
 bits

---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 56dbb5134d1..85dd05d5267 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1525,6 +1525,10 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
          i->op = OP_MOV;
          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
       }
+      if (sv == SV_VERTEX_COUNT) {
+         bld.setPosition(i, true);
+         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
+      }
       return true;
    }
 

From 77672cdb64e9c19e974fe5985050709fc317498e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 02:27:04 -0400
Subject: [PATCH 0605/1208] nvc0/ir: add hazard for 2nd dim of vfetch/load
 indirect argument

Apparently a multi-word load can potentially overwrite the indirect
sources, so make sure that RA picks different registers for those.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 898653c9953..78bc97f4397 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -2066,6 +2066,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
          condenseDefs(i);
          if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
             addHazard(i, i->src(0).getIndirect(0));
+         if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8)
+            addHazard(i, i->src(0).getIndirect(1));
       } else
       if (i->op == OP_UNION ||
           i->op == OP_MERGE ||

From da89e75d9c6399c8fb0286460c91a77778c0eec9 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 30 Apr 2015 02:00:20 -0400
Subject: [PATCH 0606/1208] nvc0/ir: allow tess eval output loads to be CSE'd

These only happen for gl_TessCoord which are constant.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index ad9bf6f4aa9..e9648aa7b91 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2518,6 +2518,8 @@ Instruction::isResultEqual(const Instruction *that) const
       case FILE_MEMORY_CONST:
       case FILE_SHADER_INPUT:
          return true;
+      case FILE_SHADER_OUTPUT:
+         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
       default:
          return false;
       }

From fd092328e1e05fe4a3fc82a2e79bdba884bc798d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 May 2015 00:45:12 -0400
Subject: [PATCH 0607/1208] nvc0/ir: cleanup private enums that have graduated
 to gallium

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h | 5 -----
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c      | 2 --
 2 files changed, 7 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index cd00ddc1c86..2b9edcf9172 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -69,11 +69,6 @@ struct nv50_ir_varying
 # define NV50_IR_DEBUG_REG_ALLOC 0
 #endif
 
-#define NV50_SEMANTIC_CLIPDISTANCE  (TGSI_SEMANTIC_COUNT + 0)
-#define NV50_SEMANTIC_COUNT         (TGSI_SEMANTIC_COUNT + 10)
-
-#define NV50_PRIM_PATCHES PIPE_PRIM_MAX
-
 struct nv50_ir_prog_symbol
 {
    uint32_t label;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 80edfb19e6e..b0b8486b6df 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -46,7 +46,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
    case TGSI_SEMANTIC_FOG:          return 0x2e8;
    case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
-   case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4;
    case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
    case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
    case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
@@ -77,7 +76,6 @@ nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
    case TGSI_SEMANTIC_FOG:           return 0x2e8;
    case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
-   case NV50_SEMANTIC_CLIPDISTANCE:  return 0x2c0 + si * 0x4;
    case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
    case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
    case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;

From 88818c4cd6de9d8855a9ba3c3a85306d42f5e9d3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 22 Jul 2015 20:34:30 -0400
Subject: [PATCH 0608/1208] gk110/ir: fake BAR support

Makes things sorta work until we figure out the real way to do this.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp   | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index ab8bf2e5504..44d3a5efecf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -120,6 +120,8 @@ private:
 
    void emitPIXLD(const Instruction *);
 
+   void emitBAR(const Instruction *);
+
    void emitFlow(const Instruction *);
 
    inline void defId(const ValueDef&, const int pos);
@@ -1249,6 +1251,13 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i)
    code[1] |= 0x00070000;
 }
 
+void
+CodeEmitterGK110::emitBAR(const Instruction *i)
+{
+   /* TODO */
+   emitNOP(i);
+}
+
 void
 CodeEmitterGK110::emitFlow(const Instruction *i)
 {
@@ -1856,6 +1865,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
       emitNOP(insn);
       insn->join = 1;
       break;
+   case OP_BAR:
+      emitBAR(insn);
+      break;
    case OP_PHI:
    case OP_UNION:
    case OP_CONSTRAINT:

From 6d8e466792c284e79125bab33fcfb0872d0df2c3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 03:39:13 -0400
Subject: [PATCH 0609/1208] docs: mark off tess for nvc0

---
 docs/GL3.txt              | 2 +-
 docs/relnotes/10.7.0.html | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 2cf5c316468..5200737bae7 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -112,7 +112,7 @@ GL 4.0, GLSL 4.00:
   GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_shader_subroutine                             DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (radeonsi)
+  GL_ARB_tessellation_shader                           DONE (nvc0, radeonsi)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 1464dc4f092..a2e7c02e7b2 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -51,7 +51,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_shader_subroutine on core profile gallium drivers</li>
-<li>GL_ARB_tessellation_shader on radeonsi</li>
+<li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>

From 9f7a68feafc86a51a7c5165672b29cb7182da738 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 20 Jul 2015 06:49:05 -0700
Subject: [PATCH 0610/1208] gallivm: Don't use raw_debug_ostream for
 dissasembling

All LLVM API calls that require an ostream object have been removed from
the disassemble() function, so we don't need to use this class to wrap
_debug_printf() we can just call this function directly.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 .../auxiliary/gallivm/lp_bld_debug.cpp        | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 405e6486f7a..ec88f330878 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -123,7 +123,7 @@ lp_debug_dump_value(LLVMValueRef value)
  * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
  */
 static size_t
-disassemble(const void* func, llvm::raw_ostream & Out)
+disassemble(const void* func)
 {
    const uint8_t *bytes = (const uint8_t *)func;
 
@@ -141,7 +141,8 @@ disassemble(const void* func, llvm::raw_ostream & Out)
    char outline[1024];
 
    if (!D) {
-      Out << "error: couldn't create disassembler for triple " << Triple << "\n";
+      _debug_printf("error: couldn't create disassembler for triple %s\n",
+                    Triple.c_str());
       return 0;
    }
 
@@ -155,13 +156,13 @@ disassemble(const void* func, llvm::raw_ostream & Out)
        * so that between runs.
        */
 
-      Out << llvm::format("%6lu:\t", (unsigned long)pc);
+      _debug_printf("%6lu:\t", (unsigned long)pc);
 
       Size = LLVMDisasmInstruction(D, (uint8_t *)bytes + pc, extent - pc, 0, outline,
                                    sizeof outline);
 
       if (!Size) {
-         Out << "invalid\n";
+         _debug_printf("invalid\n");
          pc += 1;
          break;
       }
@@ -173,10 +174,10 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       if (0) {
          unsigned i;
          for (i = 0; i < Size; ++i) {
-            Out << llvm::format("%02x ", bytes[pc + i]);
+            _debug_printf("%02x ", bytes[pc + i]);
          }
          for (; i < 16; ++i) {
-            Out << "   ";
+            _debug_printf("   ");
          }
       }
 
@@ -184,9 +185,9 @@ disassemble(const void* func, llvm::raw_ostream & Out)
        * Print the instruction.
        */
 
-      Out << outline;
+      _debug_printf("%*s", Size, outline);
 
-      Out << "\n";
+      _debug_printf("\n");
 
       /*
        * Stop disassembling on return statements, if there is no record of a
@@ -206,13 +207,12 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       pc += Size;
 
       if (pc >= extent) {
-         Out << "disassembly larger than " << extent << "bytes, aborting\n";
+         _debug_printf("disassembly larger than %ull bytes, aborting\n", extent);
          break;
       }
    }
 
-   Out << "\n";
-   Out.flush();
+   _debug_printf("\n");
 
    LLVMDisasmDispose(D);
 
@@ -229,9 +229,8 @@ disassemble(const void* func, llvm::raw_ostream & Out)
 
 extern "C" void
 lp_disassemble(LLVMValueRef func, const void *code) {
-   raw_debug_ostream Out;
-   Out << LLVMGetValueName(func) << ":\n";
-   disassemble(code, Out);
+   _debug_printf("%s:\n", LLVMGetValueName(func));
+   disassemble(code);
 }
 
 

From a3b53beaa0351cf1322c6e1a580dc7cc3d0cad0c Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 20 Jul 2015 11:24:13 -0400
Subject: [PATCH 0611/1208] gallivm: Add ifdefs so raw_debug_stream is only
 defined when used

Its only use is to implement a custom version of LLVMDumpValue
on some Windows and embedded platforms.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_debug.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index ec88f330878..0a5c2ccdae3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -61,6 +61,7 @@ lp_check_alignment(const void *ptr, unsigned alignment)
    return ((uintptr_t)ptr & (alignment - 1)) == 0;
 }
 
+#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBEDDED)
 
 class raw_debug_ostream :
    public llvm::raw_ostream
@@ -91,6 +92,7 @@ raw_debug_ostream::write_impl(const char *Ptr, size_t Size)
    }
 }
 
+#endif
 
 extern "C" const char *
 lp_get_module_id(LLVMModuleRef module)

From b469cf10efd4734038dcab294f23ca38e9fc7a97 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Thu, 23 Jul 2015 16:25:21 +0200
Subject: [PATCH 0612/1208] mesa: Fix error in target validation of
 glCompressedTex(ture)SubImage3D() calls

Basically, two different target error checks are chained consecutively, and the
second one is executed regardless the result of the first one. This produces an
incorrect error if the first check fails but is overrided by the second.

This patch conditions the execution of the second check to a successful pass of
the first one.

Fixes 1 dEQP test:
* dEQP-GLES3.functional.negative_api.texture.compressedtexsubimage3d

Reviewed-by: Laura Ekstrand <laura@jlekstrand.net>
---
 src/mesa/main/teximage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 3b309abbe14..cd451138b1c 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4586,7 +4586,8 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
        * are valid here, which they are not, but of course not mentioned by
        * core spec.
        */
-      if (target != GL_TEXTURE_2D_ARRAY && target != GL_TEXTURE_CUBE_MAP_ARRAY) {
+      if (targetOK && target != GL_TEXTURE_2D_ARRAY &&
+          target != GL_TEXTURE_CUBE_MAP_ARRAY) {
          bool invalidformat;
          switch (format) {
             /* These came from _mesa_is_compressed_format in glformats.c. */

From f3728a16c9c6a02fc1f44b8069b0060e2358f22e Mon Sep 17 00:00:00 2001
From: Tom Hughes <tom@compton.nu>
Date: Tue, 2 Jun 2015 13:40:37 +0100
Subject: [PATCH 0613/1208] Match swrast modes more loosely.

https://bugs.freedesktop.org/show_bug.cgi?id=90817

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/glx/dri_common.c | 59 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/glx/dri_common.c b/src/glx/dri_common.c
index 63c8de38c7c..1a62ee29e7b 100644
--- a/src/glx/dri_common.c
+++ b/src/glx/dri_common.c
@@ -265,6 +265,36 @@ scalarEqual(struct glx_config *mode, unsigned int attrib, unsigned int value)
    return GL_TRUE;              /* Is a non-existing attribute equal to value? */
 }
 
+static int
+scalarGreaterEqual(struct glx_config *mode, unsigned int attrib, unsigned int value)
+{
+   unsigned int glxValue;
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(attribMap); i++)
+      if (attribMap[i].attrib == attrib) {
+         glxValue = *(unsigned int *) ((char *) mode + attribMap[i].offset);
+         return glxValue == GLX_DONT_CARE || glxValue >= value;
+      }
+
+   return GL_TRUE;              /* Is a non-existing attribute greater than or equal to value? */
+}
+
+static int
+booleanSupported(struct glx_config *mode, unsigned int attrib, unsigned int value)
+{
+   unsigned int glxValue;
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(attribMap); i++)
+      if (attribMap[i].attrib == attrib) {
+         glxValue = *(unsigned int *) ((char *) mode + attribMap[i].offset);
+         return glxValue == GLX_DONT_CARE || glxValue;
+      }
+
+   return GL_TRUE;              /* Is a non-existing attribute supported? */
+}
+
 static int
 driConfigEqual(const __DRIcoreExtension *core,
                struct glx_config *config, const __DRIconfig *driConfig)
@@ -313,10 +343,37 @@ driConfigEqual(const __DRIcoreExtension *core,
          if (value & __DRI_ATTRIB_TEXTURE_RECTANGLE_BIT)
             glxValue |= GLX_TEXTURE_RECTANGLE_BIT_EXT;
          if (config->bindToTextureTargets != GLX_DONT_CARE &&
-             glxValue != config->bindToTextureTargets)
+             glxValue != (config->bindToTextureTargets & glxValue))
             return GL_FALSE;
          break;
 
+      case __DRI_ATTRIB_STENCIL_SIZE:
+      case __DRI_ATTRIB_ACCUM_RED_SIZE:
+      case __DRI_ATTRIB_ACCUM_GREEN_SIZE:
+      case __DRI_ATTRIB_ACCUM_BLUE_SIZE:
+      case __DRI_ATTRIB_ACCUM_ALPHA_SIZE:
+         if (value != 0 && !scalarEqual(config, attrib, value))
+            return GL_FALSE;
+         break;
+
+      case __DRI_ATTRIB_DOUBLE_BUFFER:
+      case __DRI_ATTRIB_BIND_TO_TEXTURE_RGB:
+      case __DRI_ATTRIB_BIND_TO_TEXTURE_RGBA:
+      case __DRI_ATTRIB_BIND_TO_MIPMAP_TEXTURE:
+      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
+          if (value && !booleanSupported(config, attrib, value))
+            return GL_FALSE;
+          break;
+
+      case __DRI_ATTRIB_SAMPLE_BUFFERS:
+      case __DRI_ATTRIB_SAMPLES:
+      case __DRI_ATTRIB_AUX_BUFFERS:
+      case __DRI_ATTRIB_MAX_PBUFFER_WIDTH:
+      case __DRI_ATTRIB_MAX_PBUFFER_HEIGHT:
+      case __DRI_ATTRIB_MAX_PBUFFER_PIXELS:
+         if (!scalarGreaterEqual(config, attrib, value))
+            return GL_FALSE;
+
       default:
          if (!scalarEqual(config, attrib, value))
             return GL_FALSE;

From c6267ebd6c8a73d51a0c82d0f516177c70e05c81 Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Wed, 22 Jul 2015 13:21:24 +0100
Subject: [PATCH 0614/1208] gallium/util: Stop bundling our snprintf
 implementation.

Use MSVCRT functions instead.  Their semantics are slightly
different but they can be made to work as expected.

Also, use the same code paths for both MSVCRT and MinGW.

https://bugs.freedesktop.org/show_bug.cgi?id=91418

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/Makefile.sources  |    1 -
 src/gallium/auxiliary/util/u_snprintf.c | 1480 -----------------------
 src/gallium/auxiliary/util/u_string.h   |   35 +-
 3 files changed, 31 insertions(+), 1485 deletions(-)
 delete mode 100644 src/gallium/auxiliary/util/u_snprintf.c

diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 62e6b94cab8..3616d885b47 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -274,7 +274,6 @@ C_SOURCES := \
 	util/u_simple_shaders.h \
 	util/u_slab.c \
 	util/u_slab.h \
-	util/u_snprintf.c \
 	util/u_split_prim.h \
 	util/u_sse.h \
 	util/u_staging.c \
diff --git a/src/gallium/auxiliary/util/u_snprintf.c b/src/gallium/auxiliary/util/u_snprintf.c
deleted file mode 100644
index 39e9b70d0f8..00000000000
--- a/src/gallium/auxiliary/util/u_snprintf.c
+++ /dev/null
@@ -1,1480 +0,0 @@
-/*
- * Copyright (c) 1995 Patrick Powell.
- *
- * This code is based on code written by Patrick Powell <papowell@astart.com>.
- * It may be used for any purpose as long as this notice remains intact on all
- * source code distributions.
- */
-
-/*
- * Copyright (c) 2008 Holger Weiss.
- *
- * This version of the code is maintained by Holger Weiss <holger@jhweiss.de>.
- * My changes to the code may freely be used, modified and/or redistributed for
- * any purpose.  It would be nice if additions and fixes to this file (including
- * trivial code cleanups) would be sent back in order to let me include them in
- * the version available at <http://www.jhweiss.de/software/snprintf.html>.
- * However, this is not a requirement for using or redistributing (possibly
- * modified) versions of this file, nor is leaving this notice intact mandatory.
- */
-
-/*
- * History
- *
- * 2008-01-20 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.1:
- *
- * 	Fixed the detection of infinite floating point values on IRIX (and
- * 	possibly other systems) and applied another few minor cleanups.
- *
- * 2008-01-06 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.0:
- *
- * 	Added a lot of new features, fixed many bugs, and incorporated various
- * 	improvements done by Andrew Tridgell <tridge@samba.org>, Russ Allbery
- * 	<rra@stanford.edu>, Hrvoje Niksic <hniksic@xemacs.org>, Damien Miller
- * 	<djm@mindrot.org>, and others for the Samba, INN, Wget, and OpenSSH
- * 	projects.  The additions include: support the "e", "E", "g", "G", and
- * 	"F" conversion specifiers (and use conversion style "f" or "F" for the
- * 	still unsupported "a" and "A" specifiers); support the "hh", "ll", "j",
- * 	"t", and "z" length modifiers; support the "#" flag and the (non-C99)
- * 	"'" flag; use localeconv(3) (if available) to get both the current
- * 	locale's decimal point character and the separator between groups of
- * 	digits; fix the handling of various corner cases of field width and
- * 	precision specifications; fix various floating point conversion bugs;
- * 	handle infinite and NaN floating point values; don't attempt to write to
- * 	the output buffer (which may be NULL) if a size of zero was specified;
- * 	check for integer overflow of the field width, precision, and return
- * 	values and during the floating point conversion; use the OUTCHAR() macro
- * 	instead of a function for better performance; provide asprintf(3) and
- * 	vasprintf(3) functions; add new test cases.  The replacement functions
- * 	have been renamed to use an "rpl_" prefix, the function calls in the
- * 	main project (and in this file) must be redefined accordingly for each
- * 	replacement function which is needed (by using Autoconf or other means).
- * 	Various other minor improvements have been applied and the coding style
- * 	was cleaned up for consistency.
- *
- * 2007-07-23 Holger Weiss <holger@jhweiss.de> for Mutt 1.5.13:
- *
- * 	C99 compliant snprintf(3) and vsnprintf(3) functions return the number
- * 	of characters that would have been written to a sufficiently sized
- * 	buffer (excluding the '\0').  The original code simply returned the
- * 	length of the resulting output string, so that's been fixed.
- *
- * 1998-03-05 Michael Elkins <me@mutt.org> for Mutt 0.90.8:
- *
- * 	The original code assumed that both snprintf(3) and vsnprintf(3) were
- * 	missing.  Some systems only have snprintf(3) but not vsnprintf(3), so
- * 	the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF.
- *
- * 1998-01-27 Thomas Roessler <roessler@does-not-exist.org> for Mutt 0.89i:
- *
- * 	The PGP code was using unsigned hexadecimal formats.  Unfortunately,
- * 	unsigned formats simply didn't work.
- *
- * 1997-10-22 Brandon Long <blong@fiction.net> for Mutt 0.87.1:
- *
- * 	Ok, added some minimal floating point support, which means this probably
- * 	requires libm on most operating systems.  Don't yet support the exponent
- * 	(e,E) and sigfig (g,G).  Also, fmtint() was pretty badly broken, it just
- * 	wasn't being exercised in ways which showed it, so that's been fixed.
- * 	Also, formatted the code to Mutt conventions, and removed dead code left
- * 	over from the original.  Also, there is now a builtin-test, run with:
- * 	gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm && ./snprintf
- *
- * 2996-09-15 Brandon Long <blong@fiction.net> for Mutt 0.43:
- *
- * 	This was ugly.  It is still ugly.  I opted out of floating point
- * 	numbers, but the formatter understands just about everything from the
- * 	normal C string format, at least as far as I can tell from the Solaris
- * 	2.5 printf(3S) man page.
- */
-
-/*
- * ToDo
- *
- * - Add wide character support.
- * - Add support for "%a" and "%A" conversions.
- * - Create test routines which predefine the expected results.  Our test cases
- *   usually expose bugs in system implementations rather than in ours :-)
- */
-
-/*
- * Usage
- *
- * 1) The following preprocessor macros should be defined to 1 if the feature or
- *    file in question is available on the target system (by using Autoconf or
- *    other means), though basic functionality should be available as long as
- *    HAVE_STDARG_H and HAVE_STDLIB_H are defined correctly:
- *
- *    	HAVE_VSNPRINTF
- *    	HAVE_SNPRINTF
- *    	HAVE_VASPRINTF
- *    	HAVE_ASPRINTF
- *    	HAVE_STDARG_H
- *    	HAVE_STDDEF_H
- *    	HAVE_STDINT_H
- *    	HAVE_STDLIB_H
- *    	HAVE_INTTYPES_H
- *    	HAVE_LOCALE_H
- *    	HAVE_LOCALECONV
- *    	HAVE_LCONV_DECIMAL_POINT
- *    	HAVE_LCONV_THOUSANDS_SEP
- *    	HAVE_LONG_DOUBLE
- *    	HAVE_LONG_LONG_INT
- *    	HAVE_UNSIGNED_LONG_LONG_INT
- *    	HAVE_INTMAX_T
- *    	HAVE_UINTMAX_T
- *    	HAVE_UINTPTR_T
- *    	HAVE_PTRDIFF_T
- *    	HAVE_VA_COPY
- *    	HAVE___VA_COPY
- *
- * 2) The calls to the functions which should be replaced must be redefined
- *    throughout the project files (by using Autoconf or other means):
- *
- *    	#define vsnprintf rpl_vsnprintf
- *    	#define snprintf rpl_snprintf
- *    	#define vasprintf rpl_vasprintf
- *    	#define asprintf rpl_asprintf
- *
- * 3) The required replacement functions should be declared in some header file
- *    included throughout the project files:
- *
- *    	#if HAVE_CONFIG_H
- *    	#include <config.h>
- *    	#endif
- *    	#if HAVE_STDARG_H
- *    	#include <stdarg.h>
- *    	#if !HAVE_VSNPRINTF
- *    	int rpl_vsnprintf(char *, size_t, const char *, va_list);
- *    	#endif
- *    	#if !HAVE_SNPRINTF
- *    	int rpl_snprintf(char *, size_t, const char *, ...);
- *    	#endif
- *    	#if !HAVE_VASPRINTF
- *    	int rpl_vasprintf(char **, const char *, va_list);
- *    	#endif
- *    	#if !HAVE_ASPRINTF
- *    	int rpl_asprintf(char **, const char *, ...);
- *    	#endif
- *    	#endif
- *
- * Autoconf macros for handling step 1 and step 2 are available at
- * <http://www.jhweiss.de/software/snprintf.html>.
- */
-
-#include "pipe/p_config.h"
-
-#if HAVE_CONFIG_H
-#include <config.h>
-#else
-#ifdef _MSC_VER
-#define vsnprintf util_vsnprintf
-#define snprintf util_snprintf
-#define HAVE_VSNPRINTF 0
-#define HAVE_SNPRINTF 0
-#define HAVE_VASPRINTF 1 /* not needed */
-#define HAVE_ASPRINTF 1 /* not needed */
-#define HAVE_STDARG_H 1
-#define HAVE_STDDEF_H 1
-#define HAVE_STDINT_H 1
-#define HAVE_STDLIB_H 1
-#define HAVE_INTTYPES_H 0
-#define HAVE_LOCALE_H 0
-#define HAVE_LOCALECONV 0
-#define HAVE_LCONV_DECIMAL_POINT 0
-#define HAVE_LCONV_THOUSANDS_SEP 0
-#define HAVE_LONG_DOUBLE 0
-#define HAVE_LONG_LONG_INT 1
-#define HAVE_UNSIGNED_LONG_LONG_INT 1
-#define HAVE_INTMAX_T 0
-#define HAVE_UINTMAX_T 0
-#define HAVE_UINTPTR_T 1
-#define HAVE_PTRDIFF_T 1
-#define HAVE_VA_COPY 0
-#define HAVE___VA_COPY 0
-#else
-#define HAVE_VSNPRINTF 1
-#define HAVE_SNPRINTF 1
-#define HAVE_VASPRINTF 1
-#define HAVE_ASPRINTF 1
-#endif
-#endif	/* HAVE_CONFIG_H */
-
-#if !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || !HAVE_VASPRINTF
-#include <stdio.h>	/* For NULL, size_t, vsnprintf(3), and vasprintf(3). */
-#ifdef VA_START
-#undef VA_START
-#endif	/* defined(VA_START) */
-#ifdef VA_SHIFT
-#undef VA_SHIFT
-#endif	/* defined(VA_SHIFT) */
-#if HAVE_STDARG_H
-#include <stdarg.h>
-#define VA_START(ap, last) va_start(ap, last)
-#define VA_SHIFT(ap, value, type) /* No-op for ANSI C. */
-#else	/* Assume <varargs.h> is available. */
-#include <varargs.h>
-#define VA_START(ap, last) va_start(ap)	/* "last" is ignored. */
-#define VA_SHIFT(ap, value, type) value = va_arg(ap, type)
-#endif	/* HAVE_STDARG_H */
-
-#if !HAVE_VASPRINTF
-#if HAVE_STDLIB_H
-#include <stdlib.h>	/* For malloc(3). */
-#endif	/* HAVE_STDLIB_H */
-#ifdef VA_COPY
-#undef VA_COPY
-#endif	/* defined(VA_COPY) */
-#ifdef VA_END_COPY
-#undef VA_END_COPY
-#endif	/* defined(VA_END_COPY) */
-#if HAVE_VA_COPY
-#define VA_COPY(dest, src) va_copy(dest, src)
-#define VA_END_COPY(ap) va_end(ap)
-#elif HAVE___VA_COPY
-#define VA_COPY(dest, src) __va_copy(dest, src)
-#define VA_END_COPY(ap) va_end(ap)
-#else
-#define VA_COPY(dest, src) (void)mymemcpy(&dest, &src, sizeof(va_list))
-#define VA_END_COPY(ap) /* No-op. */
-#define NEED_MYMEMCPY 1
-static void *mymemcpy(void *, void *, size_t);
-#endif	/* HAVE_VA_COPY */
-#endif	/* !HAVE_VASPRINTF */
-
-#if !HAVE_VSNPRINTF
-#include <limits.h>	/* For *_MAX. */
-#if HAVE_INTTYPES_H
-#include <inttypes.h>	/* For intmax_t (if not defined in <stdint.h>). */
-#endif	/* HAVE_INTTYPES_H */
-#if HAVE_LOCALE_H
-#include <locale.h>	/* For localeconv(3). */
-#endif	/* HAVE_LOCALE_H */
-#if HAVE_STDDEF_H
-#include <stddef.h>	/* For ptrdiff_t. */
-#endif	/* HAVE_STDDEF_H */
-#if HAVE_STDINT_H
-#include <stdint.h>	/* For intmax_t. */
-#endif	/* HAVE_STDINT_H */
-
-/* Support for unsigned long long int.  We may also need ULLONG_MAX. */
-#ifndef ULONG_MAX	/* We may need ULONG_MAX as a fallback. */
-#ifdef UINT_MAX
-#define ULONG_MAX UINT_MAX
-#else
-#define ULONG_MAX INT_MAX
-#endif	/* defined(UINT_MAX) */
-#endif	/* !defined(ULONG_MAX) */
-#ifdef ULLONG
-#undef ULLONG
-#endif	/* defined(ULLONG) */
-#if HAVE_UNSIGNED_LONG_LONG_INT
-#define ULLONG unsigned long long int
-#ifndef ULLONG_MAX
-#define ULLONG_MAX ULONG_MAX
-#endif	/* !defined(ULLONG_MAX) */
-#else
-#define ULLONG unsigned long int
-#ifdef ULLONG_MAX
-#undef ULLONG_MAX
-#endif	/* defined(ULLONG_MAX) */
-#define ULLONG_MAX ULONG_MAX
-#endif	/* HAVE_LONG_LONG_INT */
-
-/* Support for uintmax_t.  We also need UINTMAX_MAX. */
-#ifdef UINTMAX_T
-#undef UINTMAX_T
-#endif	/* defined(UINTMAX_T) */
-#if HAVE_UINTMAX_T || defined(uintmax_t)
-#define UINTMAX_T uintmax_t
-#ifndef UINTMAX_MAX
-#define UINTMAX_MAX ULLONG_MAX
-#endif	/* !defined(UINTMAX_MAX) */
-#else
-#define UINTMAX_T ULLONG
-#ifdef UINTMAX_MAX
-#undef UINTMAX_MAX
-#endif	/* defined(UINTMAX_MAX) */
-#define UINTMAX_MAX ULLONG_MAX
-#endif	/* HAVE_UINTMAX_T || defined(uintmax_t) */
-
-/* Support for long double. */
-#ifndef LDOUBLE
-#if HAVE_LONG_DOUBLE
-#define LDOUBLE long double
-#else
-#define LDOUBLE double
-#endif	/* HAVE_LONG_DOUBLE */
-#endif	/* !defined(LDOUBLE) */
-
-/* Support for long long int. */
-#ifndef LLONG
-#if HAVE_LONG_LONG_INT
-#define LLONG long long int
-#else
-#define LLONG long int
-#endif	/* HAVE_LONG_LONG_INT */
-#endif	/* !defined(LLONG) */
-
-/* Support for intmax_t. */
-#ifndef INTMAX_T
-#if HAVE_INTMAX_T || defined(intmax_t)
-#define INTMAX_T intmax_t
-#else
-#define INTMAX_T LLONG
-#endif	/* HAVE_INTMAX_T || defined(intmax_t) */
-#endif	/* !defined(INTMAX_T) */
-
-/* Support for uintptr_t. */
-#ifndef UINTPTR_T
-#if HAVE_UINTPTR_T || defined(uintptr_t)
-#define UINTPTR_T uintptr_t
-#else
-#define UINTPTR_T unsigned long int
-#endif	/* HAVE_UINTPTR_T || defined(uintptr_t) */
-#endif	/* !defined(UINTPTR_T) */
-
-/* Support for ptrdiff_t. */
-#ifndef PTRDIFF_T
-#if HAVE_PTRDIFF_T || defined(ptrdiff_t)
-#define PTRDIFF_T ptrdiff_t
-#else
-#define PTRDIFF_T long int
-#endif	/* HAVE_PTRDIFF_T || defined(ptrdiff_t) */
-#endif	/* !defined(PTRDIFF_T) */
-
-/*
- * We need an unsigned integer type corresponding to ptrdiff_t (cf. C99:
- * 7.19.6.1, 7).  However, we'll simply use PTRDIFF_T and convert it to an
- * unsigned type if necessary.  This should work just fine in practice.
- */
-#ifndef UPTRDIFF_T
-#define UPTRDIFF_T PTRDIFF_T
-#endif	/* !defined(UPTRDIFF_T) */
-
-/*
- * We need a signed integer type corresponding to size_t (cf. C99: 7.19.6.1, 7).
- * However, we'll simply use size_t and convert it to a signed type if
- * necessary.  This should work just fine in practice.
- */
-#ifndef SSIZE_T
-#define SSIZE_T size_t
-#endif	/* !defined(SSIZE_T) */
-
-/* Either ERANGE or E2BIG should be available everywhere. */
-#ifndef ERANGE
-#define ERANGE E2BIG
-#endif	/* !defined(ERANGE) */
-#ifndef EOVERFLOW
-#define EOVERFLOW ERANGE
-#endif	/* !defined(EOVERFLOW) */
-
-/*
- * Buffer size to hold the octal string representation of UINT128_MAX without
- * nul-termination ("3777777777777777777777777777777777777777777").
- */
-#ifdef MAX_CONVERT_LENGTH
-#undef MAX_CONVERT_LENGTH
-#endif	/* defined(MAX_CONVERT_LENGTH) */
-#define MAX_CONVERT_LENGTH      43
-
-/* Format read states. */
-#define PRINT_S_DEFAULT         0
-#define PRINT_S_FLAGS           1
-#define PRINT_S_WIDTH           2
-#define PRINT_S_DOT             3
-#define PRINT_S_PRECISION       4
-#define PRINT_S_MOD             5
-#define PRINT_S_CONV            6
-
-/* Format flags. */
-#define PRINT_F_MINUS           (1 << 0)
-#define PRINT_F_PLUS            (1 << 1)
-#define PRINT_F_SPACE           (1 << 2)
-#define PRINT_F_NUM             (1 << 3)
-#define PRINT_F_ZERO            (1 << 4)
-#define PRINT_F_QUOTE           (1 << 5)
-#define PRINT_F_UP              (1 << 6)
-#define PRINT_F_UNSIGNED        (1 << 7)
-#define PRINT_F_TYPE_G          (1 << 8)
-#define PRINT_F_TYPE_E          (1 << 9)
-
-/* Conversion flags. */
-#define PRINT_C_CHAR            1
-#define PRINT_C_SHORT           2
-#define PRINT_C_LONG            3
-#define PRINT_C_LLONG           4
-#define PRINT_C_LDOUBLE         5
-#define PRINT_C_SIZE            6
-#define PRINT_C_PTRDIFF         7
-#define PRINT_C_INTMAX          8
-
-#ifndef MAX
-#define MAX(x, y) ((x >= y) ? x : y)
-#endif	/* !defined(MAX) */
-#ifndef CHARTOINT
-#define CHARTOINT(ch) (ch - '0')
-#endif	/* !defined(CHARTOINT) */
-#ifndef ISDIGIT
-#define ISDIGIT(ch) ('0' <= (unsigned char)ch && (unsigned char)ch <= '9')
-#endif	/* !defined(ISDIGIT) */
-#ifndef ISNAN
-#define ISNAN(x) (x != x)
-#endif	/* !defined(ISNAN) */
-#ifndef ISINF
-#define ISINF(x) (x != 0.0 && x + x == x)
-#endif	/* !defined(ISINF) */
-
-#ifdef OUTCHAR
-#undef OUTCHAR
-#endif	/* defined(OUTCHAR) */
-#define OUTCHAR(str, len, size, ch)                                          \
-do {                                                                         \
-	if (len + 1 < size)                                                  \
-		str[len] = ch;                                               \
-	(len)++;                                                             \
-} while (/* CONSTCOND */ 0)
-
-static void fmtstr(char *, size_t *, size_t, const char *, int, int, int);
-static void fmtint(char *, size_t *, size_t, INTMAX_T, int, int, int, int);
-static void fmtflt(char *, size_t *, size_t, LDOUBLE, int, int, int, int *);
-static void printsep(char *, size_t *, size_t);
-static int getnumsep(int);
-static int getexponent(LDOUBLE);
-static int convert(UINTMAX_T, char *, size_t, int, int);
-static UINTMAX_T cast(LDOUBLE);
-static UINTMAX_T myround(LDOUBLE);
-static LDOUBLE mypow10(int);
-
-int
-util_vsnprintf(char *str, size_t size, const char *format, va_list args)
-{
-	LDOUBLE fvalue;
-	INTMAX_T value;
-	unsigned char cvalue;
-	const char *strvalue;
-	INTMAX_T *intmaxptr;
-	PTRDIFF_T *ptrdiffptr;
-	SSIZE_T *sizeptr;
-	LLONG *llongptr;
-	long int *longptr;
-	int *intptr;
-	short int *shortptr;
-	signed char *charptr;
-	size_t len = 0;
-	int overflow = 0;
-	int base = 0;
-	int cflags = 0;
-	int flags = 0;
-	int width = 0;
-	int precision = -1;
-	int state = PRINT_S_DEFAULT;
-	char ch = *format++;
-
-	/*
-	 * C99 says: "If `n' is zero, nothing is written, and `s' may be a null
-	 * pointer." (7.19.6.5, 2)  We're forgiving and allow a NULL pointer
-	 * even if a size larger than zero was specified.  At least NetBSD's
-	 * snprintf(3) does the same, as well as other versions of this file.
-	 * (Though some of these versions will write to a non-NULL buffer even
-	 * if a size of zero was specified, which violates the standard.)
-	 */
-	if (str == NULL && size != 0)
-		size = 0;
-
-	while (ch != '\0')
-		switch (state) {
-		case PRINT_S_DEFAULT:
-			if (ch == '%')
-				state = PRINT_S_FLAGS;
-			else
-				OUTCHAR(str, len, size, ch);
-			ch = *format++;
-			break;
-		case PRINT_S_FLAGS:
-			switch (ch) {
-			case '-':
-				flags |= PRINT_F_MINUS;
-				ch = *format++;
-				break;
-			case '+':
-				flags |= PRINT_F_PLUS;
-				ch = *format++;
-				break;
-			case ' ':
-				flags |= PRINT_F_SPACE;
-				ch = *format++;
-				break;
-			case '#':
-				flags |= PRINT_F_NUM;
-				ch = *format++;
-				break;
-			case '0':
-				flags |= PRINT_F_ZERO;
-				ch = *format++;
-				break;
-			case '\'':	/* SUSv2 flag (not in C99). */
-				flags |= PRINT_F_QUOTE;
-				ch = *format++;
-				break;
-			default:
-				state = PRINT_S_WIDTH;
-				break;
-			}
-			break;
-		case PRINT_S_WIDTH:
-			if (ISDIGIT(ch)) {
-				ch = CHARTOINT(ch);
-				if (width > (INT_MAX - ch) / 10) {
-					overflow = 1;
-					goto out;
-				}
-				width = 10 * width + ch;
-				ch = *format++;
-			} else if (ch == '*') {
-				/*
-				 * C99 says: "A negative field width argument is
-				 * taken as a `-' flag followed by a positive
-				 * field width." (7.19.6.1, 5)
-				 */
-				if ((width = va_arg(args, int)) < 0) {
-					flags |= PRINT_F_MINUS;
-					width = -width;
-				}
-				ch = *format++;
-				state = PRINT_S_DOT;
-			} else
-				state = PRINT_S_DOT;
-			break;
-		case PRINT_S_DOT:
-			if (ch == '.') {
-				state = PRINT_S_PRECISION;
-				ch = *format++;
-			} else
-				state = PRINT_S_MOD;
-			break;
-		case PRINT_S_PRECISION:
-			if (precision == -1)
-				precision = 0;
-			if (ISDIGIT(ch)) {
-				ch = CHARTOINT(ch);
-				if (precision > (INT_MAX - ch) / 10) {
-					overflow = 1;
-					goto out;
-				}
-				precision = 10 * precision + ch;
-				ch = *format++;
-			} else if (ch == '*') {
-				/*
-				 * C99 says: "A negative precision argument is
-				 * taken as if the precision were omitted."
-				 * (7.19.6.1, 5)
-				 */
-				if ((precision = va_arg(args, int)) < 0)
-					precision = -1;
-				ch = *format++;
-				state = PRINT_S_MOD;
-			} else
-				state = PRINT_S_MOD;
-			break;
-		case PRINT_S_MOD:
-			switch (ch) {
-			case 'h':
-				ch = *format++;
-				if (ch == 'h') {	/* It's a char. */
-					ch = *format++;
-					cflags = PRINT_C_CHAR;
-				} else
-					cflags = PRINT_C_SHORT;
-				break;
-			case 'l':
-				ch = *format++;
-				if (ch == 'l') {	/* It's a long long. */
-					ch = *format++;
-					cflags = PRINT_C_LLONG;
-				} else
-					cflags = PRINT_C_LONG;
-				break;
-			case 'L':
-				cflags = PRINT_C_LDOUBLE;
-				ch = *format++;
-				break;
-			case 'j':
-				cflags = PRINT_C_INTMAX;
-				ch = *format++;
-				break;
-			case 't':
-				cflags = PRINT_C_PTRDIFF;
-				ch = *format++;
-				break;
-			case 'z':
-				cflags = PRINT_C_SIZE;
-				ch = *format++;
-				break;
-			}
-			state = PRINT_S_CONV;
-			break;
-		case PRINT_S_CONV:
-			switch (ch) {
-			case 'd':
-				/* FALLTHROUGH */
-			case 'i':
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					value = (signed char)va_arg(args, int);
-					break;
-				case PRINT_C_SHORT:
-					value = (short int)va_arg(args, int);
-					break;
-				case PRINT_C_LONG:
-					value = va_arg(args, long int);
-					break;
-				case PRINT_C_LLONG:
-					value = va_arg(args, LLONG);
-					break;
-				case PRINT_C_SIZE:
-					value = va_arg(args, SSIZE_T);
-					break;
-				case PRINT_C_INTMAX:
-					value = va_arg(args, INTMAX_T);
-					break;
-				case PRINT_C_PTRDIFF:
-					value = va_arg(args, PTRDIFF_T);
-					break;
-				default:
-					value = va_arg(args, int);
-					break;
-				}
-				fmtint(str, &len, size, value, 10, width,
-				    precision, flags);
-				break;
-			case 'X':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'x':
-				base = 16;
-				/* FALLTHROUGH */
-			case 'o':
-				if (base == 0)
-					base = 8;
-				/* FALLTHROUGH */
-			case 'u':
-				if (base == 0)
-					base = 10;
-				flags |= PRINT_F_UNSIGNED;
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					value = (unsigned char)va_arg(args,
-					    unsigned int);
-					break;
-				case PRINT_C_SHORT:
-					value = (unsigned short int)va_arg(args,
-					    unsigned int);
-					break;
-				case PRINT_C_LONG:
-					value = va_arg(args, unsigned long int);
-					break;
-				case PRINT_C_LLONG:
-					value = va_arg(args, ULLONG);
-					break;
-				case PRINT_C_SIZE:
-					value = va_arg(args, size_t);
-					break;
-				case PRINT_C_INTMAX:
-					value = va_arg(args, UINTMAX_T);
-					break;
-				case PRINT_C_PTRDIFF:
-					value = va_arg(args, UPTRDIFF_T);
-					break;
-				default:
-					value = va_arg(args, unsigned int);
-					break;
-				}
-				fmtint(str, &len, size, value, base, width,
-				    precision, flags);
-				break;
-			case 'A':
-				/* Not yet supported, we'll use "%F". */
-				/* FALLTHROUGH */
-			case 'F':
-				flags |= PRINT_F_UP;
-			case 'a':
-				/* Not yet supported, we'll use "%f". */
-				/* FALLTHROUGH */
-			case 'f':
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'E':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'e':
-				flags |= PRINT_F_TYPE_E;
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'G':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'g':
-				flags |= PRINT_F_TYPE_G;
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				/*
-				 * If the precision is zero, it is treated as
-				 * one (cf. C99: 7.19.6.1, 8).
-				 */
-				if (precision == 0)
-					precision = 1;
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'c':
-				cvalue = (unsigned char)va_arg(args, int);
-				OUTCHAR(str, len, size, cvalue);
-				break;
-			case 's':
-				strvalue = va_arg(args, char *);
-				fmtstr(str, &len, size, strvalue, width,
-				    precision, flags);
-				break;
-			case 'p':
-				/*
-				 * C99 says: "The value of the pointer is
-				 * converted to a sequence of printing
-				 * characters, in an implementation-defined
-				 * manner." (C99: 7.19.6.1, 8)
-				 */
-				if ((strvalue = va_arg(args, void *)) == NULL)
-					/*
-					 * We use the glibc format.  BSD prints
-					 * "0x0", SysV "0".
-					 */
-					fmtstr(str, &len, size, "(nil)", width,
-					    -1, flags);
-				else {
-					/*
-					 * We use the BSD/glibc format.  SysV
-					 * omits the "0x" prefix (which we emit
-					 * using the PRINT_F_NUM flag).
-					 */
-					flags |= PRINT_F_NUM;
-					flags |= PRINT_F_UNSIGNED;
-					fmtint(str, &len, size,
-					    (UINTPTR_T)strvalue, 16, width,
-					    precision, flags);
-				}
-				break;
-			case 'n':
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					charptr = va_arg(args, signed char *);
-					*charptr = (signed char)len;
-					break;
-				case PRINT_C_SHORT:
-					shortptr = va_arg(args, short int *);
-					*shortptr = (short int)len;
-					break;
-				case PRINT_C_LONG:
-					longptr = va_arg(args, long int *);
-					*longptr = (long int)len;
-					break;
-				case PRINT_C_LLONG:
-					llongptr = va_arg(args, LLONG *);
-					*llongptr = (LLONG)len;
-					break;
-				case PRINT_C_SIZE:
-					/*
-					 * C99 says that with the "z" length
-					 * modifier, "a following `n' conversion
-					 * specifier applies to a pointer to a
-					 * signed integer type corresponding to
-					 * size_t argument." (7.19.6.1, 7)
-					 */
-					sizeptr = va_arg(args, SSIZE_T *);
-					*sizeptr = len;
-					break;
-				case PRINT_C_INTMAX:
-					intmaxptr = va_arg(args, INTMAX_T *);
-					*intmaxptr = len;
-					break;
-				case PRINT_C_PTRDIFF:
-					ptrdiffptr = va_arg(args, PTRDIFF_T *);
-					*ptrdiffptr = len;
-					break;
-				default:
-					intptr = va_arg(args, int *);
-					*intptr = (int)len;
-					break;
-				}
-				break;
-			case '%':	/* Print a "%" character verbatim. */
-				OUTCHAR(str, len, size, ch);
-				break;
-			default:	/* Skip other characters. */
-				break;
-			}
-			ch = *format++;
-			state = PRINT_S_DEFAULT;
-			base = cflags = flags = width = 0;
-			precision = -1;
-			break;
-		}
-out:
-	if (len < size)
-		str[len] = '\0';
-	else if (size > 0)
-		str[size - 1] = '\0';
-
-	if (overflow || len >= INT_MAX) {
-		return -1;
-	}
-	return (int)len;
-}
-
-static void
-fmtstr(char *str, size_t *len, size_t size, const char *value, int width,
-       int precision, int flags)
-{
-	int padlen, strln;	/* Amount to pad. */
-	int noprecision = (precision == -1);
-
-	if (value == NULL)	/* We're forgiving. */
-		value = "(null)";
-
-	/* If a precision was specified, don't read the string past it. */
-	for (strln = 0; value[strln] != '\0' &&
-	    (noprecision || strln < precision); strln++)
-		continue;
-
-	if ((padlen = width - strln) < 0)
-		padlen = 0;
-	if (flags & PRINT_F_MINUS)	/* Left justify. */
-		padlen = -padlen;
-
-	while (padlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen--;
-	}
-	while (*value != '\0' && (noprecision || precision-- > 0)) {
-		OUTCHAR(str, *len, size, *value);
-		value++;
-	}
-	while (padlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen++;
-	}
-}
-
-static void
-fmtint(char *str, size_t *len, size_t size, INTMAX_T value, int base, int width,
-       int precision, int flags)
-{
-	UINTMAX_T uvalue;
-	char iconvert[MAX_CONVERT_LENGTH];
-	char sign = 0;
-	char hexprefix = 0;
-	int spadlen = 0;	/* Amount to space pad. */
-	int zpadlen = 0;	/* Amount to zero pad. */
-	int pos;
-	int separators = (flags & PRINT_F_QUOTE);
-	int noprecision = (precision == -1);
-
-	if (flags & PRINT_F_UNSIGNED)
-		uvalue = value;
-	else {
-		uvalue = (value >= 0) ? value : -value;
-		if (value < 0)
-			sign = '-';
-		else if (flags & PRINT_F_PLUS)	/* Do a sign. */
-			sign = '+';
-		else if (flags & PRINT_F_SPACE)
-			sign = ' ';
-	}
-
-	pos = convert(uvalue, iconvert, sizeof(iconvert), base,
-	    flags & PRINT_F_UP);
-
-	if (flags & PRINT_F_NUM && uvalue != 0) {
-		/*
-		 * C99 says: "The result is converted to an `alternative form'.
-		 * For `o' conversion, it increases the precision, if and only
-		 * if necessary, to force the first digit of the result to be a
-		 * zero (if the value and precision are both 0, a single 0 is
-		 * printed).  For `x' (or `X') conversion, a nonzero result has
-		 * `0x' (or `0X') prefixed to it." (7.19.6.1, 6)
-		 */
-		switch (base) {
-		case 8:
-			if (precision <= pos)
-				precision = pos + 1;
-			break;
-		case 16:
-			hexprefix = (flags & PRINT_F_UP) ? 'X' : 'x';
-			break;
-		}
-	}
-
-	if (separators)	/* Get the number of group separators we'll print. */
-		separators = getnumsep(pos);
-
-	zpadlen = precision - pos - separators;
-	spadlen = width                         /* Minimum field width. */
-	    - separators                        /* Number of separators. */
-	    - MAX(precision, pos)               /* Number of integer digits. */
-	    - ((sign != 0) ? 1 : 0)             /* Will we print a sign? */
-	    - ((hexprefix != 0) ? 2 : 0);       /* Will we print a prefix? */
-
-	if (zpadlen < 0)
-		zpadlen = 0;
-	if (spadlen < 0)
-		spadlen = 0;
-
-	/*
-	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
-	 * ignored.  For `d', `i', `o', `u', `x', and `X' conversions, if a
-	 * precision is specified, the `0' flag is ignored." (7.19.6.1, 6)
-	 */
-	if (flags & PRINT_F_MINUS)	/* Left justify. */
-		spadlen = -spadlen;
-	else if (flags & PRINT_F_ZERO && noprecision) {
-		zpadlen += spadlen;
-		spadlen = 0;
-	}
-	while (spadlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		spadlen--;
-	}
-	if (sign != 0)	/* Sign. */
-		OUTCHAR(str, *len, size, sign);
-	if (hexprefix != 0) {	/* A "0x" or "0X" prefix. */
-		OUTCHAR(str, *len, size, '0');
-		OUTCHAR(str, *len, size, hexprefix);
-	}
-	while (zpadlen > 0) {	/* Leading zeros. */
-		OUTCHAR(str, *len, size, '0');
-		zpadlen--;
-	}
-	while (pos > 0) {	/* The actual digits. */
-		pos--;
-		OUTCHAR(str, *len, size, iconvert[pos]);
-		if (separators > 0 && pos > 0 && pos % 3 == 0)
-			printsep(str, len, size);
-	}
-	while (spadlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		spadlen++;
-	}
-}
-
-static void
-fmtflt(char *str, size_t *len, size_t size, LDOUBLE fvalue, int width,
-       int precision, int flags, int *overflow)
-{
-	LDOUBLE ufvalue;
-	UINTMAX_T intpart;
-	UINTMAX_T fracpart;
-	UINTMAX_T mask;
-	const char *infnan = NULL;
-	char iconvert[MAX_CONVERT_LENGTH];
-	char fconvert[MAX_CONVERT_LENGTH];
-	char econvert[4];	/* "e-12" (without nul-termination). */
-	char esign = 0;
-	char sign = 0;
-	int leadfraczeros = 0;
-	int exponent = 0;
-	int emitpoint = 0;
-	int omitzeros = 0;
-	int omitcount = 0;
-	int padlen = 0;
-	int epos = 0;
-	int fpos = 0;
-	int ipos = 0;
-	int separators = (flags & PRINT_F_QUOTE);
-	int estyle = (flags & PRINT_F_TYPE_E);
-#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
-	struct lconv *lc = localeconv();
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
-
-	/*
-	 * AIX' man page says the default is 0, but C99 and at least Solaris'
-	 * and NetBSD's man pages say the default is 6, and sprintf(3) on AIX
-	 * defaults to 6.
-	 */
-	if (precision == -1)
-		precision = 6;
-
-	if (fvalue < 0.0)
-		sign = '-';
-	else if (flags & PRINT_F_PLUS)	/* Do a sign. */
-		sign = '+';
-	else if (flags & PRINT_F_SPACE)
-		sign = ' ';
-
-	if (ISNAN(fvalue))
-		infnan = (flags & PRINT_F_UP) ? "NAN" : "nan";
-	else if (ISINF(fvalue))
-		infnan = (flags & PRINT_F_UP) ? "INF" : "inf";
-
-	if (infnan != NULL) {
-		if (sign != 0)
-			iconvert[ipos++] = sign;
-		while (*infnan != '\0')
-			iconvert[ipos++] = *infnan++;
-		fmtstr(str, len, size, iconvert, width, ipos, flags);
-		return;
-	}
-
-	/* "%e" (or "%E") or "%g" (or "%G") conversion. */
-	if (flags & PRINT_F_TYPE_E || flags & PRINT_F_TYPE_G) {
-		if (flags & PRINT_F_TYPE_G) {
-			/*
-			 * For "%g" (and "%G") conversions, the precision
-			 * specifies the number of significant digits, which
-			 * includes the digits in the integer part.  The
-			 * conversion will or will not be using "e-style" (like
-			 * "%e" or "%E" conversions) depending on the precision
-			 * and on the exponent.  However, the exponent can be
-			 * affected by rounding the converted value, so we'll
-			 * leave this decision for later.  Until then, we'll
-			 * assume that we're going to do an "e-style" conversion
-			 * (in order to get the exponent calculated).  For
-			 * "e-style", the precision must be decremented by one.
-			 */
-			precision--;
-			/*
-			 * For "%g" (and "%G") conversions, trailing zeros are
-			 * removed from the fractional portion of the result
-			 * unless the "#" flag was specified.
-			 */
-			if (!(flags & PRINT_F_NUM))
-				omitzeros = 1;
-		}
-		exponent = getexponent(fvalue);
-		estyle = 1;
-	}
-
-again:
-	/*
-	 * Sorry, we only support 9, 19, or 38 digits (that is, the number of
-	 * digits of the 32-bit, the 64-bit, or the 128-bit UINTMAX_MAX value
-	 * minus one) past the decimal point due to our conversion method.
-	 */
-	switch (sizeof(UINTMAX_T)) {
-	case 16:
-		if (precision > 38)
-			precision = 38;
-		break;
-	case 8:
-		if (precision > 19)
-			precision = 19;
-		break;
-	default:
-		if (precision > 9)
-			precision = 9;
-		break;
-	}
-
-	ufvalue = (fvalue >= 0.0) ? fvalue : -fvalue;
-	if (estyle)	/* We want exactly one integer digit. */
-		ufvalue /= mypow10(exponent);
-
-	if ((intpart = cast(ufvalue)) == UINTMAX_MAX) {
-		*overflow = 1;
-		return;
-	}
-
-	/*
-	 * Factor of ten with the number of digits needed for the fractional
-	 * part.  For example, if the precision is 3, the mask will be 1000.
-	 */
-	mask = (UINTMAX_T)mypow10(precision);
-	/*
-	 * We "cheat" by converting the fractional part to integer by
-	 * multiplying by a factor of ten.
-	 */
-	if ((fracpart = myround(mask * (ufvalue - intpart))) >= mask) {
-		/*
-		 * For example, ufvalue = 2.99962, intpart = 2, and mask = 1000
-		 * (because precision = 3).  Now, myround(1000 * 0.99962) will
-		 * return 1000.  So, the integer part must be incremented by one
-		 * and the fractional part must be set to zero.
-		 */
-		intpart++;
-		fracpart = 0;
-		if (estyle && intpart == 10) {
-			/*
-			 * The value was rounded up to ten, but we only want one
-			 * integer digit if using "e-style".  So, the integer
-			 * part must be set to one and the exponent must be
-			 * incremented by one.
-			 */
-			intpart = 1;
-			exponent++;
-		}
-	}
-
-	/*
-	 * Now that we know the real exponent, we can check whether or not to
-	 * use "e-style" for "%g" (and "%G") conversions.  If we don't need
-	 * "e-style", the precision must be adjusted and the integer and
-	 * fractional parts must be recalculated from the original value.
-	 *
-	 * C99 says: "Let P equal the precision if nonzero, 6 if the precision
-	 * is omitted, or 1 if the precision is zero.  Then, if a conversion
-	 * with style `E' would have an exponent of X:
-	 *
-	 * - if P > X >= -4, the conversion is with style `f' (or `F') and
-	 *   precision P - (X + 1).
-	 *
-	 * - otherwise, the conversion is with style `e' (or `E') and precision
-	 *   P - 1." (7.19.6.1, 8)
-	 *
-	 * Note that we had decremented the precision by one.
-	 */
-	if (flags & PRINT_F_TYPE_G && estyle &&
-	    precision + 1 > exponent && exponent >= -4) {
-		precision -= exponent;
-		estyle = 0;
-		goto again;
-	}
-
-	if (estyle) {
-		if (exponent < 0) {
-			exponent = -exponent;
-			esign = '-';
-		} else
-			esign = '+';
-
-		/*
-		 * Convert the exponent.  The sizeof(econvert) is 4.  So, the
-		 * econvert buffer can hold e.g. "e+99" and "e-99".  We don't
-		 * support an exponent which contains more than two digits.
-		 * Therefore, the following stores are safe.
-		 */
-		epos = convert(exponent, econvert, 2, 10, 0);
-		/*
-		 * C99 says: "The exponent always contains at least two digits,
-		 * and only as many more digits as necessary to represent the
-		 * exponent." (7.19.6.1, 8)
-		 */
-		if (epos == 1)
-			econvert[epos++] = '0';
-		econvert[epos++] = esign;
-		econvert[epos++] = (flags & PRINT_F_UP) ? 'E' : 'e';
-	}
-
-	/* Convert the integer part and the fractional part. */
-	ipos = convert(intpart, iconvert, sizeof(iconvert), 10, 0);
-	if (fracpart != 0)	/* convert() would return 1 if fracpart == 0. */
-		fpos = convert(fracpart, fconvert, sizeof(fconvert), 10, 0);
-
-	leadfraczeros = precision - fpos;
-
-	if (omitzeros) {
-		if (fpos > 0)	/* Omit trailing fractional part zeros. */
-			while (omitcount < fpos && fconvert[omitcount] == '0')
-				omitcount++;
-		else {	/* The fractional part is zero, omit it completely. */
-			omitcount = precision;
-			leadfraczeros = 0;
-		}
-		precision -= omitcount;
-	}
-
-	/*
-	 * Print a decimal point if either the fractional part is non-zero
-	 * and/or the "#" flag was specified.
-	 */
-	if (precision > 0 || flags & PRINT_F_NUM)
-		emitpoint = 1;
-	if (separators)	/* Get the number of group separators we'll print. */
-		separators = getnumsep(ipos);
-
-	padlen = width                  /* Minimum field width. */
-	    - ipos                      /* Number of integer digits. */
-	    - epos                      /* Number of exponent characters. */
-	    - precision                 /* Number of fractional digits. */
-	    - separators                /* Number of group separators. */
-	    - (emitpoint ? 1 : 0)       /* Will we print a decimal point? */
-	    - ((sign != 0) ? 1 : 0);    /* Will we print a sign character? */
-
-	if (padlen < 0)
-		padlen = 0;
-
-	/*
-	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
-	 * ignored." (7.19.6.1, 6)
-	 */
-	if (flags & PRINT_F_MINUS)	/* Left justifty. */
-		padlen = -padlen;
-	else if (flags & PRINT_F_ZERO && padlen > 0) {
-		if (sign != 0) {	/* Sign. */
-			OUTCHAR(str, *len, size, sign);
-			sign = 0;
-		}
-		while (padlen > 0) {	/* Leading zeros. */
-			OUTCHAR(str, *len, size, '0');
-			padlen--;
-		}
-	}
-	while (padlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen--;
-	}
-	if (sign != 0)	/* Sign. */
-		OUTCHAR(str, *len, size, sign);
-	while (ipos > 0) {	/* Integer part. */
-		ipos--;
-		OUTCHAR(str, *len, size, iconvert[ipos]);
-		if (separators > 0 && ipos > 0 && ipos % 3 == 0)
-			printsep(str, len, size);
-	}
-	if (emitpoint) {	/* Decimal point. */
-#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
-		if (lc->decimal_point != NULL && *lc->decimal_point != '\0')
-			OUTCHAR(str, *len, size, *lc->decimal_point);
-		else	/* We'll always print some decimal point character. */
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
-			OUTCHAR(str, *len, size, '.');
-	}
-	while (leadfraczeros > 0) {	/* Leading fractional part zeros. */
-		OUTCHAR(str, *len, size, '0');
-		leadfraczeros--;
-	}
-	while (fpos > omitcount) {	/* The remaining fractional part. */
-		fpos--;
-		OUTCHAR(str, *len, size, fconvert[fpos]);
-	}
-	while (epos > 0) {	/* Exponent. */
-		epos--;
-		OUTCHAR(str, *len, size, econvert[epos]);
-	}
-	while (padlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen++;
-	}
-}
-
-static void
-printsep(char *str, size_t *len, size_t size)
-{
-#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
-	struct lconv *lc = localeconv();
-	int i;
-
-	if (lc->thousands_sep != NULL)
-		for (i = 0; lc->thousands_sep[i] != '\0'; i++)
-			OUTCHAR(str, *len, size, lc->thousands_sep[i]);
-	else
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
-		OUTCHAR(str, *len, size, ',');
-}
-
-static int
-getnumsep(int digits)
-{
-	int separators = (digits - ((digits % 3 == 0) ? 1 : 0)) / 3;
-#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
-	int strln;
-	struct lconv *lc = localeconv();
-
-	/* We support an arbitrary separator length (including zero). */
-	if (lc->thousands_sep != NULL) {
-		for (strln = 0; lc->thousands_sep[strln] != '\0'; strln++)
-			continue;
-		separators *= strln;
-	}
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
-	return separators;
-}
-
-static int
-getexponent(LDOUBLE value)
-{
-	LDOUBLE tmp = (value >= 0.0) ? value : -value;
-	int exponent = 0;
-
-	/*
-	 * We check for 99 > exponent > -99 in order to work around possible
-	 * endless loops which could happen (at least) in the second loop (at
-	 * least) if we're called with an infinite value.  However, we checked
-	 * for infinity before calling this function using our ISINF() macro, so
-	 * this might be somewhat paranoid.
-	 */
-	while (tmp < 1.0 && tmp > 0.0 && --exponent > -99)
-		tmp *= 10;
-	while (tmp >= 10.0 && ++exponent < 99)
-		tmp /= 10;
-
-	return exponent;
-}
-
-static int
-convert(UINTMAX_T value, char *buf, size_t size, int base, int caps)
-{
-	const char *digits = caps ? "0123456789ABCDEF" : "0123456789abcdef";
-	size_t pos = 0;
-
-	/* We return an unterminated buffer with the digits in reverse order. */
-	do {
-		buf[pos++] = digits[value % base];
-		value /= base;
-	} while (value != 0 && pos < size);
-
-	return (int)pos;
-}
-
-static UINTMAX_T
-cast(LDOUBLE value)
-{
-	UINTMAX_T result;
-
-	/*
-	 * We check for ">=" and not for ">" because if UINTMAX_MAX cannot be
-	 * represented exactly as an LDOUBLE value (but is less than LDBL_MAX),
-	 * it may be increased to the nearest higher representable value for the
-	 * comparison (cf. C99: 6.3.1.4, 2).  It might then equal the LDOUBLE
-	 * value although converting the latter to UINTMAX_T would overflow.
-	 */
-	if (value >= UINTMAX_MAX)
-		return UINTMAX_MAX;
-
-	result = (UINTMAX_T)value;
-	/*
-	 * At least on NetBSD/sparc64 3.0.2 and 4.99.30, casting long double to
-	 * an integer type converts e.g. 1.9 to 2 instead of 1 (which violates
-	 * the standard).  Sigh.
-	 */
-	return (result <= value) ? result : result - 1;
-}
-
-static UINTMAX_T
-myround(LDOUBLE value)
-{
-	UINTMAX_T intpart = cast(value);
-
-	return ((value -= intpart) < 0.5) ? intpart : intpart + 1;
-}
-
-static LDOUBLE
-mypow10(int exponent)
-{
-	LDOUBLE result = 1;
-
-	while (exponent > 0) {
-		result *= 10;
-		exponent--;
-	}
-	while (exponent < 0) {
-		result /= 10;
-		exponent++;
-	}
-	return result;
-}
-#endif	/* !HAVE_VSNPRINTF */
-
-#if !HAVE_VASPRINTF
-#if NEED_MYMEMCPY
-void *
-mymemcpy(void *dst, void *src, size_t len)
-{
-	const char *from = src;
-	char *to = dst;
-
-	/* No need for optimization, we use this only to replace va_copy(3). */
-	while (len-- > 0)
-		*to++ = *from++;
-	return dst;
-}
-#endif	/* NEED_MYMEMCPY */
-
-int
-util_vasprintf(char **ret, const char *format, va_list ap)
-{
-	size_t size;
-	int len;
-	va_list aq;
-
-	VA_COPY(aq, ap);
-	len = vsnprintf(NULL, 0, format, aq);
-	VA_END_COPY(aq);
-	if (len < 0 || (*ret = malloc(size = len + 1)) == NULL)
-		return -1;
-	return vsnprintf(*ret, size, format, ap);
-}
-#endif	/* !HAVE_VASPRINTF */
-
-#if !HAVE_SNPRINTF
-#if HAVE_STDARG_H
-int
-util_snprintf(char *str, size_t size, const char *format, ...)
-#else
-int
-util_snprintf(va_alist) va_dcl
-#endif	/* HAVE_STDARG_H */
-{
-#if !HAVE_STDARG_H
-	char *str;
-	size_t size;
-	char *format;
-#endif	/* HAVE_STDARG_H */
-	va_list ap;
-	int len;
-
-	VA_START(ap, format);
-	VA_SHIFT(ap, str, char *);
-	VA_SHIFT(ap, size, size_t);
-	VA_SHIFT(ap, format, const char *);
-	len = vsnprintf(str, size, format, ap);
-	va_end(ap);
-	return len;
-}
-#endif	/* !HAVE_SNPRINTF */
-
-#if !HAVE_ASPRINTF
-#if HAVE_STDARG_H
-int
-util_asprintf(char **ret, const char *format, ...)
-#else
-int
-util_asprintf(va_alist) va_dcl
-#endif	/* HAVE_STDARG_H */
-{
-#if !HAVE_STDARG_H
-	char **ret;
-	char *format;
-#endif	/* HAVE_STDARG_H */
-	va_list ap;
-	int len;
-
-	VA_START(ap, format);
-	VA_SHIFT(ap, ret, char **);
-	VA_SHIFT(ap, format, const char *);
-	len = vasprintf(ret, format, ap);
-	va_end(ap);
-	return len;
-}
-#endif	/* !HAVE_ASPRINTF */
-#else	/* Dummy declaration to avoid empty translation unit warnings. */
-int main(void);
-#endif	/* !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || [...] */
-
-
-/* vim: set joinspaces textwidth=80: */
diff --git a/src/gallium/auxiliary/util/u_string.h b/src/gallium/auxiliary/util/u_string.h
index da3e2675d8e..f7ab09c8f1c 100644
--- a/src/gallium/auxiliary/util/u_string.h
+++ b/src/gallium/auxiliary/util/u_string.h
@@ -35,13 +35,14 @@
 #ifndef U_STRING_H_
 #define U_STRING_H_
 
-#if !defined(_MSC_VER) && !defined(XF86_LIBC_H)
+#if !defined(XF86_LIBC_H)
 #include <stdio.h>
 #endif
 #include <stddef.h>
 #include <stdarg.h>
 
 #include "pipe/p_compiler.h"
+#include "util/macros.h" // PRINTFLIKE
 
 
 #ifdef __cplusplus
@@ -64,10 +65,35 @@ util_strchrnul(const char *s, char c)
 
 #endif
 
-#ifdef _MSC_VER
+#ifdef _WIN32
 
-int util_vsnprintf(char *, size_t, const char *, va_list);
-int util_snprintf(char *str, size_t size, const char *format, ...);
+static inline int
+util_vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+   /* We need to use _vscprintf to calculate the length as vsnprintf returns -1
+    * if the number of characters to write is greater than count.
+    */
+   va_list ap_copy;
+   int ret;
+   va_copy(ap_copy, ap);
+   ret = _vsnprintf(str, size, format, ap);
+   if (ret < 0) {
+      ret = _vscprintf(format, ap_copy);
+   }
+   return ret;
+}
+
+static inline int
+   PRINTFLIKE(3, 4)
+util_snprintf(char *str, size_t size, const char *format, ...)
+{
+   va_list ap;
+   int ret;
+   va_start(ap, format);
+   ret = util_vsnprintf(str, size, format, ap);
+   va_end(ap);
+   return ret;
+}
 
 static inline void
 util_vsprintf(char *str, const char *format, va_list ap)
@@ -76,6 +102,7 @@ util_vsprintf(char *str, const char *format, va_list ap)
 }
 
 static inline void
+   PRINTFLIKE(2, 3)
 util_sprintf(char *str, const char *format, ...)
 {
    va_list ap;

From d6b50ba980b733a82fefe2a0f115635a359c445f Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Thu, 23 Jul 2015 16:54:02 +0100
Subject: [PATCH 0615/1208] gallivm: Fix profile build.

---
 src/gallium/auxiliary/gallivm/lp_bld_debug.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 0a5c2ccdae3..7283e2f162f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -274,7 +274,7 @@ lp_profile(LLVMValueRef func, const void *code)
       unsigned long addr = (uintptr_t)code;
       llvm::raw_fd_ostream Out(perf_asm_fd, false);
       Out << symbol << ":\n";
-      unsigned long size = disassemble(code, Out);
+      unsigned long size = disassemble(code);
       fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
       fflush(perf_map_file);
    }

From a6f39ec1c568c38e7ef42d60eaf6c9ab8397af2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 23 Jul 2015 21:51:48 +0200
Subject: [PATCH 0616/1208] Revert "Match swrast modes more loosely."

This reverts commit f3728a16c9c6a02fc1f44b8069b0060e2358f22e.

It broke glxgears on radeonsi. The window was just black.
---
 src/glx/dri_common.c | 59 +-------------------------------------------
 1 file changed, 1 insertion(+), 58 deletions(-)

diff --git a/src/glx/dri_common.c b/src/glx/dri_common.c
index 1a62ee29e7b..63c8de38c7c 100644
--- a/src/glx/dri_common.c
+++ b/src/glx/dri_common.c
@@ -265,36 +265,6 @@ scalarEqual(struct glx_config *mode, unsigned int attrib, unsigned int value)
    return GL_TRUE;              /* Is a non-existing attribute equal to value? */
 }
 
-static int
-scalarGreaterEqual(struct glx_config *mode, unsigned int attrib, unsigned int value)
-{
-   unsigned int glxValue;
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(attribMap); i++)
-      if (attribMap[i].attrib == attrib) {
-         glxValue = *(unsigned int *) ((char *) mode + attribMap[i].offset);
-         return glxValue == GLX_DONT_CARE || glxValue >= value;
-      }
-
-   return GL_TRUE;              /* Is a non-existing attribute greater than or equal to value? */
-}
-
-static int
-booleanSupported(struct glx_config *mode, unsigned int attrib, unsigned int value)
-{
-   unsigned int glxValue;
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(attribMap); i++)
-      if (attribMap[i].attrib == attrib) {
-         glxValue = *(unsigned int *) ((char *) mode + attribMap[i].offset);
-         return glxValue == GLX_DONT_CARE || glxValue;
-      }
-
-   return GL_TRUE;              /* Is a non-existing attribute supported? */
-}
-
 static int
 driConfigEqual(const __DRIcoreExtension *core,
                struct glx_config *config, const __DRIconfig *driConfig)
@@ -343,37 +313,10 @@ driConfigEqual(const __DRIcoreExtension *core,
          if (value & __DRI_ATTRIB_TEXTURE_RECTANGLE_BIT)
             glxValue |= GLX_TEXTURE_RECTANGLE_BIT_EXT;
          if (config->bindToTextureTargets != GLX_DONT_CARE &&
-             glxValue != (config->bindToTextureTargets & glxValue))
+             glxValue != config->bindToTextureTargets)
             return GL_FALSE;
          break;
 
-      case __DRI_ATTRIB_STENCIL_SIZE:
-      case __DRI_ATTRIB_ACCUM_RED_SIZE:
-      case __DRI_ATTRIB_ACCUM_GREEN_SIZE:
-      case __DRI_ATTRIB_ACCUM_BLUE_SIZE:
-      case __DRI_ATTRIB_ACCUM_ALPHA_SIZE:
-         if (value != 0 && !scalarEqual(config, attrib, value))
-            return GL_FALSE;
-         break;
-
-      case __DRI_ATTRIB_DOUBLE_BUFFER:
-      case __DRI_ATTRIB_BIND_TO_TEXTURE_RGB:
-      case __DRI_ATTRIB_BIND_TO_TEXTURE_RGBA:
-      case __DRI_ATTRIB_BIND_TO_MIPMAP_TEXTURE:
-      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
-          if (value && !booleanSupported(config, attrib, value))
-            return GL_FALSE;
-          break;
-
-      case __DRI_ATTRIB_SAMPLE_BUFFERS:
-      case __DRI_ATTRIB_SAMPLES:
-      case __DRI_ATTRIB_AUX_BUFFERS:
-      case __DRI_ATTRIB_MAX_PBUFFER_WIDTH:
-      case __DRI_ATTRIB_MAX_PBUFFER_HEIGHT:
-      case __DRI_ATTRIB_MAX_PBUFFER_PIXELS:
-         if (!scalarGreaterEqual(config, attrib, value))
-            return GL_FALSE;
-
       default:
          if (!scalarEqual(config, attrib, value))
             return GL_FALSE;

From c844afe94eaecc66e00cc4869f700ac1236bdc89 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 22 Jul 2015 12:39:47 -0400
Subject: [PATCH 0617/1208] mesa: adjust error message when there's a missing
 teximage

The current message makes it seem like the zoffset is invalid.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/texgetimage.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 59ec091257e..2f35ac6d77a 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1013,8 +1013,7 @@ dimensions_error_check(struct gl_context *ctx,
    texImage = select_tex_image(texObj, target, level, zoffset);
    if (!texImage) {
       /* missing texture image */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(zoffset = %d)", caller, zoffset);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(missing image)", caller);
       return true;
    }
 

From 17f71483698a4e134a0c85ef0aa3da80fdfdb180 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 22 Jul 2015 12:59:46 -0400
Subject: [PATCH 0618/1208] mesa: rearrange texture error checking order

This moves the width/height/depth == 0 check to the front and avoids
doing any other checking when that is the case.

Also moves the dimensions check after the format/type checks so that we
don't bail out with success on a width/height/depth == 0 request when
the format/type don't match.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91425
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/texgetimage.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 2f35ac6d77a..cdbd6187708 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -928,6 +928,13 @@ dimensions_error_check(struct gl_context *ctx,
    const struct gl_texture_image *texImage;
    int i;
 
+   if (width == 0 || height == 0 || depth == 0) {
+      /* Not an error, but nothing to do.  Return 'true' so that the
+       * caller simply returns.
+       */
+      return true;
+   }
+
    if (xoffset < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset);
       return true;
@@ -1079,13 +1086,6 @@ dimensions_error_check(struct gl_context *ctx,
       }
    }
 
-   if (width == 0 || height == 0 || depth == 0) {
-      /* Not an error, but nothing to do.  Return 'true' so that the
-       * caller simply returns.
-       */
-      return true;
-   }
-
    return false;
 }
 
@@ -1164,18 +1164,18 @@ getteximage_error_check(struct gl_context *ctx,
       return true;
    }
 
-   if (dimensions_error_check(ctx, texObj, target, level,
-                              xoffset, yoffset, zoffset,
-                              width, height, depth, caller)) {
-      return true;
-   }
-
    err = _mesa_error_check_format_and_type(ctx, format, type);
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err, "%s(format/type)", caller);
       return true;
    }
 
+   if (dimensions_error_check(ctx, texObj, target, level,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth, caller)) {
+      return true;
+   }
+
    if (pbo_error_check(ctx, target, width, height, depth,
                        format, type, bufSize, pixels, caller)) {
       return true;

From 80511d176a49e754a18ce585bab413db7af63bf7 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 21 Jul 2015 14:22:11 +1000
Subject: [PATCH 0619/1208] i965: add support for ARB_shader_subroutine

This just adds some missing pieces to nir/i965,
it is lightly tested on my Haswell.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                                             | 2 +-
 docs/relnotes/10.7.0.html                                | 2 +-
 src/glsl/glsl_types.cpp                                  | 4 ++--
 src/glsl/nir/glsl_to_nir.cpp                             | 1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp                     | 2 ++
 src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp | 1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp                 | 1 +
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp           | 7 +++++++
 src/mesa/drivers/dri/i965/intel_extensions.c             | 2 ++
 9 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 5200737bae7..bcc3424a17f 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -111,7 +111,7 @@ GL 4.0, GLSL 4.00:
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_subroutine                             DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_shader_subroutine                             DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_tessellation_shader                           DONE (nvc0, radeonsi)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index a2e7c02e7b2..26615a83814 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -50,7 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
-<li>GL_ARB_shader_subroutine on core profile gallium drivers</li>
+<li>GL_ARB_shader_subroutine on core profile all drivers</li>
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 8510671d2f3..755618ac28b 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -1027,11 +1027,11 @@ glsl_type::component_slots() const
 
    case GLSL_TYPE_IMAGE:
       return 1;
-
+   case GLSL_TYPE_SUBROUTINE:
+     return 1;
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
-   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_ERROR:
       break;
    }
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 66430f39995..77327b6c74f 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -1171,6 +1171,7 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_bitcast_f2i:
    case ir_unop_bitcast_u2f:
    case ir_unop_bitcast_f2u:
+   case ir_unop_subroutine_to_int:
       /* no-op */
       emit(nir_op_imov, dest_size, srcs);
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 9af15377361..7f25a21f0d1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -481,6 +481,8 @@ fs_visitor::type_size(const struct glsl_type *type)
       return 0;
    case GLSL_TYPE_ATOMIC_UINT:
       return 0;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index d0f61222e5a..a8883a35ef2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -243,6 +243,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_find_msb:
    case ir_unop_find_lsb:
    case ir_unop_saturate:
+   case ir_unop_subroutine_to_int:
       for (i = 0; i < vector_elements; i++) {
 	 ir_rvalue *op0 = get_element(op_var[0], i);
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 9d60543c167..703e9aa913e 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -430,6 +430,7 @@ brw_type_for_base_type(const struct glsl_type *type)
       return BRW_REGISTER_TYPE_F;
    case GLSL_TYPE_INT:
    case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SUBROUTINE:
       return BRW_REGISTER_TYPE_D;
    case GLSL_TYPE_UINT:
       return BRW_REGISTER_TYPE_UD;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index a6eee47fcb0..6fee798038c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -603,6 +603,9 @@ type_size(const struct glsl_type *type)
 	 size += type_size(type->fields.structure[i].type);
       }
       return size;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
    case GLSL_TYPE_SAMPLER:
       /* Samplers take up no register space, since they're baked in at
        * link time.
@@ -1558,6 +1561,10 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_unop_noise:
       unreachable("not reached: should be handled by lower_noise");
 
+   case ir_unop_subroutine_to_int:
+      emit(MOV(result_dst, op[0]));
+      break;
+
    case ir_binop_add:
       emit(ADD(result_dst, op[0], op[1]));
       break;
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 6b3bd12ff54..4a0ffffe59e 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -311,6 +311,7 @@ intelInitExtensions(struct gl_context *ctx)
        * slightly differently when the extension is enabled.
        */
       if (ctx->API == API_OPENGL_CORE) {
+         ctx->Extensions.ARB_shader_subroutine = true;
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
       }
@@ -343,6 +344,7 @@ intelInitExtensions(struct gl_context *ctx)
       if (ctx->API == API_OPENGL_CORE) {
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
+         ctx->Extensions.ARB_shader_subroutine = true;
       }
    }
 

From 7c4768540dacab8a4853f1310413cb976b5fb351 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 23 Jul 2015 11:19:15 +1000
Subject: [PATCH 0620/1208] docs/GL3.txt: ARB_shader_precision

This extension is about setting expectation on GL4.1 implementations
rather than actually enforcing things. So once you support GLSL 410
then you support this in theory.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index bcc3424a17f..fd8a01eb143 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -126,7 +126,7 @@ GL 4.1, GLSL 4.10:
   GL_ARB_ES2_compatibility                             DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_shader_precision                              started (Micah)
+  GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
   GL_ARB_vertex_attrib_64bit                           DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
 

From 7e0036a49258326cc2d875f2960d18c6b3665036 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 21:41:38 -0400
Subject: [PATCH 0621/1208] nvc0/ir: tess factors are now sysvals, adapt
 codegen to expect that

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.h   |  3 ++-
 .../nouveau/codegen/nv50_ir_build_util.cpp      |  7 ++++---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp       | 17 +++++++++++++----
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp   |  2 +-
 .../drivers/nouveau/codegen/nv50_ir_print.cpp   |  3 ++-
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp     |  3 ++-
 6 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 529dcb9bdc2..174dca49ec0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -372,7 +372,8 @@ enum SVSemantic
    SV_SAMPLE_INDEX,
    SV_SAMPLE_POS,
    SV_SAMPLE_MASK,
-   SV_TESS_FACTOR,
+   SV_TESS_OUTER,
+   SV_TESS_INNER,
    SV_TESS_COORD,
    SV_TID,
    SV_CTAID,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 708c5b322ee..19418c0e0f1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -428,8 +428,7 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
 {
    Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);
 
-   assert(svIndex < 4 ||
-          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));
+   assert(svIndex < 4 || svName == SV_CLIP_DISTANCE);
 
    switch (svName) {
    case SV_POSITION:
@@ -438,7 +437,9 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
    case SV_POINT_SIZE:
    case SV_POINT_COORD:
    case SV_CLIP_DISTANCE:
-   case SV_TESS_FACTOR:
+   case SV_TESS_OUTER:
+   case SV_TESS_INNER:
+   case SV_TESS_COORD:
       sym->reg.type = TYPE_F32;
       break;
    default:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 8dc7899e6c4..88078b157fa 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -373,6 +373,8 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_SAMPLEMASK: return nv50_ir::SV_SAMPLE_MASK;
    case TGSI_SEMANTIC_INVOCATIONID: return nv50_ir::SV_INVOCATION_ID;
    case TGSI_SEMANTIC_TESSCOORD:  return nv50_ir::SV_TESS_COORD;
+   case TGSI_SEMANTIC_TESSOUTER:  return nv50_ir::SV_TESS_OUTER;
+   case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    default:
       assert(0);
@@ -1055,9 +1057,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                   info->in[i].centroid = 1;
             }
 
-            if (sn == TGSI_SEMANTIC_PATCH ||
-                sn == TGSI_SEMANTIC_TESSOUTER ||
-                sn == TGSI_SEMANTIC_TESSINNER)
+            if (sn == TGSI_SEMANTIC_PATCH)
                info->in[i].patch = 1;
             if (sn == TGSI_SEMANTIC_PATCH)
                info->numPatchConstants = MAX2(info->numPatchConstants, si + 1);
@@ -1125,6 +1125,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          info->sv[i].sn = sn;
          info->sv[i].si = si;
          info->sv[i].input = inferSysValDirection(sn);
+
+         switch (sn) {
+         case TGSI_SEMANTIC_TESSOUTER:
+         case TGSI_SEMANTIC_TESSINNER:
+            info->sv[i].patch = 1;
+            break;
+         }
       }
       break;
    case TGSI_FILE_RESOURCE:
@@ -1624,7 +1631,9 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
       return ld->getDef(0);
    case TGSI_FILE_SYSTEM_VALUE:
       assert(!ptr);
-      return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
+      ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
+      ld->perPatch = info->sv[idx].patch;
+      return ld->getDef(0);
    default:
       return getArrayForFile(src.getFile(), idx2d)->load(
          sub.cur->values, idx, swz, shiftAddress(ptr));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 85dd05d5267..cd8ee71ad2b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1598,7 +1598,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
    default:
-      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
       ld = bld.mkFetch(i->getDef(0), i->dType,
                        FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index ef3de6ff92a..556d01b864d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -258,7 +258,8 @@ static const char *SemanticStr[SV_LAST + 1] =
    "SAMPLE_INDEX",
    "SAMPLE_POS",
    "SAMPLE_MASK",
-   "TESS_FACTOR",
+   "TESS_OUTER",
+   "TESS_INNER",
    "TESS_COORD",
    "TID",
    "CTAID",
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 7d4a859dde4..27df0eba66b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -286,7 +286,8 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
    case SV_POINT_COORD:    return 0x2e0 + idx * 4;
    case SV_FACE:           return 0x3fc;
-   case SV_TESS_FACTOR:    return 0x000 + idx * 4;
+   case SV_TESS_OUTER:     return 0x000 + idx * 4;
+   case SV_TESS_INNER:     return 0x010 + idx * 4;
    case SV_TESS_COORD:     return 0x2f0 + idx * 4;
    case SV_NTID:           return kepler ? (0x00 + idx * 4) : ~0;
    case SV_NCTAID:         return kepler ? (0x0c + idx * 4) : ~0;

From 0a51acbb467bce5afddc7edf53db426ac697ccf1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 16:57:25 -0400
Subject: [PATCH 0622/1208] docs: remove expanded ARB_dsa notes

This doesn't provide much value since it's all done. The qbo interaction
is fairly trivial.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index fd8a01eb143..9d3d5e03b66 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -193,15 +193,6 @@ GL 4.5, GLSL 4.50:
   GL_ARB_cull_distance                                 in progress (Tobias)
   GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600)
   GL_ARB_direct_state_access                           DONE (all drivers)
-  - Transform Feedback object                          DONE
-  - Buffer object                                      DONE
-  - Framebuffer object                                 DONE
-  - Renderbuffer object                                DONE
-  - Texture object                                     DONE
-  - Vertex array object                                DONE
-  - Sampler object                                     DONE
-  - Program Pipeline object                            DONE
-  - Query object                                       DONE (will require changes when GL_ARB_query_buffer_object lands)
   GL_ARB_get_texture_sub_image                         DONE (all drivers)
   GL_ARB_shader_texture_image_samples                  not started
   GL_ARB_texture_barrier                               DONE (nv50, nvc0, r600, radeonsi)

From 319b83b3ee2629f443a8734256bbf33b3fb4a7a9 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 24 Jul 2015 12:02:57 +1000
Subject: [PATCH 0623/1208] apiexec: remove leading gl from shader subroutine
 interfaces

Remove the gl at the start, stared at this for a while
yesterday, totally missed it.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91441
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mapi/glapi/gen/apiexec.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index 66bc79ac405..3a0eb1869a8 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -76,14 +76,14 @@ functions = {
 
     # OpenGL 4.0 / GL_ARB_shader_subroutines. Mesa only exposes this
     # extension with core profile.
-    "glGetSubroutineUniformLocation": exec_info(core=31),
-    "glGetSubroutineIndex": exec_info(core=31),
-    "glGetActiveSubroutineUniformiv": exec_info(core=31),
-    "glGetActiveSubroutineUniformName": exec_info(core=31),
-    "glGetActiveSubroutineName": exec_info(core=31),
-    "glUniformSubroutinesuiv": exec_info(core=31),
-    "glGetUniformSubroutineuiv": exec_info(core=31),
-    "glGetProgramStageiv": exec_info(core=31),
+    "GetSubroutineUniformLocation": exec_info(core=31),
+    "GetSubroutineIndex": exec_info(core=31),
+    "GetActiveSubroutineUniformiv": exec_info(core=31),
+    "GetActiveSubroutineUniformName": exec_info(core=31),
+    "GetActiveSubroutineName": exec_info(core=31),
+    "UniformSubroutinesuiv": exec_info(core=31),
+    "GetUniformSubroutineuiv": exec_info(core=31),
+    "GetProgramStageiv": exec_info(core=31),
 
     # OpenGL 4.0 / GL_ARB_gpu_shader_fp64.  The extension spec says:
     #

From dbefffa5b4c438008d44db106b5774f575cb495f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 21 Jul 2015 18:42:41 -0600
Subject: [PATCH 0624/1208] mesa: initialize variables to silence compiler
 warnings

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/main/fbobject.c  | 4 ++--
 src/mesa/main/shaderapi.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index f46554b761e..cc342c23c03 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -3304,7 +3304,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
    struct gl_texture_object *texObj;
-   GLboolean layered;
+   GLboolean layered = GL_FALSE;
 
    const char *func = "FramebufferTexture";
 
@@ -3347,7 +3347,7 @@ _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
    struct gl_texture_object *texObj;
-   GLboolean layered;
+   GLboolean layered = GL_FALSE;
 
    const char *func = "glNamedFramebufferTexture";
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 34d280a6fba..b86f22ef9c5 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1432,7 +1432,7 @@ void GLAPIENTRY
 _mesa_GetObjectParameterfvARB(GLhandleARB object, GLenum pname,
                               GLfloat *params)
 {
-   GLint iparams[1];  /* XXX is one element enough? */
+   GLint iparams[1] = {0};  /* XXX is one element enough? */
    _mesa_GetObjectParameterivARB(object, pname, iparams);
    params[0] = (GLfloat) iparams[0];
 }

From 81e2c256e921ad4f5c13bb0d95bbe0ad232ec37c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 22 Jul 2015 07:32:36 -0600
Subject: [PATCH 0625/1208] mesa: simplify format check in
 compressed_subtexture_target_check()

Lose the invalidformat local variable.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/teximage.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index cd451138b1c..4fe6ee4f305 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4586,21 +4586,17 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
        * are valid here, which they are not, but of course not mentioned by
        * core spec.
        */
-      if (targetOK && target != GL_TEXTURE_2D_ARRAY &&
-          target != GL_TEXTURE_CUBE_MAP_ARRAY) {
-         bool invalidformat;
+      if (target != GL_TEXTURE_2D_ARRAY && target != GL_TEXTURE_CUBE_MAP_ARRAY) {
          switch (format) {
-            /* These came from _mesa_is_compressed_format in glformats.c. */
-            case GL_COMPRESSED_RGBA_BPTC_UNORM:
-            case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
-            case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
-            case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
-               invalidformat = false;
-               break;
-            default:
-               invalidformat = true;
-         }
-         if (invalidformat) {
+         /* These are the only 3D compression formats supported at this time */
+         case GL_COMPRESSED_RGBA_BPTC_UNORM:
+         case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+         case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+         case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+            /* valid format */
+            break;
+         default:
+            /* invalid format */
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(invalid target %s for format %s)", caller,
                         _mesa_enum_to_string(target),

From 05a44ab32832efe61a252ef4ac2d128c1101c286 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 22 Jul 2015 07:42:12 -0600
Subject: [PATCH 0626/1208] mesa: another target fix in
 compressed_subtexture_target_check()

The previous fix added GL_TEXTURE_CUBE_MAP_ARRAY but we also need
to support GL_TEXTURE_CUBE_MAP (via DSA).

So in the end, GL_TEXTURE_3D is the only (legal) target for
glCompressedTex*SubImage3D() which needs additional compression
format checking.  GL_TEXTURE_2D_ARRAY, GL_TEXTURE_CUBE_MAP_ARRAY
and GL_TEXTURE_CUBE_MAP are basically 2D images which support all
compressed formats.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/teximage.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 4fe6ee4f305..0726758076f 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4581,12 +4581,19 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
        *    one of the EAC, ETC2, or RGTC formats and either border is
        *    non-zero, or the effective target for the texture is not
        *    TEXTURE_2D_ARRAY."
+       *
+       * NOTE: that's probably a spec error.  It should probably say
+       *    "... or the effective target for the texture is not
+       *    TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP, nor GL_TEXTURE_CUBE_MAP_ARRAY."
+       * since those targets are 2D images and they support all compression
+       * formats.
+       *
        * Instead of listing all these, just list those which are allowed,
        * which is (at this time) only bptc. Otherwise we'd say s3tc (and more)
        * are valid here, which they are not, but of course not mentioned by
        * core spec.
        */
-      if (target != GL_TEXTURE_2D_ARRAY && target != GL_TEXTURE_CUBE_MAP_ARRAY) {
+      if (target == GL_TEXTURE_3D) {
          switch (format) {
          /* These are the only 3D compression formats supported at this time */
          case GL_COMPRESSED_RGBA_BPTC_UNORM:

From 3afa40e43368b29ca99018999936336c3879fa4d Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 22 Jul 2015 07:53:01 -0600
Subject: [PATCH 0627/1208] mesa: do more thorough target checking in
 compressed_subtexture_target_check()

When we're error-checking the target, we also need to check if the
corresponding extension is supported.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/teximage.c | 66 ++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 0726758076f..949eef0e285 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4555,13 +4555,15 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
    case 2:
       switch (target) {
       case GL_TEXTURE_2D:
+         targetOK = GL_TRUE;
+         break;
       case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
       case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
       case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-         targetOK = GL_TRUE;
+         targetOK = ctx->Extensions.ARB_texture_cube_map;
          break;
       default:
          targetOK = GL_FALSE;
@@ -4569,31 +4571,40 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
       }
       break;
    case 3:
-      targetOK = (target == GL_TEXTURE_3D) ||
-                 (target == GL_TEXTURE_2D_ARRAY) ||
-                 (target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
-                 (target == GL_TEXTURE_CUBE_MAP && dsa);
-
-      /* OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture
-       * Images:
-       *    "An INVALID_OPERATION error is generated by
-       *    CompressedTex*SubImage3D if the internal format of the texture is
-       *    one of the EAC, ETC2, or RGTC formats and either border is
-       *    non-zero, or the effective target for the texture is not
-       *    TEXTURE_2D_ARRAY."
-       *
-       * NOTE: that's probably a spec error.  It should probably say
-       *    "... or the effective target for the texture is not
-       *    TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP, nor GL_TEXTURE_CUBE_MAP_ARRAY."
-       * since those targets are 2D images and they support all compression
-       * formats.
-       *
-       * Instead of listing all these, just list those which are allowed,
-       * which is (at this time) only bptc. Otherwise we'd say s3tc (and more)
-       * are valid here, which they are not, but of course not mentioned by
-       * core spec.
-       */
-      if (target == GL_TEXTURE_3D) {
+      switch (target) {
+      case GL_TEXTURE_CUBE_MAP:
+         targetOK = dsa && ctx->Extensions.ARB_texture_cube_map;
+         break;
+      case GL_TEXTURE_2D_ARRAY:
+         targetOK = _mesa_is_gles3(ctx) ||
+            (_mesa_is_desktop_gl(ctx) && ctx->Extensions.EXT_texture_array);
+         break;
+      case GL_TEXTURE_CUBE_MAP_ARRAY:
+         targetOK = ctx->Extensions.ARB_texture_cube_map_array;
+         break;
+      case GL_TEXTURE_3D:
+         targetOK = GL_TRUE;
+         /*
+          * OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture
+          * Images:
+          *    "An INVALID_OPERATION error is generated by
+          *    CompressedTex*SubImage3D if the internal format of the texture
+          *    is one of the EAC, ETC2, or RGTC formats and either border is
+          *    non-zero, or the effective target for the texture is not
+          *    TEXTURE_2D_ARRAY."
+          *
+          * NOTE: that's probably a spec error.  It should probably say
+          *    "... or the effective target for the texture is not
+          *    TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP, nor
+          *    GL_TEXTURE_CUBE_MAP_ARRAY."
+          * since those targets are 2D images and they support all compression
+          * formats.
+          *
+          * Instead of listing all these, just list those which are allowed,
+          * which is (at this time) only bptc. Otherwise we'd say s3tc (and
+          * more) are valid here, which they are not, but of course not
+          * mentioned by core spec.
+          */
          switch (format) {
          /* These are the only 3D compression formats supported at this time */
          case GL_COMPRESSED_RGBA_BPTC_UNORM:
@@ -4610,6 +4621,9 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
                         _mesa_enum_to_string(format));
             return GL_TRUE;
          }
+         break;
+      default:
+         targetOK = GL_FALSE;
       }
 
       break;

From d7cb3f76f5c175c95ffbbec40eea5976493f8681 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Jul 2015 07:41:09 -0600
Subject: [PATCH 0628/1208] st/mesa: add comments on a few sampler view
 functions

Trivial.
---
 src/mesa/state_tracker/st_texture.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 6beb21e3389..52b094330b9 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -462,6 +462,11 @@ st_texture_get_sampler_view(struct st_context *st,
    return free;
 }
 
+
+/**
+ * For the given texture object, release any sampler views which belong
+ * to the calling context.
+ */
 void
 st_texture_release_sampler_view(struct st_context *st,
                                 struct st_texture_object *stObj)
@@ -478,6 +483,11 @@ st_texture_release_sampler_view(struct st_context *st,
    }
 }
 
+
+/**
+ * Release all sampler views attached to the given texture object, regardless
+ * of the context.
+ */
 void
 st_texture_release_all_sampler_views(struct st_context *st,
                                      struct st_texture_object *stObj)

From 43b69aad195f5abfc2c8c75bfa2ff31e5b99fbab Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Jul 2015 07:43:11 -0600
Subject: [PATCH 0629/1208] st/mesa: remove unused 'samp' function parameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_atom_texture.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index ba3cf9beeb4..1e315332560 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -232,7 +232,6 @@ static unsigned last_layer(struct st_texture_object *stObj)
 static struct pipe_sampler_view *
 st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 					  struct st_texture_object *stObj,
-                                          const struct gl_sampler_object *samp,
 					  enum pipe_format format)
 {
    struct pipe_sampler_view templ;
@@ -283,7 +282,6 @@ st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 static struct pipe_sampler_view *
 st_get_texture_sampler_view_from_stobj(struct st_context *st,
                                        struct st_texture_object *stObj,
-                                       const struct gl_sampler_object *samp,
 				       enum pipe_format format)
 {
    struct pipe_sampler_view **sv;
@@ -318,7 +316,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
    }
 
    if (!*sv) {
-      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, samp, format);
+      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, format);
 
    } else if ((*sv)->context != st->pipe) {
       /* Recreate view in correct context, use existing view as template */
@@ -374,8 +372,8 @@ update_single_texture(struct st_context *st,
       }
    }
 
-   *sampler_view = st_get_texture_sampler_view_from_stobj(st, stObj, samp,
-							  view_format);
+   *sampler_view = st_get_texture_sampler_view_from_stobj(st, stObj,
+                                                          view_format);
    return GL_TRUE;
 }
 

From dd86fbeaaa136c4ddfd255286f4975d869e799a0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Jul 2015 07:47:25 -0600
Subject: [PATCH 0630/1208] mesa: fix _mesa_error() compiler warnings in
 shaderapi.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix many instances of:
main/shaderapi.c: In function '_mesa_GetSubroutineUniformLocation':
main/shaderapi.c:2176:7: warning: format not a string literal and no format arguments [-Wformat-security]
       _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
       ^

Ideally, many of these error messages should be improved to indicate
which argument is incorrect as we do in other parts of Mesa.

Reviewed-by: Kai Wasserbäch <kai@dev.carbon-project.org>
Tested-by: Kai Wasserbäch <kai@dev.carbon-project.org>
---
 src/mesa/main/shaderapi.c | 66 +++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index b86f22ef9c5..e96e27a8ba2 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2173,12 +2173,12 @@ _mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
@@ -2188,7 +2188,7 @@ _mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    if (!shProg->_LinkedShaders[stage]) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
@@ -2208,12 +2208,12 @@ _mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
@@ -2223,14 +2223,14 @@ _mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    if (!shProg->_LinkedShaders[stage]) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
    }
 
    resource_type = _mesa_shader_stage_to_subroutine(stage);
    res = _mesa_program_resource_find_name(shProg, resource_type, name);
    if (!res) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
      return -1;
    }
 
@@ -2253,12 +2253,12 @@ _mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
    int count, i, j;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2271,7 +2271,7 @@ _mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
 
    sh = shProg->_LinkedShaders[stage];
    if (!sh) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2315,7 +2315,7 @@ _mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
       }
       break;
    default:
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 }
@@ -2333,12 +2333,12 @@ _mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2348,7 +2348,7 @@ _mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    if (!shProg->_LinkedShaders[stage]) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2372,12 +2372,12 @@ _mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2387,7 +2387,7 @@ _mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    if (!shProg->_LinkedShaders[stage]) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
    resource_type = _mesa_shader_stage_to_subroutine(stage);
@@ -2409,30 +2409,30 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
    int i;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    shProg = ctx->_Shader->CurrentProgram[stage];
    if (!shProg) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    sh = shProg->_LinkedShaders[stage];
    if (!sh) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (count != sh->NumSubroutineUniformRemapTable) {
-      _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
       return;
    }
 
@@ -2445,7 +2445,7 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
       for (j = i; j < i + uni_count; j++) {
          struct gl_subroutine_function *subfn;
          if (indices[j] >= sh->NumSubroutineFunctions) {
-            _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+            _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
             return;
          }
 
@@ -2455,7 +2455,7 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
                break;
          }
          if (k == subfn->num_compat_types) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+            _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
             return;
          }
       }
@@ -2489,30 +2489,30 @@ _mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    shProg = ctx->_Shader->CurrentProgram[stage];
    if (!shProg) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    sh = shProg->_LinkedShaders[stage];
    if (!sh) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (location >= sh->NumSubroutineUniformRemapTable) {
-      _mesa_error(ctx, GL_INVALID_VALUE, api_name);
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
       return;
    }
 
@@ -2536,12 +2536,12 @@ _mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
    gl_shader_stage stage;
 
    if (!_mesa_has_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2552,7 +2552,7 @@ _mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
    stage = _mesa_shader_enum_to_shader_stage(shadertype);
    sh = shProg->_LinkedShaders[stage];
    if (!sh) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, api_name);
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
    }
 
@@ -2606,7 +2606,7 @@ _mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
       break;
    }
    default:
-      _mesa_error(ctx, GL_INVALID_ENUM, api_name);
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", api_name);
       values[0] = -1;
       break;
    }

From 28db89fa8b3f59d35032c0576fbd0c74739b3c87 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Jul 2015 10:04:13 -0600
Subject: [PATCH 0631/1208] mesa: minor clean-ups in shaderapi.c

80-column wrapping.  Move break statements.  Indentation fixes.
---
 src/mesa/main/shaderapi.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index e96e27a8ba2..0887c68763e 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1561,7 +1561,7 @@ read_shader(const char *fname)
  */
 void GLAPIENTRY
 _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count,
-                      const GLcharARB * const * string, const GLint * length)
+                   const GLcharARB * const * string, const GLint * length)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLint *offsets;
@@ -1986,7 +1986,8 @@ _mesa_use_shader_program(struct gl_context *ctx, GLenum type,
 
 static GLuint
 _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate,
-                            GLenum type, GLsizei count, const GLchar* const *strings)
+                            GLenum type, GLsizei count,
+                            const GLchar* const *strings)
 {
    const GLuint shader = create_shader(ctx, type);
    GLuint program = 0;
@@ -2071,20 +2072,20 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst->UsesClipDistanceOut = src->Geom.UsesClipDistance;
       dst_gp->UsesEndPrimitive = src->Geom.UsesEndPrimitive;
       dst_gp->UsesStreams = src->Geom.UsesStreams;
-   }
       break;
+   }
    case MESA_SHADER_FRAGMENT: {
       struct gl_fragment_program *dst_fp = (struct gl_fragment_program *) dst;
       dst_fp->FragDepthLayout = src->FragDepthLayout;
-   }
       break;
+   }
    case MESA_SHADER_COMPUTE: {
       struct gl_compute_program *dst_cp = (struct gl_compute_program *) dst;
       int i;
       for (i = 0; i < 3; i++)
          dst_cp->LocalSize[i] = src->Comp.LocalSize[i];
-   }
       break;
+   }
    default:
       break;
    }
@@ -2311,7 +2312,8 @@ _mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
    case GL_UNIFORM_NAME_LENGTH:
       res = _mesa_program_resource_find_index(shProg, resource_type, index);
       if (res) {
-         values[0] = strlen(_mesa_program_resource_name(res)) + 1 + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);;
+         values[0] = strlen(_mesa_program_resource_name(res)) + 1
+            + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);;
       }
       break;
    default:
@@ -2596,7 +2598,8 @@ _mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
       for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) {
          res = _mesa_program_resource_find_index(shProg, resource_type, i);
          if (res) {
-            const GLint len = strlen(_mesa_program_resource_name(res)) + 1+  ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);
+            const GLint len = strlen(_mesa_program_resource_name(res)) + 1
+               + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);
 
             if (len > max_len)
                max_len = len;

From bc893e3dad74622b971e295f60a022f179ca9942 Mon Sep 17 00:00:00 2001
From: Rhys Kidd <rhyskidd@gmail.com>
Date: Wed, 22 Jul 2015 22:14:00 -0600
Subject: [PATCH 0632/1208] doxygen: Correct grammatical typo in
 math/m_vector.h

Signed-off-by: Rhys Kidd <rhyskidd@gmail.com>

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/math/m_vector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/math/m_vector.h b/src/mesa/math/m_vector.h
index 8551ee7520e..3b7f5834c56 100644
--- a/src/mesa/math/m_vector.h
+++ b/src/mesa/math/m_vector.h
@@ -51,7 +51,7 @@
 
 /**
  * Wrap all the information about vectors up in a struct.  Has
- * additional fields compared to the other vectors to help us track of
+ * additional fields compared to the other vectors to help us track
  * different vertex sizes, and whether we need to clean columns out
  * because they contain non-(0,0,0,1) values.
  *

From 00fb21e744b045cc9f945021305b85595c35dd69 Mon Sep 17 00:00:00 2001
From: Rhys Kidd <rhyskidd@gmail.com>
Date: Wed, 22 Jul 2015 22:14:00 -0600
Subject: [PATCH 0633/1208] doxygen: Link GLvector4f struct members properly,
 avoiding invalid XML/HTML warning

Signed-off-by: Rhys Kidd <rhyskidd@gmail.com>

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/math/m_vector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/math/m_vector.h b/src/mesa/math/m_vector.h
index 3b7f5834c56..5bd76b8987d 100644
--- a/src/mesa/math/m_vector.h
+++ b/src/mesa/math/m_vector.h
@@ -61,7 +61,7 @@
  */
 typedef struct {
    GLfloat (*data)[4];	/**< may be malloc'd or point to client data */
-   GLfloat *start;	/**< points somewhere inside of <data> */
+   GLfloat *start;	/**< points somewhere inside of GLvector4f::data */
    GLuint count;	/**< size of the vector (in elements) */
    GLuint stride;	/**< stride from one element to the next (in bytes) */
    GLuint size;		/**< 2-4 for vertices and 1-4 for texcoords */

From 22c9339abf00c2ecf40e0d8fd740faafba3ec37b Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Tue, 21 Jul 2015 21:50:29 -0700
Subject: [PATCH 0634/1208] radeon: Silence GCC unused-but-set-variable
 warnings.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

radeon_fbo.c: In function 'radeon_map_renderbuffer_s8z24':
radeon_fbo.c:162:9: warning: variable 'ret' set but not used [-Wunused-but-set-variable]
     int ret;
         ^
radeon_fbo.c: In function 'radeon_map_renderbuffer_z16':
radeon_fbo.c:200:9: warning: variable 'ret' set but not used [-Wunused-but-set-variable]
     int ret;
         ^
radeon_fbo.c: In function 'radeon_map_renderbuffer':
radeon_fbo.c:242:8: warning: variable 'ret' set but not used [-Wunused-but-set-variable]
    int ret;
        ^
radeon_fbo.c: In function 'radeon_unmap_renderbuffer':
radeon_fbo.c:419:14: warning: variable 'ok' set but not used [-Wunused-but-set-variable]
    GLboolean ok;
              ^

Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/dri/radeon/radeon_fbo.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 5e4aaca1e0e..5eece518c95 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -169,6 +169,7 @@ radeon_map_renderbuffer_s8z24(struct gl_context *ctx,
     rrb->map_buffer = malloc(w * h * 4);
     ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
     assert(!ret);
+    (void) ret;
     untiled_s8z24_map = rrb->map_buffer;
     tiled_s8z24_map = rrb->bo->ptr;
 
@@ -207,6 +208,7 @@ radeon_map_renderbuffer_z16(struct gl_context *ctx,
     rrb->map_buffer = malloc(w * h * 2);
     ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
     assert(!ret);
+    (void) ret;
 
     untiled_z16_map = rrb->map_buffer;
     tiled_z16_map = rrb->bo->ptr;
@@ -324,6 +326,7 @@ radeon_map_renderbuffer(struct gl_context *ctx,
 
    ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
    assert(!ret);
+   (void) ret;
 
    map = rrb->bo->ptr;
    stride = rrb->map_pitch;
@@ -416,7 +419,6 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
 {
    struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
    struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
-   GLboolean ok;
 
    if ((rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_DEPTH_ALWAYS_TILED) && !rrb->has_surface) {
        if (rb->Format == MESA_FORMAT_Z24_UNORM_S8_UINT || rb->Format == MESA_FORMAT_Z24_UNORM_X8_UINT) {
@@ -438,6 +440,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
    radeon_bo_unmap(rrb->map_bo);
 
    if (rrb->map_mode & GL_MAP_WRITE_BIT) {
+      GLboolean ok;
       ok = rmesa->vtbl.blit(ctx, rrb->map_bo, 0,
 			    rb->Format, rrb->map_pitch / rrb->cpp,
 			    rrb->map_w, rrb->map_h,
@@ -449,6 +452,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
 			    rrb->map_w, rrb->map_h,
 			    GL_FALSE);
       assert(ok);
+      (void) ok;
    }
 
    radeon_bo_unref(rrb->map_bo);

From 9d60793a03e40e1d139b78fce0144cad57438741 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 23:03:53 -0400
Subject: [PATCH 0635/1208] nvc0/ir: kepler can't do indirect shader
 input/output loads directly

There's a special AL2P instruction (called AFETCH in nv50 ir) which
computes a "physical" value to be used with indirect addressing with ALD.

Fixes

  tcs-input-array-*-index-rd
  tcs-output-array-*-index-wr

varying-indexing tessellation tests on Kepler.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  1 +
 .../nouveau/codegen/nv50_ir_emit_gk110.cpp    | 21 +++++++++++++++++++
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp    | 15 +++++++++++++
 .../nouveau/codegen/nv50_ir_emit_nvc0.cpp     | 16 ++++++++++++++
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 19 +++++++++++++++--
 .../drivers/nouveau/codegen/nv50_ir_print.cpp |  1 +
 .../nouveau/codegen/nv50_ir_target.cpp        |  6 +++---
 .../nouveau/codegen/nv50_ir_target_nv50.cpp   |  2 +-
 8 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 174dca49ec0..3ddaeafebbd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -106,6 +106,7 @@ enum operation
    OP_MEMBAR, // memory barrier (mfence, lfence, sfence)
    OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base
    OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1]
+   OP_AFETCH, // fetch base address of shader input (a[%r1+0x10])
    OP_EXPORT,
    OP_LINTERP,
    OP_PINTERP,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 44d3a5efecf..f06056f8f17 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -77,6 +77,7 @@ private:
    void emitMOV(const Instruction *);
 
    void emitINTERP(const Instruction *);
+   void emitAFETCH(const Instruction *);
    void emitPFETCH(const Instruction *);
    void emitVFETCH(const Instruction *);
    void emitEXPORT(const Instruction *);
@@ -1338,6 +1339,23 @@ CodeEmitterGK110::emitFlow(const Instruction *i)
    }
 }
 
+void
+CodeEmitterGK110::emitAFETCH(const Instruction *i)
+{
+   uint32_t offset = i->src(0).get()->reg.data.offset & 0x7ff;
+
+   code[0] = 0x00000002 | (offset << 23);
+   code[1] = 0x7d000000 | (offset >> 9);
+
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[1] |= 0x8;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+   srcId(i->src(0).getIndirect(0), 10);
+}
+
 void
 CodeEmitterGK110::emitPFETCH(const Instruction *i)
 {
@@ -1707,6 +1725,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
    case OP_EXPORT:
       emitEXPORT(insn);
       break;
+   case OP_AFETCH:
+      emitAFETCH(insn);
+      break;
    case OP_PFETCH:
       emitPFETCH(insn);
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 65c1f23e101..ef5c87d0437 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -174,6 +174,7 @@ private:
    void emitALD();
    void emitAST();
    void emitISBERD();
+   void emitAL2P();
    void emitIPA();
 
    void emitPIXLD();
@@ -2203,6 +2204,17 @@ CodeEmitterGM107::emitISBERD()
    emitGPR (0x00, insn->def(0));
 }
 
+void
+CodeEmitterGM107::emitAL2P()
+{
+   emitInsn (0xefa00000);
+   emitField(0x2f, 2, (insn->getDef(0)->reg.size / 4) - 1);
+   emitO    (0x20);
+   emitField(0x14, 11, insn->src(0).get()->reg.data.offset);
+   emitGPR  (0x08, insn->src(0).getIndirect(0));
+   emitGPR  (0x00, insn->def(0));
+}
+
 void
 CodeEmitterGM107::emitIPA()
 {
@@ -2759,6 +2771,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
    case OP_PFETCH:
       emitISBERD();
       break;
+   case OP_AFETCH:
+      emitAL2P();
+      break;
    case OP_LINTERP:
    case OP_PINTERP:
       emitIPA();
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 472e3a84119..3ed815bad39 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -85,6 +85,7 @@ private:
    void emitCCTL(const Instruction *);
 
    void emitINTERP(const Instruction *);
+   void emitAFETCH(const Instruction *);
    void emitPFETCH(const Instruction *);
    void emitVFETCH(const Instruction *);
    void emitEXPORT(const Instruction *);
@@ -1493,6 +1494,21 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
    }
 }
 
+void
+CodeEmitterNVC0::emitAFETCH(const Instruction *i)
+{
+   code[0] = 0x00000006;
+   code[1] = 0x0c000000 | (i->src(0).get()->reg.data.offset & 0x7ff);
+
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[0] |= 0x200;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 14);
+   srcId(i->src(0).getIndirect(0), 20);
+}
+
 void
 CodeEmitterNVC0::emitPFETCH(const Instruction *i)
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index cd8ee71ad2b..710f53de1c4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1749,6 +1749,7 @@ NVC0LoweringPass::checkPredicate(Instruction *insn)
 bool
 NVC0LoweringPass::visit(Instruction *i)
 {
+   bool ret = true;
    bld.setPosition(i, false);
 
    if (i->cc != CC_ALWAYS)
@@ -1780,7 +1781,8 @@ NVC0LoweringPass::visit(Instruction *i)
    case OP_SQRT:
       return handleSQRT(i);
    case OP_EXPORT:
-      return handleEXPORT(i);
+      ret = handleEXPORT(i);
+      break;
    case OP_EMIT:
    case OP_RESTART:
       return handleOUT(i);
@@ -1843,7 +1845,20 @@ NVC0LoweringPass::visit(Instruction *i)
    default:
       break;
    }
-   return true;
+
+   /* Kepler+ has a special opcode to compute a new base address to be used
+    * for indirect loads.
+    */
+   if (targ->getChipset() >= NVISA_GK104_CHIPSET && !i->perPatch &&
+       (i->op == OP_VFETCH || i->op == OP_EXPORT) && i->src(0).isIndirect(0)) {
+      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
+                                      cloneShallow(func, i->getSrc(0)));
+      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
+      i->src(0).get()->reg.data.offset = 0;
+      i->setIndirect(0, 0, afetch->getDef(0));
+   }
+
+   return ret;
 }
 
 bool
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 556d01b864d..9ebdc6586db 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -135,6 +135,7 @@ const char *operationStr[OP_LAST + 1] =
    "membar",
    "vfetch",
    "pfetch",
+   "afetch",
    "export",
    "linterp",
    "pinterp",
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index 7992f539782..fe530c76b62 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -41,7 +41,7 @@ const uint8_t Target::operationSrcNr[] =
    0, 0, 0, 0, 0,          // BRA, CALL, RET, CONT, BREAK,
    0, 0, 0,                // PRERET,CONT,BREAK
    0, 0, 0, 0, 0, 0,       // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR
-   1, 1, 2, 1, 2,          // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP
+   1, 1, 1, 2, 1, 2,       // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP
    1, 1,                   // EMIT, RESTART
    1, 1, 1,                // TEX, TXB, TXL,
    1, 1, 1, 1, 1, 1, 2,    // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP
@@ -96,8 +96,8 @@ const OpClass Target::operationClass[] =
    OPCLASS_FLOW, OPCLASS_FLOW,
    // MEMBAR
    OPCLASS_CONTROL,
-   // VFETCH, PFETCH, EXPORT
-   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
+   // VFETCH, PFETCH, AFETCH, EXPORT
+   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE,
    // LINTERP, PINTERP
    OPCLASS_SFU, OPCLASS_SFU,
    // EMIT, RESTART
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index ca545a6024a..f3ddcaa5199 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -118,7 +118,7 @@ void TargetNV50::initOpInfo()
    static const uint32_t shortForm[(OP_LAST + 31) / 32] =
    {
       // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF
-      0x00014e40, 0x00000040, 0x00000498, 0x00000000
+      0x00014e40, 0x00000040, 0x00000930, 0x00000000
    };
    static const operation noDestList[] =
    {

From 24a7d4e437e27c758c2848e887ceaf1d4a55ae50 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 24 Jul 2015 00:21:28 -0400
Subject: [PATCH 0636/1208] nvc0/ir: per-patch vars are in a separate address
 space

There's no need to attempt to avoid overlapping generic i/o with patch
i/o. By the same token, we can't merge patch and non-patch loads/stores.

This fixes at least the

  tes-both-input-array-*-index-rd

tessellation variable-indexing tests.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_peephole.cpp       |  2 ++
 .../drivers/nouveau/nvc0/nvc0_program.c        | 18 +++++++-----------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index e9648aa7b91..cea96dcdfc5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2085,6 +2085,8 @@ MemoryOpt::runOpt(BasicBlock *bb)
       }
       if (ldst->getPredicate()) // TODO: handle predicated ld/st
          continue;
+      if (ldst->perPatch) // TODO: create separate per-patch lists
+         continue;
 
       if (isLoad) {
          DataFile file = ldst->src(0).getFile();
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index b0b8486b6df..507a2507fe3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -31,7 +31,7 @@
  * 124 scalar varying values.
  */
 static uint32_t
-nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
+nvc0_shader_input_address(unsigned sn, unsigned si)
 {
    switch (sn) {
    case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
@@ -42,7 +42,7 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
    case TGSI_SEMANTIC_PSIZE:        return 0x06c;
    case TGSI_SEMANTIC_POSITION:     return 0x070;
-   case TGSI_SEMANTIC_GENERIC:      return ubase + si * 0x10;
+   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
    case TGSI_SEMANTIC_FOG:          return 0x2e8;
    case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
@@ -61,7 +61,7 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
 }
 
 static uint32_t
-nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
+nvc0_shader_output_address(unsigned sn, unsigned si)
 {
    switch (sn) {
    case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
@@ -72,7 +72,7 @@ nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
    case TGSI_SEMANTIC_PSIZE:         return 0x06c;
    case TGSI_SEMANTIC_POSITION:      return 0x070;
-   case TGSI_SEMANTIC_GENERIC:       return ubase + si * 0x10;
+   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
    case TGSI_SEMANTIC_FOG:           return 0x2e8;
    case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
@@ -97,7 +97,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
       case TGSI_SEMANTIC_VERTEXID:
          info->in[i].mask = 0x1;
          info->in[i].slot[0] =
-            nvc0_shader_input_address(info->in[i].sn, 0, 0) / 4;
+            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
          continue;
       default:
          break;
@@ -113,13 +113,11 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
 static int
 nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
 {
-   unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
    unsigned offset;
    unsigned i, c;
 
    for (i = 0; i < info->numInputs; ++i) {
-      offset = nvc0_shader_input_address(info->in[i].sn,
-                                         info->in[i].si, ubase);
+      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
 
       for (c = 0; c < 4; ++c)
          info->in[i].slot[c] = (offset + c * 0x4) / 4;
@@ -154,13 +152,11 @@ nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
 static int
 nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
 {
-   unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
    unsigned offset;
    unsigned i, c;
 
    for (i = 0; i < info->numOutputs; ++i) {
-      offset = nvc0_shader_output_address(info->out[i].sn,
-                                          info->out[i].si, ubase);
+      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
 
       for (c = 0; c < 4; ++c)
          info->out[i].slot[c] = (offset + c * 0x4) / 4;

From 30f97b5e52b324d501c56df8902d294fb755a5b7 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Thu, 23 Jul 2015 10:38:36 +0200
Subject: [PATCH 0637/1208] glsl/glcpp: fix SIGSEGV when checking error
 condition for macro redefinition

Commit a6e9cd14c does not take into account than node_{a,b}->next could be NULL
in some circumstances, such as in a shader containing this code:

  #define A 1 /* comment */
  #define A 1 /* comment */

This patch fixes the segmentation fault for cases like that.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91290
Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/glsl/glcpp/glcpp-parse.y | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 7ef4dfd5316..dd5ec2a30b5 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -1074,9 +1074,9 @@ _token_list_equal_ignoring_space (token_list_t *a, token_list_t *b)
 		 */
 		if (node_a->token->type == SPACE
 		    && node_b->token->type == SPACE) {
-			while (node_a->token->type == SPACE)
+			while (node_a && node_a->token->type == SPACE)
 				node_a = node_a->next;
-			while (node_b->token->type == SPACE)
+			while (node_b && node_b->token->type == SPACE)
 				node_b = node_b->next;
 			continue;
 		}

From 013d731a67538a2eb8f508fa54bb86191f0e5491 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 Jul 2015 11:12:57 +0100
Subject: [PATCH 0638/1208] i965: Use updated kernel interface for accurate
 TIMESTAMP reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I was mistaken, I thought we already had fixed this in the kernel a
couple of years ago. We had not, and the broken read (the hardware
shifts the register output on 64bit kernels, but not on 32bit kernels) is
now enshrined into the ABI. I also had the buggy architecture reversed,
believing it to be 32bit that had the shifted results. On the basis of
those mistakes, I wrote

commit c8d3ebaffc0d7d915c1c19d54dba61fd1e57b338
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Wed Apr 29 13:32:38 2015 +0100

    i965: Query whether we have kernel support for the TIMESTAMP register once

Now that we do have an extended register read interface for always
reporting the full 36bit TIMESTAMP (irrespective of whether the hardware
is buggy or not), make use of it and in the process fix my reversed
detection of the buggy reads for unpatched kernels.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Martin Peres <martin.peres@linux.intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Tested-and-acked-by: Chris Forbes <chrisf@ijw.co.nz>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
---
 src/mesa/drivers/dri/i965/brw_queryobj.c | 15 +++++--
 src/mesa/drivers/dri/i965/intel_screen.c | 51 +++++++++++++++++-------
 src/mesa/drivers/dri/i965/intel_screen.h |  2 +-
 3 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index aea4d9b77d3..d6b012c392e 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -497,13 +497,22 @@ brw_get_timestamp(struct gl_context *ctx)
    struct brw_context *brw = brw_context(ctx);
    uint64_t result = 0;
 
-   drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+   switch (brw->intelScreen->hw_has_timestamp) {
+   case 3: /* New kernel, always full 36bit accuracy */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP | 1, &result);
+      break;
+   case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+      result = result >> 32;
+      break;
+   case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+      break;
+   }
 
    /* See logic in brw_queryobj_get_results() */
-   result = result >> 32;
    result *= 80;
    result &= (1ull << 36) - 1;
-
    return result;
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 1470b059d9b..65a17664188 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1123,25 +1123,48 @@ intel_detect_swizzling(struct intel_screen *screen)
       return true;
 }
 
-static bool
+static int
 intel_detect_timestamp(struct intel_screen *screen)
 {
-   uint64_t dummy = 0;
-   int loop = 10;
+   uint64_t dummy = 0, last = 0;
+   int upper, lower, loops;
 
-   /*
-    * On 32bit systems, some old kernels trigger a hw bug resulting in the
-    * TIMESTAMP register being shifted and the low 32bits always zero. Detect
-    * this by repeating the read a few times and check the register is
-    * incrementing every 80ns as expected and not stuck on zero (as would be
-    * the case with the buggy kernel/hw.).
+   /* On 64bit systems, some old kernels trigger a hw bug resulting in the
+    * TIMESTAMP register being shifted and the low 32bits always zero.
+    *
+    * More recent kernels offer an interface to read the full 36bits
+    * everywhere.
     */
-   do {
-      if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &dummy))
-	 return false;
-   } while ((dummy & 0xffffffff) == 0 && --loop);
+   if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP | 1, &dummy) == 0)
+      return 3;
 
-   return loop > 0;
+   /* Determine if we have a 32bit or 64bit kernel by inspecting the
+    * upper 32bits for a rapidly changing timestamp.
+    */
+   if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &last))
+      return 0;
+
+   upper = lower = 0;
+   for (loops = 0; loops < 10; loops++) {
+      /* The TIMESTAMP should change every 80ns, so several round trips
+       * through the kernel should be enough to advance it.
+       */
+      if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &dummy))
+         return 0;
+
+      upper += (dummy >> 32) != (last >> 32);
+      if (upper > 1) /* beware 32bit counter overflow */
+         return 2; /* upper dword holds the low 32bits of the timestamp */
+
+      lower += (dummy & 0xffffffff) != (last & 0xffffffff);
+      if (lower > 1)
+         return 1; /* timestamp is unshifted */
+
+      last = dummy;
+   }
+
+   /* No advancement? No timestamp! */
+   return 0;
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index a741c94e0ff..fd5143eecba 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -52,7 +52,7 @@ struct intel_screen
 
    bool hw_has_swizzling;
 
-   bool hw_has_timestamp;
+   int hw_has_timestamp;
 
    /**
     * Does the kernel support resource streamer?

From 7974e23be9ff7586e5250cff321b6ec7749ecc44 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 19 May 2015 17:44:52 -0700
Subject: [PATCH 0639/1208] mesa: Turn get_readpixels_transfer_ops() in to a
 global function

This utility function is utilized in a later patch.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/readpix.c | 16 +++++++++-------
 src/mesa/main/readpix.h |  6 ++++++
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 32e31ffae2f..25a09873fe0 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -64,9 +64,11 @@ _mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
 /**
  * Return transfer op flags for this ReadPixels operation.
  */
-static GLbitfield
-get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
-                            GLenum format, GLenum type, GLboolean uses_blit)
+GLbitfield
+_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
+                                  mesa_format texFormat,
+                                  GLenum format, GLenum type,
+                                  GLboolean uses_blit)
 {
    GLbitfield transferOps = ctx->_ImageTransferState;
 
@@ -153,8 +155,8 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
       }
 
       /* And finally, see if there are any transfer ops. */
-      return get_readpixels_transfer_ops(ctx, rb->Format, format, type,
-                                         uses_blit) != 0;
+      return _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format, type,
+                                               uses_blit) != 0;
    }
    return GL_FALSE;
 }
@@ -420,8 +422,8 @@ read_rgba_pixels( struct gl_context *ctx,
    if (!rb)
       return;
 
-   transferOps = get_readpixels_transfer_ops(ctx, rb->Format, format, type,
-                                             GL_FALSE);
+   transferOps = _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format,
+                                                   type, GL_FALSE);
    /* Describe the dst format */
    dst_is_integer = _mesa_is_enum_format_integer(format);
    dst_stride = _mesa_image_row_stride(packing, width, format, type);
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index 1636dd9ce3e..f8940361fb4 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -40,6 +40,12 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 extern GLboolean
 _mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
 
+extern GLbitfield
+_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
+                                  mesa_format texFormat,
+                                  GLenum format, GLenum type,
+                                  GLboolean uses_blit);
+
 extern void
 _mesa_readpixels(struct gl_context *ctx,
                  GLint x, GLint y, GLsizei width, GLsizei height,

From 1252d53c19ec005c17ca666cecb7db072d77e5ce Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 20 May 2015 10:21:39 -0700
Subject: [PATCH 0640/1208] meta: Fix transfer operations check in meta pbo
 path for readpixels

Currently used ctx->_ImageTransferState check is not sufficient
because it doesn't include the read color clamping enabled with
GL_CLAMP_READ_COLOR. So, use the helper function
_mesa_get_readpixels_transfer_ops().

Also, transfer operations don't affect glGetTexImage(). So, do
the check only for glReadPixles.

Without this patch, arb_color_buffer_float-readpixels test fails, when
forced to use meta pbo path.

V2: Add a comment and bump up the commit message.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta_tex_subimage.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index d2474f52718..90d78e55fa8 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -273,12 +273,17 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
        format == GL_COLOR_INDEX)
       return false;
 
-   if (ctx->_ImageTransferState)
-      return false;
-
-
+   /* Don't use meta path for readpixels in below conditions. */
    if (!tex_image) {
       rb = ctx->ReadBuffer->_ColorReadBuffer;
+
+      /* _mesa_get_readpixels_transfer_ops() includes the cases of read
+       * color clamping along with the ctx->_ImageTransferState.
+       */
+      if (_mesa_get_readpixels_transfer_ops(ctx, rb->Format, format,
+                                            type, GL_FALSE))
+         return false;
+
       if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
          return false;
    }

From 0d207905e675b778739236072e7a4dfba7cd7959 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 20 May 2015 10:22:45 -0700
Subject: [PATCH 0641/1208] meta: Abort meta pbo path if readpixels need
 signed-unsigned conversion

Meta pbo path for ReadPixels rely on BlitFramebuffer which doesn't support
signed to unsigned integer conversions and vice versa.

Without this patch, piglit test fbo_integer_readpixels_sint_uint fails, when
forced to use the meta pbo path.

v2: Make need_signed_unsigned_int_conversion() a static function. (Iago)
    Bump up the comment and the commit message. (Jason)

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Iago Toral <itoral@igalia.com>
---
 src/mesa/drivers/common/meta_tex_subimage.c | 26 +++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index 90d78e55fa8..6d7c1eb65b7 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -248,6 +248,24 @@ fail:
    return success;
 }
 
+static bool
+need_signed_unsigned_int_conversion(mesa_format rbFormat,
+                                    GLenum format, GLenum type)
+{
+   const GLenum srcType = _mesa_get_format_datatype(rbFormat);
+   const bool is_dst_format_integer = _mesa_is_enum_format_integer(format);
+   return (srcType == GL_INT &&
+           is_dst_format_integer &&
+           (type == GL_UNSIGNED_INT ||
+            type == GL_UNSIGNED_SHORT ||
+            type == GL_UNSIGNED_BYTE)) ||
+          (srcType == GL_UNSIGNED_INT &&
+           is_dst_format_integer &&
+           (type == GL_INT ||
+            type == GL_SHORT ||
+            type == GL_BYTE));
+}
+
 bool
 _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                               struct gl_texture_image *tex_image,
@@ -286,6 +304,14 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
 
       if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
          return false;
+
+      /* This function rely on BlitFramebuffer to fill in the pixel data for
+       * ReadPixels. But, BlitFrameBuffer doesn't support signed to unsigned
+       * or unsigned to signed integer conversions. OpenGL spec expects an
+       * invalid operation in that case.
+       */
+      if (need_signed_unsigned_int_conversion(rb->Format, format, type))
+         return false;
    }
 
    /* For arrays, use a tall (height * depth) 2D texture but taking into

From ca4e17e03e9aeaa04fe6bb04bfe2d6f97991005b Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 6 May 2015 05:43:08 -0700
Subject: [PATCH 0642/1208] meta: Don't do fragment color clamping in
 _mesa_meta_pbo_GetTexSubImage

_mesa_meta_pbo_GetTexSubImage() uses _mesa_meta_BlitFrameBuffer(),
which will do fragment clamping if enabled. But fragment clamping
doesn't affect ReadPixels and GetTexImage.

Without this patch, piglit test arb_color_buffer_float-clear fails,
when forced to use the meta pbo path.

v2: Apply this fix to both glReadPixels and glGetTexImage.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta_tex_subimage.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index 6d7c1eb65b7..038d5259b3f 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -25,6 +25,7 @@
  *    Jason Ekstrand <jason.ekstrand@intel.com>
  */
 
+#include "blend.h"
 #include "bufferobj.h"
 #include "buffers.h"
 #include "fbobject.h"
@@ -331,6 +332,10 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER |
                            MESA_META_PIXEL_STORE));
 
+   /* GL_CLAMP_FRAGMENT_COLOR doesn't affect ReadPixels and GettexImage */
+   if (ctx->Extensions.ARB_color_buffer_float)
+      _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);
+
    _mesa_GenFramebuffers(2, fbos);
 
    if (tex_image && tex_image->TexObject->Target == GL_TEXTURE_1D_ARRAY) {

From bbbefec7323d0a338346233ab4ab715bcf4e1b78 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Thu, 11 Jun 2015 16:44:45 -0700
Subject: [PATCH 0643/1208] mesa: Set green, blue channels to zero only for
 formats with these components

This is an optimization which avoids setting pixel transfer operations
when not required. _mesa_ReadPixels falls back to slower path if
transfer operations are set.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index ca10bbe81f6..826f2239d11 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -3159,9 +3159,16 @@ decompress_texture_image(struct gl_context *ctx,
        * returned as red and two-channel texture values are returned as
        * red/alpha.
        */
-      if ((baseTexFormat == GL_LUMINANCE ||
-           baseTexFormat == GL_LUMINANCE_ALPHA ||
-           baseTexFormat == GL_INTENSITY) ||
+      if (((baseTexFormat == GL_LUMINANCE ||
+            baseTexFormat == GL_LUMINANCE_ALPHA ||
+            baseTexFormat == GL_INTENSITY) &&
+           (destBaseFormat == GL_RGBA ||
+            destBaseFormat == GL_RGB ||
+            destBaseFormat == GL_RG ||
+            destBaseFormat == GL_GREEN ||
+            destBaseFormat == GL_BLUE ||
+            destBaseFormat == GL_BGRA ||
+            destBaseFormat == GL_BGR)) ||
           /* If we're reading back an RGB(A) texture (using glGetTexImage) as
 	   * luminance then we need to return L=tex(R).
 	   */

From 0127580647ee23d543228f0b7f42bd688e76f2bd Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 12 Jun 2015 12:09:05 -0700
Subject: [PATCH 0644/1208] mesa: Add a helper function
 _mesa_unpack_format_to_base_format()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/glformats.c | 44 +++++++++++++++++++++++++++++++++++++++
 src/mesa/main/glformats.h |  3 +++
 2 files changed, 47 insertions(+)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index ae97079836e..6688abe7c2c 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -1278,6 +1278,50 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format)
    }
 }
 
+/**
+ * Convert various unpack formats to the corresponding base format.
+ */
+GLenum
+_mesa_unpack_format_to_base_format(GLenum format)
+{
+   switch(format) {
+   case GL_RED_INTEGER:
+      return GL_RED;
+   case GL_GREEN_INTEGER:
+      return GL_GREEN;
+   case GL_BLUE_INTEGER:
+      return GL_BLUE;
+   case GL_ALPHA_INTEGER:
+      return GL_ALPHA;
+   case GL_RG_INTEGER:
+      return GL_RG;
+   case GL_RGB_INTEGER:
+      return GL_RGB;
+   case GL_RGBA_INTEGER:
+      return GL_RGBA;
+   case GL_BGR_INTEGER:
+      return GL_BGR;
+   case GL_BGRA_INTEGER:
+      return GL_BGRA;
+   case GL_LUMINANCE_INTEGER_EXT:
+      return GL_LUMINANCE;
+   case GL_LUMINANCE_ALPHA_INTEGER_EXT:
+      return GL_LUMINANCE_ALPHA;
+   case GL_RED:
+   case GL_GREEN:
+   case GL_BLUE:
+   case GL_RG:
+   case GL_RGB:
+   case GL_RGBA:
+   case GL_BGR:
+   case GL_BGRA:
+   case GL_ALPHA:
+   case GL_LUMINANCE:
+   case GL_LUMINANCE_ALPHA:
+   default:
+      return format;
+   }
+}
 
 /**
  * Convert various base formats to the cooresponding integer format.
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 8881cb7d86b..419955a6033 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -101,6 +101,9 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format);
 extern GLenum
 _mesa_base_format_to_integer_format(GLenum format);
 
+extern GLenum
+_mesa_unpack_format_to_base_format(GLenum format);
+
 extern GLboolean
 _mesa_base_format_has_channel(GLenum base_format, GLenum pname);
 

From 9fff00d387cacf7820c344324820cab764541762 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 12 Jun 2015 12:11:01 -0700
Subject: [PATCH 0645/1208] meta: Use _mesa_unpack_format_to_base_format() to
 handle integer formats

Replace a call to mesa_base_tex_format() that handles only internal
formats with a call to the new _mesa_unpack_format_to_base_format()
function that handles allowed unpack formats and does not care for
internal formats at all.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 826f2239d11..5c46efc45d3 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -3150,7 +3150,7 @@ decompress_texture_image(struct gl_context *ctx,
    /* read pixels from renderbuffer */
    {
       GLenum baseTexFormat = texImage->_BaseFormat;
-      GLenum destBaseFormat = _mesa_base_tex_format(ctx, destFormat);
+      GLenum destBaseFormat = _mesa_unpack_format_to_base_format(destFormat);
 
       /* The pixel transfer state will be set to default values at this point
        * (see MESA_META_PIXEL_TRANSFER) so pixel transfer ops are effectively
@@ -3176,9 +3176,7 @@ decompress_texture_image(struct gl_context *ctx,
             baseTexFormat == GL_RGB  ||
             baseTexFormat == GL_RG) &&
           (destBaseFormat == GL_LUMINANCE ||
-           destBaseFormat == GL_LUMINANCE_ALPHA ||
-           destBaseFormat == GL_LUMINANCE_INTEGER_EXT ||
-           destBaseFormat == GL_LUMINANCE_ALPHA_INTEGER_EXT))) {
+           destBaseFormat == GL_LUMINANCE_ALPHA))) {
          /* Green and blue must be zero */
          _mesa_PixelTransferf(GL_GREEN_SCALE, 0.0f);
          _mesa_PixelTransferf(GL_BLUE_SCALE, 0.0f);

From c59c0f8a42652603da7f89e3270897cb685fe76b Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Thu, 11 Jun 2015 16:48:26 -0700
Subject: [PATCH 0646/1208] mesa: Add a helper function
 _mesa_need_luminance_to_rgb_conversion()

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/readpix.c | 18 ++++++++++++++++++
 src/mesa/main/readpix.h |  4 ++++
 2 files changed, 22 insertions(+)

diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 25a09873fe0..ac46f7e47de 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -60,6 +60,24 @@ _mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
            format == GL_LUMINANCE_ALPHA_INTEGER_EXT);
 }
 
+/**
+ * Return true if the conversion L,I to RGB conversion is needed.
+ */
+GLboolean
+_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat)
+{
+   return (srcBaseFormat == GL_LUMINANCE ||
+           srcBaseFormat == GL_LUMINANCE_ALPHA ||
+           srcBaseFormat == GL_INTENSITY) &&
+          (dstBaseFormat == GL_GREEN ||
+           dstBaseFormat == GL_BLUE ||
+           dstBaseFormat == GL_RG ||
+           dstBaseFormat == GL_RGB ||
+           dstBaseFormat == GL_BGR ||
+           dstBaseFormat == GL_RGBA ||
+           dstBaseFormat == GL_BGRA);
+}
 
 /**
  * Return transfer op flags for this ReadPixels operation.
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index f8940361fb4..41116e430ec 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -40,6 +40,10 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 extern GLboolean
 _mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
 
+extern GLboolean
+_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat);
+
 extern GLbitfield
 _mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
                                   mesa_format texFormat,

From be405ee334ec758a2609d8780221f4f1a1ed3343 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Thu, 11 Jun 2015 17:23:34 -0700
Subject: [PATCH 0647/1208] meta: Use _mesa_need_luminance_to_rgb_conversion()
 in decompress_texture_image()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 5c46efc45d3..b7c91bfd65e 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -3159,16 +3159,8 @@ decompress_texture_image(struct gl_context *ctx,
        * returned as red and two-channel texture values are returned as
        * red/alpha.
        */
-      if (((baseTexFormat == GL_LUMINANCE ||
-            baseTexFormat == GL_LUMINANCE_ALPHA ||
-            baseTexFormat == GL_INTENSITY) &&
-           (destBaseFormat == GL_RGBA ||
-            destBaseFormat == GL_RGB ||
-            destBaseFormat == GL_RG ||
-            destBaseFormat == GL_GREEN ||
-            destBaseFormat == GL_BLUE ||
-            destBaseFormat == GL_BGRA ||
-            destBaseFormat == GL_BGR)) ||
+      if (_mesa_need_luminance_to_rgb_conversion(baseTexFormat,
+                                                 destBaseFormat) ||
           /* If we're reading back an RGB(A) texture (using glGetTexImage) as
 	   * luminance then we need to return L=tex(R).
 	   */

From aa40546b2de4cd572af02d31fd5c7d4045505ea2 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 12 May 2015 05:46:04 -0700
Subject: [PATCH 0648/1208] meta: Fix reading luminance texture as rgba in
 _mesa_meta_pbo_GetTexSubImage()

After recent addition of pbo testing in piglit test getteximage-luminance,
it fails on i965. This patch makes a sub test pass.

This patch adds a clear color operation to meta pbo path, which I think is
better than falling back to software path.

V2: Fix color mask for GL_LUMINANCE_ALPHA

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta_tex_subimage.c | 36 +++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index 038d5259b3f..8c0a50984d2 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -28,6 +28,7 @@
 #include "blend.h"
 #include "bufferobj.h"
 #include "buffers.h"
+#include "clear.h"
 #include "fbobject.h"
 #include "glformats.h"
 #include "glheader.h"
@@ -279,8 +280,9 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    int full_height, image_height;
    struct gl_texture_image *pbo_tex_image;
    struct gl_renderbuffer *rb = NULL;
-   GLenum status;
-   bool success = false;
+   GLenum status, src_base_format;
+   bool success = false, clear_channels_to_zero = false;
+   float save_clear_color[4];
    int z;
 
    if (!_mesa_is_bufferobj(packing->BufferObj))
@@ -381,6 +383,27 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                                   GL_COLOR_BUFFER_BIT, GL_NEAREST))
       goto fail;
 
+   src_base_format = tex_image ?
+                     tex_image->_BaseFormat :
+                     ctx->ReadBuffer->_ColorReadBuffer->_BaseFormat;
+
+   /* Depending on the base formats involved we might need to rebase some
+    * values. For example if we download from a Luminance format to RGBA
+    * format, we want G=0 and B=0.
+    */
+   clear_channels_to_zero =
+      _mesa_need_luminance_to_rgb_conversion(src_base_format,
+                                             pbo_tex_image->_BaseFormat);
+
+   if (clear_channels_to_zero) {
+      memcpy(save_clear_color, ctx->Color.ClearColor.f, 4 * sizeof(float));
+      /* Clear the Green, Blue channels. */
+      _mesa_ColorMask(GL_FALSE, GL_TRUE, GL_TRUE,
+                      src_base_format != GL_LUMINANCE_ALPHA);
+      _mesa_ClearColor(0.0, 0.0, 0.0, 1.0);
+      _mesa_Clear(GL_COLOR_BUFFER_BIT);
+   }
+
    for (z = 1; z < depth; z++) {
       _mesa_meta_bind_fbo_image(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
                                 tex_image, zoffset + z);
@@ -393,6 +416,15 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                                  0, z * image_height,
                                  width, z * image_height + height,
                                  GL_COLOR_BUFFER_BIT, GL_NEAREST);
+      if (clear_channels_to_zero)
+         _mesa_Clear(GL_COLOR_BUFFER_BIT);
+   }
+
+   /* Unmask the color channels and restore the saved clear color values. */
+   if (clear_channels_to_zero) {
+      _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+      _mesa_ClearColor(save_clear_color[0], save_clear_color[1],
+                       save_clear_color[2], save_clear_color[3]);
    }
 
    success = true;

From 4b8745680ff45cd7adc7896c06263e14b8d347ce Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 12 Jun 2015 14:42:57 -0700
Subject: [PATCH 0649/1208] mesa: Change the signature of
 _mesa_need_rgb_to_luminance_conversion()

This allows us to handle cases when texImage->_BaseFormat doesn't match
_mesa_format_get_base_format(texImage->Format). _BaseFormat is what we
care about in this function.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta_tex_subimage.c |  4 ++-
 src/mesa/main/readpix.c                     | 28 +++++++++++----------
 src/mesa/main/readpix.h                     |  3 ++-
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index 8c0a50984d2..16d8f5d4747 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -280,6 +280,7 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    int full_height, image_height;
    struct gl_texture_image *pbo_tex_image;
    struct gl_renderbuffer *rb = NULL;
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
    GLenum status, src_base_format;
    bool success = false, clear_channels_to_zero = false;
    float save_clear_color[4];
@@ -305,7 +306,8 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                                             type, GL_FALSE))
          return false;
 
-      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
+      if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat,
+                                                 dstBaseFormat))
          return false;
 
       /* This function rely on BlitFramebuffer to fill in the pixel data for
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index ac46f7e47de..9d72c1c31b0 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -47,17 +47,14 @@
  * Return true if the conversion L=R+G+B is needed.
  */
 GLboolean
-_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
+_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat)
 {
-   GLenum baseTexFormat = _mesa_get_format_base_format(texFormat);
-
-   return (baseTexFormat == GL_RG ||
-           baseTexFormat == GL_RGB ||
-           baseTexFormat == GL_RGBA) &&
-          (format == GL_LUMINANCE ||
-           format == GL_LUMINANCE_ALPHA ||
-           format == GL_LUMINANCE_INTEGER_EXT ||
-           format == GL_LUMINANCE_ALPHA_INTEGER_EXT);
+   return (srcBaseFormat == GL_RG ||
+           srcBaseFormat == GL_RGB ||
+           srcBaseFormat == GL_RGBA) &&
+          (dstBaseFormat == GL_LUMINANCE ||
+           dstBaseFormat == GL_LUMINANCE_ALPHA);
 }
 
 /**
@@ -89,6 +86,8 @@ _mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
                                   GLboolean uses_blit)
 {
    GLbitfield transferOps = ctx->_ImageTransferState;
+   GLenum srcBaseFormat = _mesa_get_format_base_format(texFormat);
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    if (format == GL_DEPTH_COMPONENT ||
        format == GL_DEPTH_STENCIL ||
@@ -125,7 +124,7 @@ _mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
     * have any effect anyway.
     */
    if (_mesa_get_format_datatype(texFormat) == GL_UNSIGNED_NORMALIZED &&
-       !_mesa_need_rgb_to_luminance_conversion(texFormat, format)) {
+       !_mesa_need_rgb_to_luminance_conversion(srcBaseFormat, dstBaseFormat)) {
       transferOps &= ~IMAGE_CLAMP_BIT;
    }
 
@@ -148,6 +147,7 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 {
    struct gl_renderbuffer *rb =
          _mesa_get_read_renderbuffer_for_format(ctx, format);
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    assert(rb);
 
@@ -168,7 +168,8 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 
    default:
       /* Color formats. */
-      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) {
+      if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat,
+                                                 dstBaseFormat)) {
          return GL_TRUE;
       }
 
@@ -436,6 +437,7 @@ read_rgba_pixels( struct gl_context *ctx,
    uint8_t rebase_swizzle[4];
    struct gl_framebuffer *fb = ctx->ReadBuffer;
    struct gl_renderbuffer *rb = fb->_ColorReadBuffer;
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    if (!rb)
       return;
@@ -447,7 +449,7 @@ read_rgba_pixels( struct gl_context *ctx,
    dst_stride = _mesa_image_row_stride(packing, width, format, type);
    dst_format = _mesa_format_from_format_and_type(format, type);
    convert_rgb_to_lum =
-      _mesa_need_rgb_to_luminance_conversion(rb->Format, format);
+      _mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat, dstBaseFormat);
    dst = (GLubyte *) _mesa_image_address2d(packing, pixels, width, height,
                                            format, type, 0, 0);
 
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index 41116e430ec..481ad9d9c37 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -38,7 +38,8 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
                                  GLenum type, GLboolean uses_blit);
 
 extern GLboolean
-_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
+_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat);
 
 extern GLboolean
 _mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat,

From a9cbb2c722615e11818066cbe33006c5cfc43381 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 12 Jun 2015 14:58:46 -0700
Subject: [PATCH 0650/1208] meta: Use _mesa_need_rgb_to_luminance_conversion()
 in decompress_texture_image()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/common/meta.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index b7c91bfd65e..bde544ef490 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -3164,11 +3164,8 @@ decompress_texture_image(struct gl_context *ctx,
           /* If we're reading back an RGB(A) texture (using glGetTexImage) as
 	   * luminance then we need to return L=tex(R).
 	   */
-          ((baseTexFormat == GL_RGBA ||
-            baseTexFormat == GL_RGB  ||
-            baseTexFormat == GL_RG) &&
-          (destBaseFormat == GL_LUMINANCE ||
-           destBaseFormat == GL_LUMINANCE_ALPHA))) {
+          _mesa_need_rgb_to_luminance_conversion(baseTexFormat,
+                                                 destBaseFormat)) {
          /* Green and blue must be zero */
          _mesa_PixelTransferf(GL_GREEN_SCALE, 0.0f);
          _mesa_PixelTransferf(GL_BLUE_SCALE, 0.0f);

From 56980f107ef64d0a5bfc5d292cc891661e47d0f0 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 26 Jun 2015 15:39:40 -0700
Subject: [PATCH 0651/1208] mesa: Fix typo in a comment

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/main/glformats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 6688abe7c2c..c3fd7341ebc 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -1324,7 +1324,7 @@ _mesa_unpack_format_to_base_format(GLenum format)
 }
 
 /**
- * Convert various base formats to the cooresponding integer format.
+ * Convert various base formats to the corresponding integer format.
  */
 GLenum
 _mesa_base_format_to_integer_format(GLenum format)

From f8059c9f3fdd270370737c9eff369eb6d14caa0b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 23 Jul 2015 20:18:57 -0400
Subject: [PATCH 0652/1208] mesa: fix error checking for getting zero-sized
 texture images

Commit 17f714836 (mesa: rearrange texture error checking order) moved
the width/height/depth == 0 allowance before checking if the image was
there. This was in part due to depth having to be == 1 for 2D images and
width having to be == 1 for 1D images. Instead relax the height/depth
checks to also accept 0 as valid.

With this change,

  bin/arb_direct_state_access-get-textures

starts passing again.

Fixes: 17f714836 (mesa: rearrange texture error checking order)
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/texgetimage.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index cdbd6187708..c0ccce3d50e 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -928,13 +928,6 @@ dimensions_error_check(struct gl_context *ctx,
    const struct gl_texture_image *texImage;
    int i;
 
-   if (width == 0 || height == 0 || depth == 0) {
-      /* Not an error, but nothing to do.  Return 'true' so that the
-       * caller simply returns.
-       */
-      return true;
-   }
-
    if (xoffset < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset);
       return true;
@@ -973,7 +966,7 @@ dimensions_error_check(struct gl_context *ctx,
                      "%s(1D, yoffset = %d)", caller, yoffset);
          return true;
       }
-      if (height != 1) {
+      if (height > 1) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "%s(1D, height = %d)", caller, height);
          return true;
@@ -987,7 +980,7 @@ dimensions_error_check(struct gl_context *ctx,
                      "%s(zoffset = %d)", caller, zoffset);
          return true;
       }
-      if (depth != 1) {
+      if (depth > 1) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "%s(depth = %d)", caller, depth);
          return true;
@@ -1086,6 +1079,13 @@ dimensions_error_check(struct gl_context *ctx,
       }
    }
 
+   if (width == 0 || height == 0 || depth == 0) {
+      /* Not an error, but nothing to do.  Return 'true' so that the
+       * caller simply returns.
+       */
+      return true;
+   }
+
    return false;
 }
 

From b42444ffed87114e82522dd81d3e5540c21a128c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 24 Jul 2015 17:06:22 -0400
Subject: [PATCH 0653/1208] glsl: recognize ARB_shading_language_420pack to be
 enabled with 4.20+

The 420pack extension enables various GLSL rules that need to be applied
to any GLSL 4.20+ shader even if the extension is not explicitly
enabled.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/glsl_parser.yy       | 18 +++++++++---------
 src/glsl/glsl_parser_extras.h |  5 +++++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 7de31d9b40e..4cce5b8b284 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -934,7 +934,7 @@ parameter_qualifier:
       if (($1.flags.q.in || $1.flags.q.out) && ($2.flags.q.in || $2.flags.q.out))
          _mesa_glsl_error(&@1, state, "duplicate in/out/inout qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.q.constant)
+      if (!state->has_420pack() && $2.flags.q.constant)
          _mesa_glsl_error(&@1, state, "in/out/inout must come after const "
                                       "or precise");
 
@@ -946,7 +946,7 @@ parameter_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.i != 0)
+      if (!state->has_420pack() && $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
@@ -1458,7 +1458,7 @@ layout_qualifier_id:
          }
       }
 
-      if ((state->ARB_shading_language_420pack_enable ||
+      if ((state->has_420pack() ||
            state->has_atomic_counters() ||
            state->ARB_shader_storage_buffer_object_enable) &&
           match_layout_qualifier("binding", $1, state) == 0) {
@@ -1729,7 +1729,7 @@ type_qualifier:
       if ($2.flags.q.invariant)
          _mesa_glsl_error(&@1, state, "duplicate \"invariant\" qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.q.precise)
+      if (!state->has_420pack() && $2.flags.q.precise)
          _mesa_glsl_error(&@1, state,
                           "\"invariant\" must come after \"precise\"");
 
@@ -1762,7 +1762,7 @@ type_qualifier:
       if ($2.has_interpolation())
          _mesa_glsl_error(&@1, state, "duplicate interpolation qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant)) {
          _mesa_glsl_error(&@1, state, "interpolation qualifiers must come "
                           "after \"precise\" or \"invariant\"");
@@ -1782,7 +1782,7 @@ type_qualifier:
        * precise qualifiers since these are useful in ARB_separate_shader_objects.
        * There is no clear spec guidance on this either.
        */
-      if (!state->ARB_shading_language_420pack_enable && $2.has_layout())
+      if (!state->has_420pack() && $2.has_layout())
          _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
 
       $$ = $1;
@@ -1800,7 +1800,7 @@ type_qualifier:
                           "duplicate auxiliary storage qualifier (centroid or sample)");
       }
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant ||
            $2.has_interpolation() || $2.has_layout())) {
          _mesa_glsl_error(&@1, state, "auxiliary storage qualifiers must come "
@@ -1818,7 +1818,7 @@ type_qualifier:
       if ($2.has_storage())
          _mesa_glsl_error(&@1, state, "duplicate storage qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant || $2.has_interpolation() ||
            $2.has_layout() || $2.has_auxiliary_storage())) {
          _mesa_glsl_error(&@1, state, "storage qualifiers must come after "
@@ -1834,7 +1834,7 @@ type_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.i != 0)
+      if (!state->has_420pack() && $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index b65d53db641..eb325f04eed 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -231,6 +231,11 @@ struct _mesa_glsl_parse_state {
       return ARB_gpu_shader_fp64_enable || is_version(400, 0);
    }
 
+   bool has_420pack() const
+   {
+      return ARB_shading_language_420pack_enable || is_version(420, 0);
+   }
+
    void process_version_directive(YYLTYPE *locp, int version,
                                   const char *ident);
 

From 730e8c4410d73bf9db2c1768af8cf6e98e24cc73 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 16 Jul 2015 04:38:41 +0100
Subject: [PATCH 0654/1208] radeonsi: separate out load sample position
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is prep work for reusing this in the interpolation
code later.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 44 ++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cddd9a0e120..77ba9c9e0a1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1030,6 +1030,31 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou
 			       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 }
 
+static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
+{
+	struct si_shader_context *si_shader_ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
+	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
+	struct gallivm_state *gallivm = &radeon_bld->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
+	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
+
+	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
+	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
+	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
+
+	LLVMValueRef pos[4] = {
+		buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
+		buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
+		lp_build_const_float(gallivm, 0),
+		lp_build_const_float(gallivm, 0)
+	};
+
+	return lp_build_gather_values(gallivm, pos, 4);
+}
+
 static void declare_system_value(
 	struct radeon_llvm_context * radeon_bld,
 	unsigned index,
@@ -1081,25 +1106,8 @@ static void declare_system_value(
 		break;
 
 	case TGSI_SEMANTIC_SAMPLEPOS:
-	{
-		LLVMBuilderRef builder = gallivm->builder;
-		LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
-		LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
-		LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
-
-		/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
-		LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, get_sample_id(radeon_bld), 8);
-		LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
-
-		LLVMValueRef pos[4] = {
-			buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
-			buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
-			lp_build_const_float(gallivm, 0),
-			lp_build_const_float(gallivm, 0)
-		};
-		value = lp_build_gather_values(gallivm, pos, 4);
+		value = load_sample_position(radeon_bld, get_sample_id(radeon_bld));
 		break;
-	}
 
 	case TGSI_SEMANTIC_SAMPLEMASK:
 		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.

From 4b6c1efb225777231459de54903484367d0b1ca1 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 17 Jul 2015 04:43:09 +0100
Subject: [PATCH 0655/1208] radeonsi: split out interpolation input selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is prep work for using it in the interpolation code
later.

Also add storage for the input interpolation mode so we
can pick it up later.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 62 ++++++++++++++----------
 src/gallium/drivers/radeonsi/si_shader.h |  2 +-
 2 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 77ba9c9e0a1..5a3de96a846 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -836,6 +836,35 @@ static LLVMValueRef fetch_input_gs(
 				tgsi2llvmtype(bld_base, type), "");
 }
 
+static int lookup_interp_param_index(unsigned interpolate, unsigned location)
+{
+	switch (interpolate) {
+	case TGSI_INTERPOLATE_CONSTANT:
+		return 0;
+
+	case TGSI_INTERPOLATE_LINEAR:
+		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
+			return SI_PARAM_LINEAR_SAMPLE;
+		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
+			return SI_PARAM_LINEAR_CENTROID;
+		else
+			return SI_PARAM_LINEAR_CENTER;
+		break;
+	case TGSI_INTERPOLATE_COLOR:
+	case TGSI_INTERPOLATE_PERSPECTIVE:
+		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
+			return SI_PARAM_PERSP_SAMPLE;
+		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
+			return SI_PARAM_PERSP_CENTROID;
+		else
+			return SI_PARAM_PERSP_CENTER;
+		break;
+	default:
+		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
+		return -1;
+	}
+}
+
 static void declare_input_fs(
 	struct radeon_llvm_context *radeon_bld,
 	unsigned input_index,
@@ -850,7 +879,8 @@ static void declare_input_fs(
 	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
 	LLVMValueRef main_fn = radeon_bld->main_fn;
 
-	LLVMValueRef interp_param;
+	LLVMValueRef interp_param = NULL;
+	int interp_param_idx;
 	const char * intr_name;
 
 	/* This value is:
@@ -899,31 +929,13 @@ static void declare_input_fs(
 	attr_number = lp_build_const_int32(gallivm,
 					   shader->ps_input_param_offset[input_index]);
 
-	switch (decl->Interp.Interpolate) {
-	case TGSI_INTERPOLATE_CONSTANT:
-		interp_param = 0;
-		break;
-	case TGSI_INTERPOLATE_LINEAR:
-		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_SAMPLE);
-		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTROID);
-		else
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTER);
-		break;
-	case TGSI_INTERPOLATE_COLOR:
-	case TGSI_INTERPOLATE_PERSPECTIVE:
-		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_SAMPLE);
-		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTROID);
-		else
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTER);
-		break;
-	default:
-		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
+	shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate;
+	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
+						     decl->Interp.Location);
+	if (interp_param_idx == -1)
 		return;
-	}
+	else if (interp_param_idx)
+		interp_param = LLVMGetParam(main_fn, interp_param_idx);
 
 	/* fs.constant returns the param from the middle vertex, so it's not
 	 * really useful for flat shading. It's meant to be used for custom
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b52714d5abe..82e9c915965 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -258,7 +258,7 @@ struct si_shader {
 	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
 	unsigned		ps_input_param_offset[PIPE_MAX_SHADER_INPUTS];
-
+	unsigned		ps_input_interpolate[PIPE_MAX_SHADER_INPUTS];
 	bool			uses_instanceid;
 	unsigned		nr_pos_exports;
 	unsigned		nr_param_exports;

From b0654e368b1741083055efd281b981db4fb5724b Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 13 Jul 2015 00:07:09 +0100
Subject: [PATCH 0656/1208] radeonsi: add support for indirect samplers (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds the frontend support, however the llvm
backend produces the wrong pattern, however
we can conditionalise enabling ARB_gpu_shader5
on whatever version of llvm we fix this in.

v2: drop unneeded sampler_indirect checks (Marek)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                             |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 49 ++++++++++++++++++++----
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 9d3d5e03b66..1f2771abecc 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -98,7 +98,7 @@ GL 4.0, GLSL 4.00:
   GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
   - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (r600, softpipe)
+  - Dynamically uniform sampler array indices          DONE (r600, radeonsi, softpipe)
   - Dynamically uniform UBO array indices              DONE (r600)
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5a3de96a846..d4a7d3b3617 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2252,9 +2252,29 @@ static void tex_fetch_args(
 	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
 	unsigned count = 0;
 	unsigned chan;
-	unsigned sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
-	unsigned sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
+	unsigned sampler_src;
+	unsigned sampler_index;
 	bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
+	LLVMValueRef res_ptr, samp_ptr;
+
+	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
+	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
+
+	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
+		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
+		LLVMValueRef ind_index;
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
+
+		res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
+
+		samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+		samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
+	} else {
+		res_ptr = si_shader_ctx->resources[sampler_index];
+		samp_ptr = si_shader_ctx->samplers[sampler_index];
+	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
 		LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128);
@@ -2263,7 +2283,7 @@ static void tex_fetch_args(
 		LLVMTypeRef v16i8 = LLVMVectorType(i8, 16);
 
 		/* Bitcast and truncate v8i32 to v16i8. */
-		LLVMValueRef res = si_shader_ctx->resources[sampler_index];
+		LLVMValueRef res = res_ptr;
 		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
 		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
 		res = LLVMBuildBitCast(gallivm->builder, res, v16i8, "");
@@ -2489,7 +2509,7 @@ static void tex_fetch_args(
 	}
 
 	/* Resource */
-	emit_data->args[1] = si_shader_ctx->resources[sampler_index];
+	emit_data->args[1] = res_ptr;
 
 	if (opcode == TGSI_OPCODE_TXF) {
 		/* add tex offsets */
@@ -2572,7 +2592,7 @@ static void tex_fetch_args(
 			dmask = 1 << gather_comp;
 		}
 
-		emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
+		emit_data->args[2] = samp_ptr;
 		emit_data->args[3] = lp_build_const_int32(gallivm, dmask);
 		emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */
 		emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */
@@ -2588,7 +2608,7 @@ static void tex_fetch_args(
 			LLVMFloatTypeInContext(gallivm->context),
 			4);
 	} else {
-		emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
+		emit_data->args[2] = samp_ptr;
 		emit_data->args[3] = lp_build_const_int32(gallivm, target);
 		emit_data->arg_count = 4;
 
@@ -2734,13 +2754,26 @@ static void txq_fetch_args(
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	unsigned target = inst->Texture.Texture;
+	LLVMValueRef res_ptr;
+
+	if (inst->Src[1].Register.Indirect) {
+		const struct tgsi_full_src_register *reg = &inst->Src[1];
+		LLVMValueRef ind_index;
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
+
+		res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr,
+						   ind_index);
+	} else
+		res_ptr = si_shader_ctx->resources[inst->Src[1].Register.Index];
 
 	if (target == TGSI_TEXTURE_BUFFER) {
 		LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 		LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
 
 		/* Read the size from the buffer descriptor directly. */
-		LLVMValueRef size = si_shader_ctx->resources[inst->Src[1].Register.Index];
+		LLVMValueRef size = res_ptr;
 		size = LLVMBuildBitCast(gallivm->builder, size, v8i32, "");
 		size = LLVMBuildExtractElement(gallivm->builder, size,
 					      lp_build_const_int32(gallivm, 6), "");
@@ -2752,7 +2785,7 @@ static void txq_fetch_args(
 	emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
 
 	/* Resource */
-	emit_data->args[1] = si_shader_ctx->resources[inst->Src[1].Register.Index];
+	emit_data->args[1] = res_ptr;
 
 	/* Texture target */
 	if (target == TGSI_TEXTURE_CUBE_ARRAY ||

From 7b40d92f0d0661c05c1afa59555905b2c37e594f Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 13 Jul 2015 09:12:18 +0100
Subject: [PATCH 0657/1208] radeonsi: ubo indexing support (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is required as part of ARB_gpu_shader5.

no backend changes are required for this, or if
any are, it's the same ones as for samplers.

v2: use get_indirect_index (Marek)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                             |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 1f2771abecc..e3fa1a1ec1b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -99,7 +99,7 @@ GL 4.0, GLSL 4.00:
   GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600, radeonsi, softpipe)
-  - Dynamically uniform UBO array indices              DONE (r600)
+  - Dynamically uniform UBO array indices              DONE (r600, radeonsi)
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
   - Packing/bitfield/conversion functions              DONE (r600, radeonsi, softpipe)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d4a7d3b3617..81f7bdb3472 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1191,7 +1191,7 @@ static LLVMValueRef fetch_constant(
 	const struct tgsi_ind_register *ireg = &reg->Indirect;
 	unsigned buf, idx;
 
-	LLVMValueRef addr;
+	LLVMValueRef addr, bufp;
 	LLVMValueRef result;
 
 	if (swizzle == LP_CHAN_ALL) {
@@ -1206,7 +1206,7 @@ static LLVMValueRef fetch_constant(
 	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
 	idx = reg->Register.Index * 4 + swizzle;
 
-	if (!reg->Register.Indirect) {
+	if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
 		if (type != TGSI_TYPE_DOUBLE)
 			return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
 		else {
@@ -1216,13 +1216,22 @@ static LLVMValueRef fetch_constant(
 		}
 	}
 
+	if (reg->Register.Dimension && reg->Dimension.Indirect) {
+		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+		LLVMValueRef index;
+		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
+						   reg->Dimension.Index);
+		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
+	} else
+		bufp = si_shader_ctx->const_resource[buf];
+
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
 	addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
 	addr = lp_build_add(&bld_base->uint_bld, addr,
 			    lp_build_const_int32(base->gallivm, idx * 4));
 
-	result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+	result = buffer_load_const(base->gallivm->builder, bufp,
 				   addr, bld_base->base.elem_type);
 
 	if (type != TGSI_TYPE_DOUBLE)

From a818faa6ddcfa6cd90a24b70c49ec76573954111 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 11 Jul 2015 12:47:03 -0400
Subject: [PATCH 0658/1208] nvc0: fix geometry program revalidation of clipping
 params

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 945e2a6c9bd..ce1119c284d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -339,7 +339,7 @@ nvc0_check_program_ucps(struct nvc0_context *nvc0,
       nvc0_vertprog_validate(nvc0);
    else
    if (likely(vp == nvc0->gmtyprog))
-      nvc0_vertprog_validate(nvc0);
+      nvc0_gmtyprog_validate(nvc0);
    else
       nvc0_tevlprog_validate(nvc0);
 }

From e39ece0d7856d0532a0f011cd5cb17bc85ee82e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Jul 2015 19:47:06 +0200
Subject: [PATCH 0659/1208] st/mesa: don't ignore texture buffer state changes

Fixes piglit:
  spec@arb_texture_buffer_range@ranges-2

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_atom_texture.c | 10 +++++-----
 src/mesa/state_tracker/st_context.c      |  1 +
 src/mesa/state_tracker/st_context.h      |  1 +
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index 1e315332560..e80f9893ad5 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -508,7 +508,7 @@ const struct st_tracked_state st_update_fragment_texture = {
    "st_update_texture",					/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_FRAGMENT_PROGRAM,				/* st */
+      ST_NEW_FRAGMENT_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_fragment_textures				/* update */
 };
@@ -518,7 +518,7 @@ const struct st_tracked_state st_update_vertex_texture = {
    "st_update_vertex_texture",				/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_VERTEX_PROGRAM,				/* st */
+      ST_NEW_VERTEX_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_vertex_textures				/* update */
 };
@@ -528,7 +528,7 @@ const struct st_tracked_state st_update_geometry_texture = {
    "st_update_geometry_texture",			/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_GEOMETRY_PROGRAM,				/* st */
+      ST_NEW_GEOMETRY_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_geometry_textures				/* update */
 };
@@ -538,7 +538,7 @@ const struct st_tracked_state st_update_tessctrl_texture = {
    "st_update_tessctrl_texture",			/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_TESSCTRL_PROGRAM,				/* st */
+      ST_NEW_TESSCTRL_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_tessctrl_textures				/* update */
 };
@@ -548,7 +548,7 @@ const struct st_tracked_state st_update_tesseval_texture = {
    "st_update_tesseval_texture",			/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_TESSEVAL_PROGRAM,				/* st */
+      ST_NEW_TESSEVAL_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_tesseval_textures				/* update */
 };
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 44244a1e720..72c23cad4bc 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -314,6 +314,7 @@ static void st_init_driver_flags(struct gl_driver_flags *f)
    f->NewRasterizerDiscard = ST_NEW_RASTERIZER;
    f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER;
    f->NewDefaultTessLevels = ST_NEW_TESS_STATE;
+   f->NewTextureBuffer = ST_NEW_SAMPLER_VIEWS;
 }
 
 struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 8183412ceab..a68f881c81c 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -60,6 +60,7 @@ struct u_upload_mgr;
 #define ST_NEW_UNIFORM_BUFFER          (1 << 8)
 #define ST_NEW_TESSCTRL_PROGRAM        (1 << 9)
 #define ST_NEW_TESSEVAL_PROGRAM        (1 << 10)
+#define ST_NEW_SAMPLER_VIEWS           (1 << 11)
 
 
 struct st_state_flags {

From 9deb614cacbeca3e99724f08254ab1789f34b56c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 24 Jul 2015 00:54:08 +0200
Subject: [PATCH 0660/1208] radeonsi: fix GLSL textureGrad(samplerCube*)
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

+4 piglits

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm.h      |  5 +-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 84 ++++++++++++++-----
 src/gallium/drivers/radeonsi/si_shader.c      | 35 +++++---
 3 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index ef09ead9774..950a51beff3 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -173,8 +173,9 @@ static inline LLVMValueRef bitcast(
 
 
 void radeon_llvm_emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base,
-                                          struct lp_build_emit_data * emit_data,
-                                          LLVMValueRef *coords_arg);
+					  struct lp_build_emit_data * emit_data,
+					  LLVMValueRef *coords_arg,
+					  LLVMValueRef *derivs_arg);
 
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx);
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 39bbdb67778..2362979cf87 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -748,32 +748,24 @@ static void kil_emit(
 	}
 }
 
-void radeon_llvm_emit_prepare_cube_coords(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data,
-		LLVMValueRef *coords_arg)
+static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base,
+					  LLVMValueRef *in, LLVMValueRef *out)
 {
-
-	unsigned target = emit_data->inst->Texture.Texture;
-	unsigned opcode = emit_data->inst->Instruction.Opcode;
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	LLVMTypeRef type = bld_base->base.elem_type;
 	LLVMValueRef coords[4];
 	LLVMValueRef mad_args[3];
-	LLVMValueRef idx;
-	struct LLVMOpaqueValue *cube_vec;
-	LLVMValueRef v;
+	LLVMValueRef v, cube_vec;
 	unsigned i;
 
-	cube_vec = lp_build_gather_values(bld_base->base.gallivm, coords_arg, 4);
+	cube_vec = lp_build_gather_values(bld_base->base.gallivm, in, 4);
 	v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
                             &cube_vec, 1, LLVMReadNoneAttribute);
 
-	for (i = 0; i < 4; ++i) {
-		idx = lp_build_const_int32(gallivm, i);
-		coords[i] = LLVMBuildExtractElement(builder, v, idx, "");
-	}
+	for (i = 0; i < 4; ++i)
+		coords[i] = LLVMBuildExtractElement(builder, v,
+						    lp_build_const_int32(gallivm, i), "");
 
 	coords[2] = build_intrinsic(builder, "fabs",
 			type, &coords[2], 1, LLVMReadNoneAttribute);
@@ -791,10 +783,60 @@ void radeon_llvm_emit_prepare_cube_coords(
 			mad_args[0], mad_args[1], mad_args[2]);
 
 	/* apply xyz = yxw swizzle to cooords */
-	coords[2] = coords[3];
-	coords[3] = coords[1];
-	coords[1] = coords[0];
-	coords[0] = coords[3];
+	out[0] = coords[1];
+	out[1] = coords[0];
+	out[2] = coords[3];
+}
+
+void radeon_llvm_emit_prepare_cube_coords(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data,
+		LLVMValueRef *coords_arg,
+		LLVMValueRef *derivs_arg)
+{
+
+	unsigned target = emit_data->inst->Texture.Texture;
+	unsigned opcode = emit_data->inst->Instruction.Opcode;
+	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef coords[4];
+	unsigned i;
+
+	radeon_llvm_cube_to_2d_coords(bld_base, coords_arg, coords);
+
+	if (opcode == TGSI_OPCODE_TXD && derivs_arg) {
+		LLVMValueRef derivs[4];
+		int axis;
+
+		/* Convert cube derivatives to 2D derivatives. */
+		for (axis = 0; axis < 2; axis++) {
+			LLVMValueRef shifted_cube_coords[4], shifted_coords[4];
+
+			/* Shift the cube coordinates by the derivatives to get
+			 * the cube coordinates of the "neighboring pixel".
+			 */
+			for (i = 0; i < 3; i++)
+				shifted_cube_coords[i] =
+					LLVMBuildFAdd(builder, coords_arg[i],
+						      derivs_arg[axis*3+i], "");
+			shifted_cube_coords[3] = LLVMGetUndef(bld_base->base.elem_type);
+
+			/* Project the shifted cube coordinates onto the face. */
+			radeon_llvm_cube_to_2d_coords(bld_base, shifted_cube_coords,
+						      shifted_coords);
+
+			/* Subtract both sets of 2D coordinates to get 2D derivatives.
+			 * This won't work if the shifted coordinates ended up
+			 * in a different face.
+			 */
+			for (i = 0; i < 2; i++)
+				derivs[axis * 2 + i] =
+					LLVMBuildFSub(builder, shifted_coords[i],
+						      coords[i], "");
+		}
+
+		memcpy(derivs_arg, derivs, sizeof(derivs));
+	}
 
 	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
@@ -864,7 +906,7 @@ static void txp_fetch_args(
 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
 	}
 
 	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
@@ -909,7 +951,7 @@ static void tex_fetch_args(
 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
 	}
 
 	emit_data->arg_count = 1;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 81f7bdb3472..bc0f2171a52 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2255,7 +2255,7 @@ static void tex_fetch_args(
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 	unsigned opcode = inst->Instruction.Opcode;
 	unsigned target = inst->Texture.Texture;
-	LLVMValueRef coords[5];
+	LLVMValueRef coords[5], derivs[6];
 	LLVMValueRef address[16];
 	int ref_pos;
 	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
@@ -2263,6 +2263,7 @@ static void tex_fetch_args(
 	unsigned chan;
 	unsigned sampler_src;
 	unsigned sampler_index;
+	unsigned num_deriv_channels = 0;
 	bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
 	LLVMValueRef res_ptr, samp_ptr;
 
@@ -2361,18 +2362,13 @@ static void tex_fetch_args(
 		}
 	}
 
-	if (target == TGSI_TEXTURE_CUBE ||
-	    target == TGSI_TEXTURE_CUBE_ARRAY ||
-	    target == TGSI_TEXTURE_SHADOWCUBE ||
-	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
-
 	/* Pack user derivatives */
 	if (opcode == TGSI_OPCODE_TXD) {
-		int num_deriv_channels, param;
+		int param, num_src_deriv_channels;
 
 		switch (target) {
 		case TGSI_TEXTURE_3D:
+			num_src_deriv_channels = 3;
 			num_deriv_channels = 3;
 			break;
 		case TGSI_TEXTURE_2D:
@@ -2381,27 +2377,44 @@ static void tex_fetch_args(
 		case TGSI_TEXTURE_SHADOWRECT:
 		case TGSI_TEXTURE_2D_ARRAY:
 		case TGSI_TEXTURE_SHADOW2D_ARRAY:
+			num_src_deriv_channels = 2;
+			num_deriv_channels = 2;
+			break;
 		case TGSI_TEXTURE_CUBE:
 		case TGSI_TEXTURE_SHADOWCUBE:
 		case TGSI_TEXTURE_CUBE_ARRAY:
 		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+			/* Cube derivatives will be converted to 2D. */
+			num_src_deriv_channels = 3;
 			num_deriv_channels = 2;
 			break;
 		case TGSI_TEXTURE_1D:
 		case TGSI_TEXTURE_SHADOW1D:
 		case TGSI_TEXTURE_1D_ARRAY:
 		case TGSI_TEXTURE_SHADOW1D_ARRAY:
+			num_src_deriv_channels = 1;
 			num_deriv_channels = 1;
 			break;
 		default:
 			assert(0); /* no other targets are valid here */
 		}
 
-		for (param = 1; param <= 2; param++)
-			for (chan = 0; chan < num_deriv_channels; chan++)
-				address[count++] = lp_build_emit_fetch(bld_base, inst, param, chan);
+		for (param = 0; param < 2; param++)
+			for (chan = 0; chan < num_src_deriv_channels; chan++)
+				derivs[param * num_src_deriv_channels + chan] =
+					lp_build_emit_fetch(bld_base, inst, param+1, chan);
 	}
 
+	if (target == TGSI_TEXTURE_CUBE ||
+	    target == TGSI_TEXTURE_CUBE_ARRAY ||
+	    target == TGSI_TEXTURE_SHADOWCUBE ||
+	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
+
+	if (opcode == TGSI_OPCODE_TXD)
+		for (int i = 0; i < num_deriv_channels * 2; i++)
+			address[count++] = derivs[i];
+
 	/* Pack texture coordinates */
 	address[count++] = coords[0];
 	if (num_coords > 1)

From bb9d59aed5b01133f4c8e9f131a83b45fce91fdc Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 17 Jul 2015 05:35:30 +0100
Subject: [PATCH 0661/1208] radeonsi: add fine derivate control (v2.1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for fine derivatives and enables
ARB_derivative_control on radeonsi.

(just fell out of my working out interpolation)

v2: cleanup some bits, write a comment
v2.1: take Michel's comment from the mailing list

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                             |  2 +-
 docs/relnotes/10.7.0.html                |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 52 +++++++++++++++++++++---
 4 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index e3fa1a1ec1b..15bb57f441a 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -191,7 +191,7 @@ GL 4.5, GLSL 4.50:
   GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, llvmpipe, softpipe)
   GL_ARB_cull_distance                                 in progress (Tobias)
-  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600)
+  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_direct_state_access                           DONE (all drivers)
   GL_ARB_get_texture_sub_image                         DONE (all drivers)
   GL_ARB_shader_texture_image_samples                  not started
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 26615a83814..afef5256b75 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
+<li>GL_ARB_derivative_control on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c2985b8d224..ebe1f5a68f1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -249,6 +249,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
 	case PIPE_CAP_TGSI_TEXCOORD:
+	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -289,7 +290,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index bc0f2171a52..27b3c72d1c9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2854,6 +2854,35 @@ static void build_txq_intrinsic(const struct lp_build_tgsi_action * action,
 	}
 }
 
+/*
+ * SI implements derivatives using the local data store (LDS)
+ * All writes to the LDS happen in all executing threads at
+ * the same time. TID is the Thread ID for the current
+ * thread and is a value between 0 and 63, representing
+ * the thread's position in the wavefront.
+ *
+ * For the pixel shader threads are grouped into quads of four pixels.
+ * The TIDs of the pixels of a quad are:
+ *
+ *  +------+------+
+ *  |4n + 0|4n + 1|
+ *  +------+------+
+ *  |4n + 2|4n + 3|
+ *  +------+------+
+ *
+ * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
+ * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
+ * the current pixel's column, and masking with 0xfffffffe yields the TID
+ * of the left pixel of the current pixel's row.
+ *
+ * Adding 1 yields the TID of the pixel to the right of the left pixel, and
+ * adding 2 yields the TID of the pixel below the top pixel.
+ */
+/* masks for thread ID. */
+#define TID_MASK_TOP_LEFT 0xfffffffc
+#define TID_MASK_TOP      0xfffffffd
+#define TID_MASK_LEFT     0xfffffffe
+
 static void si_llvm_emit_ddxy(
 	const struct lp_build_tgsi_action * action,
 	struct lp_build_tgsi_context * bld_base,
@@ -2870,6 +2899,8 @@ static void si_llvm_emit_ddxy(
 	LLVMTypeRef i32;
 	unsigned swizzle[4];
 	unsigned c;
+	int idx;
+	unsigned mask;
 
 	i32 = LLVMInt32TypeInContext(gallivm->context);
 
@@ -2879,15 +2910,22 @@ static void si_llvm_emit_ddxy(
 	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
+	if (opcode == TGSI_OPCODE_DDX_FINE)
+		mask = TID_MASK_LEFT;
+	else if (opcode == TGSI_OPCODE_DDY_FINE)
+		mask = TID_MASK_TOP;
+	else
+		mask = TID_MASK_TOP_LEFT;
+
 	indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
-				  lp_build_const_int32(gallivm, 0xfffffffc), "");
+				  lp_build_const_int32(gallivm, mask), "");
 	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
+	/* for DDX we want to next X pixel, DDY next Y pixel. */
+	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
 	indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
-				  lp_build_const_int32(gallivm,
-						       opcode == TGSI_OPCODE_DDX ? 1 : 2),
-				  "");
+				  lp_build_const_int32(gallivm, idx), "");
 	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
@@ -3229,7 +3267,9 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 	if (bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0))
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0))
 		si_shader_ctx->lds =
 			LLVMAddGlobalInAddressSpace(gallivm->module,
 						    LLVMArrayType(i32, 64),
@@ -3722,6 +3762,8 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
+	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
+	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
 
 	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
 	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;

From e1dcd158785606d4e7e9ca5513732b7e6e7b93d9 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sun, 26 Jul 2015 14:38:58 +0100
Subject: [PATCH 0662/1208] Add release notes for 10.6.3

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit ddc976368fef367e464472ebcc2ac4fd89eb9fd8)
---
 docs/relnotes/10.6.3.html | 105 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 docs/relnotes/10.6.3.html

diff --git a/docs/relnotes/10.6.3.html b/docs/relnotes/10.6.3.html
new file mode 100644
index 00000000000..831af4896e9
--- /dev/null
+++ b/docs/relnotes/10.6.3.html
@@ -0,0 +1,105 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.3 Release Notes / July 26, 2015</h1>
+
+<p>
+Mesa 10.6.3 is a bug fix release which fixes bugs found since the 10.6.2 release.
+</p>
+<p>
+Mesa 10.6.3 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90728">Bug 90728</a> - dvd playback with vlc and vdpau causes segmentation fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91337">Bug 91337</a> - OSMesaGetProcAdress(&quot;OSMesaPixelStore&quot;) returns nil</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>osmesa: fix OSMesaPixelsStore typo</li>
+</ul>
+
+<p>Chad Versace (1):</p>
+<ul>
+  <li>mesa: Fix generation of git_sha1.h.tmp for gitlinks</li>
+</ul>
+
+<p>Christian König (2):</p>
+<ul>
+  <li>vl: cleanup video buffer private when the decoder is destroyed</li>
+  <li>st/vdpau: fix mixer size checks</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: Add sha256 checksums for the 10.6.2 release</li>
+  <li>auxiliary/vl: use the correct screen index</li>
+  <li>Update version to 10.6.3</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965/gen9: Use custom MOCS entries set up by the kernel.</li>
+</ul>
+
+<p>Ilia Mirkin (5):</p>
+<ul>
+  <li>nv50, nvc0: enable at least one color RT if alphatest is enabled</li>
+  <li>nvc0/ir: fix txq on indirect samplers</li>
+  <li>nvc0/ir: don't worry about sampler in txq handling</li>
+  <li>gm107/ir: fix indirect txq emission</li>
+  <li>nv50: fix max level clamping on G80</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>program: Allow redundant OPTION ARB_fog_* directives.</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>xa: don't leak fences</li>
+</ul>
+
+
+</div>
+</body>
+</html>

From 11516b8bd1af2709b74a135787b43de55fe6238e Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sun, 26 Jul 2015 15:18:24 +0100
Subject: [PATCH 0663/1208] docs: Add checksums for mesa 10.6.3 tarballs

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit ccef8901de421eae5dcc8affa14218d46cc06593)
---
 docs/relnotes/10.6.3.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/relnotes/10.6.3.html b/docs/relnotes/10.6.3.html
index 831af4896e9..1622c87cde2 100644
--- a/docs/relnotes/10.6.3.html
+++ b/docs/relnotes/10.6.3.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+c27e1e33798e69a6d2d2425aee8ac7b4c0b243066a65dd76cbb182ea31b1c7f2  mesa-10.6.3.tar.gz
+58592e07c350cd2e8969b73fa83048c657a39fe2f13f3b88f5e5818fe2e4676d  mesa-10.6.3.tar.xz
 </pre>
 
 

From d69da58e84448188808488ad1c1c0181b5630a74 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sun, 26 Jul 2015 15:20:31 +0100
Subject: [PATCH 0664/1208] docs: add news item and link release notes for mesa
 10.6.3

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 docs/index.html    | 6 ++++++
 docs/relnotes.html | 1 +
 2 files changed, 7 insertions(+)

diff --git a/docs/index.html b/docs/index.html
index 36b993eeb98..5b097c73e0f 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>July 26 2015</h2>
+<p>
+<a href="relnotes/10.6.3.html">Mesa 10.6.3</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>July 11 2015</h2>
 <p>
 <a href="relnotes/10.6.2.html">Mesa 10.6.2</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 1acbce0551a..8355f1132b3 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.3.html">10.6.3 release notes</a>
 <li><a href="relnotes/10.6.2.html">10.6.2 release notes</a>
 <li><a href="relnotes/10.5.9.html">10.5.9 release notes</a>
 <li><a href="relnotes/10.6.1.html">10.6.1 release notes</a>

From 4b15cb6daa29d4bdd268eac6c2e40fb1503e98fa Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 25 Jul 2015 01:06:20 -0400
Subject: [PATCH 0665/1208] glsl: enable conservative depth, ssbo based on GLSL
 version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add in missed version checks in the GLSL parser

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/glsl_parser.yy | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 4cce5b8b284..2b0c8bd8c6f 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1166,7 +1166,8 @@ layout_qualifier_id:
       /* Layout qualifiers for AMD/ARB_conservative_depth. */
       if (!$$.flags.i &&
           (state->AMD_conservative_depth_enable ||
-           state->ARB_conservative_depth_enable)) {
+           state->ARB_conservative_depth_enable ||
+           state->is_version(420, 0))) {
          if (match_layout_qualifier($1, "depth_any", state) == 0) {
             $$.flags.q.depth_any = 1;
          } else if (match_layout_qualifier($1, "depth_greater", state) == 0) {
@@ -1460,7 +1461,7 @@ layout_qualifier_id:
 
       if ((state->has_420pack() ||
            state->has_atomic_counters() ||
-           state->ARB_shader_storage_buffer_object_enable) &&
+           state->has_shader_storage_buffer_objects()) &&
           match_layout_qualifier("binding", $1, state) == 0) {
          $$.flags.q.explicit_binding = 1;
          $$.binding = $3;

From bc5e2bec303acd7fd962996bf369be5ce0e15cd2 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 23 Jul 2015 15:31:13 -0400
Subject: [PATCH 0666/1208] freedreno/ir3: updated cat6 encoding

Sync updated cat6 encoding from freedreno.git, needed to properly encode
store instructions.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/disasm-a3xx.c       | 227 ++++++++++++------
 .../drivers/freedreno/ir3/instr-a3xx.h        |  87 ++++---
 src/gallium/drivers/freedreno/ir3/ir3.c       |  27 ++-
 src/gallium/drivers/freedreno/ir3/ir3.h       |   4 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |   2 +-
 5 files changed, 232 insertions(+), 115 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 48ae7c71b9f..83ed5ffdca0 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -103,7 +103,7 @@ static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
 	} else if ((reg.num == REG_P0) && !c) {
 		printf("p0.%c", component[reg.comp]);
 	} else {
-		printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
+		printf("%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
 	}
 }
 
@@ -122,6 +122,32 @@ static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
 	print_reg(reg, full, r, c, im, neg, abs, addr_rel);
 }
 
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+struct reginfo {
+	reg_t reg;
+	bool full;
+	bool r;
+	bool c;
+	bool im;
+	bool neg;
+	bool abs;
+	bool addr_rel;
+};
+
+static void print_src(struct reginfo *info)
+{
+	print_reg_src(info->reg, info->full, info->r, info->c, info->im,
+			info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct reginfo *info)
+//{
+//	print_reg_dst(info->reg, info->full, info->addr_rel);
+//}
+
 static void print_instr_cat0(instr_t *instr)
 {
 	instr_cat0_t *cat0 = &instr->cat0;
@@ -454,10 +480,70 @@ static void print_instr_cat6(instr_t *instr)
 {
 	instr_cat6_t *cat6 = &instr->cat6;
 	char sd = 0, ss = 0;  /* dst/src address space */
-	bool full = type_size(cat6->type) == 32;
 	bool nodst = false;
+	struct reginfo dst, src1, src2;
+	int src1off = 0, dstoff = 0;
 
-	printf(".%s ", type[cat6->type]);
+	memset(&dst, 0, sizeof(dst));
+	memset(&src1, 0, sizeof(src1));
+	memset(&src2, 0, sizeof(src2));
+
+	switch (cat6->opc) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	case OPC_L2G:
+	case OPC_G2L:
+		dst.full = true;
+		src1.full = true;
+		src2.full = true;
+		break;
+	case OPC_STG:
+	case OPC_STL:
+	case OPC_STP:
+	case OPC_STI:
+	case OPC_STLW:
+	case OPC_STGB_4D_4:
+	case OPC_STIB:
+		dst.full  = true;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	default:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = true;
+		src2.full = true;
+		break;
+	}
+
+	switch (cat6->opc) {
+	case OPC_PREFETCH:
+	case OPC_RESINFO:
+		break;
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		ss = cat6->g ? 'g' : 'l';
+		printf(".%c", ss);
+		printf(".%s", type[cat6->type]);
+		break;
+	default:
+		dst.im = cat6->g && !cat6->dst_off;
+		printf(".%s", type[cat6->type]);
+		break;
+	}
+	printf(" ");
 
 	switch (cat6->opc) {
 	case OPC_STG:
@@ -499,68 +585,65 @@ static void print_instr_cat6(instr_t *instr)
 		break;
 
 	case OPC_STI:
-		full = false;  // XXX or inverts??
+		dst.full = false;  // XXX or inverts??
 		break;
 	}
 
-	if (cat6->has_off) {
-		if (!nodst) {
-			if (sd)
-				printf("%c[", sd);
-			print_reg_dst((reg_t)(cat6->a.dst), full, false);
-			if (sd)
-				printf("]");
-			printf(", ");
-		}
-		if (ss)
-			printf("%c[", ss);
-		print_reg_src((reg_t)(cat6->a.src1), true,
-				false, false, cat6->a.src1_im, false, false, false);
-		if (cat6->a.off)
-			printf("%+d", cat6->a.off);
-		if (ss)
-			printf("]");
-		printf(", ");
-		print_reg_src((reg_t)(cat6->a.src2), full,
-				false, false, cat6->a.src2_im, false, false, false);
+	if (cat6->dst_off) {
+		dst.reg = (reg_t)(cat6->c.dst);
+		dstoff  = cat6->c.off;
 	} else {
-		if (!nodst) {
-			if (sd)
-				printf("%c[", sd);
-			print_reg_dst((reg_t)(cat6->b.dst), full, false);
-			if (sd)
-				printf("]");
-			printf(", ");
-		}
-		if (ss)
-			printf("%c[", ss);
-		print_reg_src((reg_t)(cat6->b.src1), true,
-				false, false, cat6->b.src1_im, false, false, false);
-		if (ss)
-			printf("]");
-		printf(", ");
-		print_reg_src((reg_t)(cat6->b.src2), full,
-				false, false, cat6->b.src2_im, false, false, false);
+		dst.reg = (reg_t)(cat6->d.dst);
 	}
 
-	if (debug & PRINT_VERBOSE) {
-		switch (cat6->opc) {
-		case OPC_LDG:
-		case OPC_LDP:
-			/* load instructions: */
-			if (cat6->a.dummy2|cat6->a.dummy3)
-				printf("\t{6: %x,%x}", cat6->a.dummy2, cat6->a.dummy3);
-			break;
-		case OPC_STG:
-		case OPC_STP:
-		case OPC_STI:
-			/* store instructions: */
-			if (cat6->b.dummy2|cat6->b.dummy2)
-				printf("\t{6: %x,%x}", cat6->b.dummy2, cat6->b.dummy3);
-			if (cat6->b.ignore0)
-				printf("\t{?? %x}", cat6->b.ignore0);
-			break;
-		}
+	if (cat6->src_off) {
+		src1.reg = (reg_t)(cat6->a.src1);
+		src1.im  = cat6->a.src1_im;
+		src2.reg = (reg_t)(cat6->a.src2);
+		src2.im  = cat6->a.src2_im;
+		src1off  = cat6->a.off;
+	} else {
+		src1.reg = (reg_t)(cat6->b.src1);
+		src1.im  = cat6->b.src1_im;
+		src2.reg = (reg_t)(cat6->b.src2);
+		src2.im  = cat6->b.src2_im;
+	}
+
+	if (!nodst) {
+		if (sd)
+			printf("%c[", sd);
+		/* note: dst might actually be a src (ie. address to store to) */
+		print_src(&dst);
+		if (dstoff)
+			printf("%+d", dstoff);
+		if (sd)
+			printf("]");
+		printf(", ");
+	}
+
+	if (ss)
+		printf("%c[", ss);
+
+	/* can have a larger than normal immed, so hack: */
+	if (src1.im) {
+		printf("%u", src1.reg.dummy13);
+	} else {
+		print_src(&src1);
+	}
+
+	if (src1off)
+		printf("%+d", src1off);
+	if (ss)
+		printf("]");
+
+	switch (cat6->opc) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		break;
+	default:
+		printf(", ");
+		print_src(&src2);
+		break;
 	}
 }
 
@@ -711,19 +794,19 @@ struct opc_info {
 	OPC(6, OPC_LDLW,         ldlw),
 	OPC(6, OPC_STLW,         stlw),
 	OPC(6, OPC_RESFMT,       resfmt),
-	OPC(6, OPC_RESINFO,      resinf),
-	OPC(6, OPC_ATOMIC_ADD_L,     atomic.add.l),
-	OPC(6, OPC_ATOMIC_SUB_L,     atomic.sub.l),
-	OPC(6, OPC_ATOMIC_XCHG_L,    atomic.xchg.l),
-	OPC(6, OPC_ATOMIC_INC_L,     atomic.inc.l),
-	OPC(6, OPC_ATOMIC_DEC_L,     atomic.dec.l),
-	OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l),
-	OPC(6, OPC_ATOMIC_MIN_L,     atomic.min.l),
-	OPC(6, OPC_ATOMIC_MAX_L,     atomic.max.l),
-	OPC(6, OPC_ATOMIC_AND_L,     atomic.and.l),
-	OPC(6, OPC_ATOMIC_OR_L,      atomic.or.l),
-	OPC(6, OPC_ATOMIC_XOR_L,     atomic.xor.l),
-	OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.4d),
+	OPC(6, OPC_RESINFO,      resinfo),
+	OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+	OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+	OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+	OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+	OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+	OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+	OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+	OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+	OPC(6, OPC_ATOMIC_AND,     atomic.and),
+	OPC(6, OPC_ATOMIC_OR,      atomic.or),
+	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+	OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.3d),
 	OPC(6, OPC_STGB_4D_4,    stgb.4d.4),
 	OPC(6, OPC_STIB,         stib),
 	OPC(6, OPC_LDC_4,        ldc.4),
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index efb07ea479e..c3fb68d511c 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -173,17 +173,17 @@ typedef enum {
 	OPC_STLW = 11,
 	OPC_RESFMT = 14,
 	OPC_RESINFO = 15,
-	OPC_ATOMIC_ADD_L = 16,
-	OPC_ATOMIC_SUB_L = 17,
-	OPC_ATOMIC_XCHG_L = 18,
-	OPC_ATOMIC_INC_L = 19,
-	OPC_ATOMIC_DEC_L = 20,
-	OPC_ATOMIC_CMPXCHG_L = 21,
-	OPC_ATOMIC_MIN_L = 22,
-	OPC_ATOMIC_MAX_L = 23,
-	OPC_ATOMIC_AND_L = 24,
-	OPC_ATOMIC_OR_L = 25,
-	OPC_ATOMIC_XOR_L = 26,
+	OPC_ATOMIC_ADD = 16,
+	OPC_ATOMIC_SUB = 17,
+	OPC_ATOMIC_XCHG = 18,
+	OPC_ATOMIC_INC = 19,
+	OPC_ATOMIC_DEC = 20,
+	OPC_ATOMIC_CMPXCHG = 21,
+	OPC_ATOMIC_MIN = 22,
+	OPC_ATOMIC_MAX = 23,
+	OPC_ATOMIC_AND = 24,
+	OPC_ATOMIC_OR = 25,
+	OPC_ATOMIC_XOR = 26,
 	OPC_LDGB_TYPED_4D = 27,
 	OPC_STGB_4D_4 = 28,
 	OPC_STIB = 29,
@@ -575,7 +575,7 @@ typedef struct PACKED {
 	uint32_t opc_cat  : 3;
 } instr_cat5_t;
 
-/* [src1 + off], src2: */
+/* dword0 encoding for src_off: [src1 + off], src2: */
 typedef struct PACKED {
 	/* dword0: */
 	uint32_t mustbe1  : 1;
@@ -586,37 +586,50 @@ typedef struct PACKED {
 	uint32_t src2     : 8;
 
 	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t dummy2   : 9;
-	uint32_t type     : 3;
-	uint32_t dummy3   : 2;
-	uint32_t opc      : 5;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
+	uint32_t dword1;
 } instr_cat6a_t;
 
-/* [src1], src2: */
+/* dword0 encoding for !src_off: [src1], src2 */
 typedef struct PACKED {
 	/* dword0: */
 	uint32_t mustbe0  : 1;
-	uint32_t src1     : 8;
-	uint32_t ignore0  : 13;
+	uint32_t src1     : 13;
+	uint32_t ignore0  : 8;
 	uint32_t src1_im  : 1;
 	uint32_t src2_im  : 1;
 	uint32_t src2     : 8;
 
 	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t dummy2   : 9;
-	uint32_t type     : 3;
-	uint32_t dummy3   : 2;
-	uint32_t opc      : 5;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
+	uint32_t dword1;
 } instr_cat6b_t;
 
+/* dword1 encoding for dst_off: */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	/* note: there is some weird stuff going on where sometimes
+	 * cat6->a.off is involved.. but that seems like a bug in
+	 * the blob, since it is used even if !cat6->src_off
+	 * It would make sense for there to be some more bits to
+	 * bring us to 11 bits worth of offset, but not sure..
+	 */
+	int32_t off       : 8;
+	uint32_t mustbe1  : 1;
+	uint32_t dst      : 8;
+	uint32_t pad1     : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off: */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;
+	uint32_t pad0     : 23;
+} instr_cat6d_t;
+
 /* I think some of the other cat6 instructions use additional
  * sub-encodings..
  */
@@ -624,16 +637,20 @@ typedef struct PACKED {
 typedef union PACKED {
 	instr_cat6a_t a;
 	instr_cat6b_t b;
+	instr_cat6c_t c;
+	instr_cat6d_t d;
 	struct PACKED {
 		/* dword0: */
-		uint32_t has_off  : 1;
+		uint32_t src_off  : 1;
 		uint32_t pad1     : 31;
 
 		/* dword1: */
-		uint32_t dst      : 8;
-		uint32_t dummy2   : 9;
+		uint32_t pad2     : 8;
+		uint32_t dst_off  : 1;
+		uint32_t pad3     : 8;
 		uint32_t type     : 3;
-		uint32_t dummy3   : 2;
+		uint32_t g        : 1;  /* or in some cases it means dst immed */
+		uint32_t pad4     : 1;
 		uint32_t opc      : 5;
 		uint32_t jmp_tgt  : 1;
 		uint32_t sync     : 1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index a0cb74498ec..6d19a29275b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -506,25 +506,28 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 
 	iassert(instr->regs_count >= 2);
 
-	if (instr->cat6.offset || instr->opc == OPC_LDG) {
+	/* TODO we need a more comprehensive list about which instructions
+	 * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
+	 * indicate to use the src_off encoding even if offset is zero
+	 * (but then what to do about dst_off?)
+	 */
+	if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
 		instr_cat6a_t *cat6a = ptr;
 
-		cat6->has_off = true;
+		cat6->src_off = true;
 
-		cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 		cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
 		cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
 		if (src2) {
 			cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
 			cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
 		}
-		cat6a->off = instr->cat6.offset;
+		cat6a->off = instr->cat6.src_offset;
 	} else {
 		instr_cat6b_t *cat6b = ptr;
 
-		cat6->has_off = false;
+		cat6->src_off = false;
 
-		cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 		cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
 		cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
 		if (src2) {
@@ -533,10 +536,22 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 		}
 	}
 
+	if (instr->cat6.dst_offset || (instr->opc == OPC_STG)) {
+		instr_cat6c_t *cat6c = ptr;
+		cat6->dst_off = true;
+		cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+		cat6c->off = instr->cat6.dst_offset;
+	} else {
+		instr_cat6d_t *cat6d = ptr;
+		cat6->dst_off = false;
+		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	}
+
 	cat6->type     = instr->cat6.type;
 	cat6->opc      = instr->opc;
 	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
 	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat6->g        = !!(instr->flags & IR3_INSTR_G);
 	cat6->opc_cat  = 6;
 
 	return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index f11d8eda5f2..c3b61a0fe01 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -172,6 +172,7 @@ struct ir3_instruction {
 		IR3_INSTR_P     = 0x080,
 		IR3_INSTR_S     = 0x100,
 		IR3_INSTR_S2EN  = 0x200,
+		IR3_INSTR_G     = 0x400,
 		/* meta-flags, for intermediate stages of IR, ie.
 		 * before register assignment is done:
 		 */
@@ -209,7 +210,8 @@ struct ir3_instruction {
 		} cat5;
 		struct {
 			type_t type;
-			int offset;
+			int src_offset;
+			int dst_offset;
 			int iim_val;
 		} cat6;
 		/* for meta-instructions, just used to hold extra data
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 22885ff85f3..bdba3aae36f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1215,7 +1215,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction *load =
 				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
 		load->cat6.type = TYPE_U32;
-		load->cat6.offset = off + i * 4;    /* byte offset */
+		load->cat6.src_offset = off + i * 4;     /* byte offset */
 		dst[i] = load;
 	}
 }

From 0815729d964f4e8e6e263acf70b5b91577de027a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 23 Jul 2015 15:51:13 -0400
Subject: [PATCH 0667/1208] freedreno/ir3: bit of shader API refactoring

Since for transform-feedback, we'll need more than just the TGSI
tokens from the state object, just pass the entire state object to
ir3_shader_create().  This also cleans things up a bit for some
day in the future when we could take shader either as TGSI or
directly NIR (for ex, glsl2nir or spirv2nir paths).  In the same
spirit, drop extra args from ir3_compile_shader_nir() (since it
can anyways get what it needs from the ir3_shader_variant).

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_program.c    |  2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_program.c    |  2 +-
 src/gallium/drivers/freedreno/ir3/ir3_cmdline.c     | 13 ++++++++-----
 src/gallium/drivers/freedreno/ir3/ir3_compiler.h    |  4 +---
 .../drivers/freedreno/ir3/ir3_compiler_nir.c        | 10 ++++------
 src/gallium/drivers/freedreno/ir3/ir3_shader.c      | 11 ++++++-----
 src/gallium/drivers/freedreno/ir3/ir3_shader.h      |  5 ++++-
 7 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 57fcaa9020e..9bf42f5b931 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -51,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index e8f5837f7ce..b3c06e3aae1 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -53,7 +53,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d2aabe1140d..68f08486075 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -181,10 +181,6 @@ int main(int argc, char **argv)
 
 	filename = argv[n];
 
-	memset(&v, 0, sizeof(v));
-	v.key = key;
-	v.shader = &s;
-
 	ret = read_file(filename, &ptr, &size);
 	if (ret) {
 		print_usage();
@@ -197,6 +193,13 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
+	memset(&s, 0, sizeof(s));
+	s.tokens = toks;
+
+	memset(&v, 0, sizeof(v));
+	v.key = key;
+	v.shader = &s;
+
 	tgsi_parse_init(&parse, toks);
 	switch (parse.FullHeader.Processor.Processor) {
 	case TGSI_PROCESSOR_FRAGMENT:
@@ -214,7 +217,7 @@ int main(int argc, char **argv)
 	compiler = ir3_compiler_create(320);
 
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v, toks, key);
+	ret = ir3_compile_shader_nir(compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 378f737924c..697afeba61a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -43,8 +43,6 @@ struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
 
 int ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens,
-		struct ir3_shader_key key);
+		struct ir3_shader_variant *so);
 
 #endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bdba3aae36f..94d7b7f054a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2283,9 +2283,7 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 
 int
 ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens,
-		struct ir3_shader_key key)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx;
 	struct ir3 *ir;
@@ -2295,7 +2293,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, tokens);
+	ctx = compile_init(compiler, so, so->shader->tokens);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
@@ -2320,7 +2318,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		fixup_frag_inputs(ctx);
 
 	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (key.binning_pass) {
+	if (so->key.binning_pass) {
 		for (i = 0, j = 0; i < so->outputs_count; i++) {
 			unsigned name = sem2name(so->outputs[i].semantic);
 			unsigned idx = sem2idx(so->outputs[i].semantic);
@@ -2345,7 +2343,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	/* if we want half-precision outputs, mark the output registers
 	 * as half:
 	 */
-	if (key.half_precision) {
+	if (so->key.half_precision) {
 		for (i = 0; i < ir->noutputs; i++) {
 			struct ir3_instruction *out = ir->outputs[i];
 			if (!out)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 210dc298f4e..d4027729a22 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -177,7 +177,6 @@ static struct ir3_shader_variant *
 create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 {
 	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
-	const struct tgsi_token *tokens = shader->tokens;
 	int ret;
 
 	if (!v)
@@ -191,10 +190,10 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
 			key.binning_pass, key.color_two_side, key.half_precision);
-		tgsi_dump(tokens, 0);
+		tgsi_dump(shader->tokens, 0);
 	}
 
-	ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key);
+	ret = ir3_compile_shader_nir(shader->compiler, v);
 	if (ret) {
 		debug_error("compile failed!");
 		goto fail;
@@ -273,7 +272,8 @@ ir3_shader_destroy(struct ir3_shader *shader)
 }
 
 struct ir3_shader *
-ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+ir3_shader_create(struct pipe_context *pctx,
+		const struct pipe_shader_state *cso,
 		enum shader_t type)
 {
 	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
@@ -281,7 +281,8 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
 	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
-	shader->tokens = tgsi_dup_tokens(tokens);
+	shader->tokens = tgsi_dup_tokens(cso->tokens);
+	shader->stream_output = cso->stream_output;
 	if (fd_mesa_debug & FD_DBG_SHADERDB) {
 		/* if shader-db run, create a standard variant immediately
 		 * (as otherwise nothing will trigger the shader to be
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 79b9f8a3eeb..5365d5687f1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -29,6 +29,8 @@
 #ifndef IR3_SHADER_H_
 #define IR3_SHADER_H_
 
+#include "pipe/p_state.h"
+
 #include "ir3.h"
 #include "disasm.h"
 
@@ -203,6 +205,7 @@ struct ir3_shader {
 
 	struct pipe_context *pctx;
 	const struct tgsi_token *tokens;
+	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
 
@@ -215,7 +218,7 @@ struct ir3_shader {
 void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
 
 struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
-		const struct tgsi_token *tokens, enum shader_t type);
+		const struct pipe_shader_state *cso, enum shader_t type);
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);

From 56462a30080c1f25a81ae566d59a25d2ad6bb809 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 24 Jul 2015 13:07:33 -0400
Subject: [PATCH 0668/1208] freedreno/ir3: move emit_const to ir3

Details of the cmdstream packets are different between a3xx and a4xx,
but the logic about the layout of const registers is the same, as that
is dictated by the ir3 shader compiler.  So rather than duplicating
logic that is tightly coupled to ir3 between a3xx and a4xx, move this
into ir3 and use per-generation callbacks for to build the cmdstream
packets.

This should make it easier to pass additional const regs (such as for
transform feedback).  And it also keeps the layout internal to ir3 in
case we want to make the layout more dynamic some day.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a3xx/fd3_context.c      |   1 +
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c |   2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 154 +++++-------------
 src/gallium/drivers/freedreno/a3xx/fd3_emit.h |   6 +-
 .../drivers/freedreno/a4xx/fd4_context.c      |   1 +
 src/gallium/drivers/freedreno/a4xx/fd4_draw.c |   2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 154 +++++-------------
 src/gallium/drivers/freedreno/a4xx/fd4_emit.h |   6 +-
 .../drivers/freedreno/freedreno_context.h     |   9 +-
 .../drivers/freedreno/freedreno_util.h        |   1 +
 .../drivers/freedreno/ir3/ir3_shader.c        | 148 +++++++++++++++++
 .../drivers/freedreno/ir3/ir3_shader.h        |   4 +
 12 files changed, 261 insertions(+), 227 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 8441898382b..dc33783e398 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -121,6 +121,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
 	fd3_gmem_init(pctx);
 	fd3_texture_init(pctx);
 	fd3_prog_init(pctx);
+	fd3_emit_init(pctx);
 
 	pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 070ed43a279..fc30d4842ba 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -345,7 +345,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 
 	fd3_emit_vertex_bufs(ring, &emit);
 
-	fd3_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd3_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 07cc2266d08..9032366b748 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -43,19 +43,26 @@
 #include "fd3_format.h"
 #include "fd3_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX]   = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
  */
 void
-fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = SS_INDIRECT;
@@ -67,7 +74,7 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/2));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
+	uint32_t i;
 
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate.  In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/2));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
-
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
-
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && constbuf->dirty_mask & (1 << index)) {
-			fd3_emit_constant(ring, sb, 0,
-							  cb->buffer_offset, size,
-							  cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		/* emit ubos: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
-				 CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				 CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				 CP_LOAD_STATE_0_NUM_UNIT(params * 2));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				 CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd3_emit_constant(ring, sb, base,
-				0, size, shader->immediates[0].val, NULL);
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -669,33 +618,12 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_EVENT_WRITE, 1);
 	OUT_RING(ring, HLSQ_FLUSH);
 
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring,  SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
-	}
-
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd3_emit_constant(ring, SB_VERT_SHADER,
-							  (vp->first_driver_param + 4) * 4,
-							  0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
@@ -930,3 +858,11 @@ fd3_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd3_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd3_emit_const;
+	ctx->emit_const_bo = fd3_emit_const_bo;
+}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 8f21919c9a7..795654706a7 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -37,10 +37,8 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
@@ -90,4 +88,6 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd3_emit_restore(struct fd_context *ctx);
 
+void fd3_emit_init(struct pipe_context *pctx);
+
 #endif /* FD3_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index 6e109b6205a..e172d350517 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -119,6 +119,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv)
 	fd4_gmem_init(pctx);
 	fd4_texture_init(pctx);
 	fd4_prog_init(pctx);
+	fd4_emit_init(pctx);
 
 	pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index d070f5fd6b7..ff1dfdc392f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -295,7 +295,7 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW3 */
 
 	/* until fastclear works: */
-	fd4_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index f3e1ccebccc..4462a82777f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -43,19 +43,26 @@
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX]   = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
  */
 void
-fd4_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = 0x2;  // TODO ??
@@ -67,7 +74,7 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
+	uint32_t i;
 
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate.  In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/4));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
-
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
-
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && (constbuf->dirty_mask & (1 << index))) {
-			fd4_emit_constant(ring, sb, 0,
-					cb->buffer_offset, size,
-					cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	/* emit ubos: */
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(params));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd4_emit_constant(ring, sb, base,
-				0, size, shader->immediates[0].val, NULL);
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -520,33 +469,12 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	if (dirty & FD_DIRTY_PROG)
 		fd4_program_emit(ring, emit);
 
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring,  SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
-	}
-
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd4_emit_constant(ring, SB_VERT_SHADER,
-							  (vp->first_driver_param + 4) * 4,
-							  0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
@@ -767,3 +695,11 @@ fd4_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd4_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd4_emit_const;
+	ctx->emit_const_bo = fd4_emit_const_bo;
+}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 7d059f8e532..7debee59471 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -37,10 +37,8 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd4_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
@@ -96,4 +94,6 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd4_emit_restore(struct fd_context *ctx);
 
+void fd4_emit_init(struct pipe_context *pctx);
+
 #endif /* FD4_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index c2d98345349..bc5267aa96e 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -351,9 +351,16 @@ struct fd_context {
 	void (*emit_sysmem_prep)(struct fd_context *ctx);
 
 	/* draw: */
-	void (*draw_vbo)(struct fd_context *pctx, const struct pipe_draw_info *info);
+	void (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info);
 	void (*clear)(struct fd_context *ctx, unsigned buffers,
 			const union pipe_color_union *color, double depth, unsigned stencil);
+
+	/* constant emit:  (note currently not used/needed for a2xx) */
+	void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
+			uint32_t regid, uint32_t offset, uint32_t sizedwords,
+			const uint32_t *dwords, struct pipe_resource *prsc);
+	void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+			uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets);
 };
 
 static inline struct fd_context *
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 1b78763c58e..6aec2585ceb 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -40,6 +40,7 @@
 #include "util/u_dynarray.h"
 #include "util/u_pack_color.h"
 
+#include "disasm.h"
 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index d4027729a22..75425e91378 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -412,3 +412,151 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
 
 	debug_printf("\n");
 }
+
+/* This has to reach into the fd_context a bit more than the rest of
+ * ir3, but it needs to be aligned with the compiler, so both agree
+ * on which const regs hold what.  And the logic is identical between
+ * a3xx/a4xx, the only difference is small details in the actual
+ * CP_LOAD_STATE packets (which is handled inside the generation
+ * specific ctx->emit_const(_bo)() fxns)
+ */
+
+#include "freedreno_resource.h"
+
+static void
+emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_constbuf_stateobj *constbuf)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	const unsigned index = 0;     /* user consts are index 0 */
+	/* TODO save/restore dirty_mask for binning pass instead: */
+	uint32_t dirty_mask = constbuf->enabled_mask;
+
+	if (dirty_mask & (1 << index)) {
+		struct pipe_constant_buffer *cb = &constbuf->cb[index];
+		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
+
+		/* in particular, with binning shader we may end up with
+		 * unused consts, ie. we could end up w/ constlen that is
+		 * smaller than first_driver_param.  In that case truncate
+		 * the user consts early to avoid HLSQ lockup caused by
+		 * writing too many consts
+		 */
+		uint32_t max_const = MIN2(v->first_driver_param, v->constlen);
+
+		// I expect that size should be a multiple of vec4's:
+		assert(size == align(size, 4));
+
+		/* and even if the start of the const buffer is before
+		 * first_immediate, the end may not be:
+		 */
+		size = MIN2(size, 4 * max_const);
+
+		if (size > 0) {
+			fd_wfi(ctx, ring);
+			ctx->emit_const(ring, v->type, 0,
+					cb->buffer_offset, size,
+					cb->user_buffer, cb->buffer);
+			constbuf->dirty_mask &= ~(1 << index);
+		}
+	}
+}
+
+static void
+emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_constbuf_stateobj *constbuf)
+{
+	if (v->constlen > v->first_driver_param) {
+		struct fd_context *ctx = fd_context(v->shader->pctx);
+		uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
+		uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4;
+		uint32_t offsets[params];
+		struct fd_bo *bos[params];
+
+		for (uint32_t i = 0; i < params; i++) {
+			const uint32_t index = i + 1;   /* UBOs start at index 1 */
+			struct pipe_constant_buffer *cb = &constbuf->cb[index];
+			assert(!cb->user_buffer);
+
+			if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) {
+				offsets[i] = cb->buffer_offset;
+				bos[i] = fd_resource(cb->buffer)->bo;
+			} else {
+				offsets[i] = 0;
+				bos[i] = NULL;
+			}
+		}
+
+		fd_wfi(ctx, ring);
+		ctx->emit_const_bo(ring, v->type, false, offset * 4, params, bos, offsets);
+	}
+}
+
+static void
+emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	int size = v->immediates_count;
+	uint32_t base = v->first_immediate;
+
+	/* truncate size to avoid writing constants that shader
+	 * does not use:
+	 */
+	size = MIN2(size + base, v->constlen) - base;
+
+	/* convert out of vec4: */
+	base *= 4;
+	size *= 4;
+
+	if (size > 0) {
+		fd_wfi(ctx, ring);
+		ctx->emit_const(ring, v->type, base,
+			0, size, v->immediates[0].val, NULL);
+	}
+}
+
+void
+ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		const struct pipe_draw_info *info, uint32_t dirty)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+
+	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
+		struct fd_constbuf_stateobj *constbuf;
+		bool shader_dirty;
+
+		if (v->type == SHADER_VERTEX) {
+			constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX];
+			shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_VP);
+		} else if (v->type == SHADER_FRAGMENT) {
+			constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT];
+			shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_FP);
+		} else {
+			unreachable("bad shader type");
+			return;
+		}
+
+		emit_user_consts(v, ring, constbuf);
+		emit_ubos(v, ring, constbuf);
+		if (shader_dirty)
+			emit_immediates(v, ring);
+	}
+
+	/* emit driver params every time: */
+	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
+	if (info && (v->type == SHADER_VERTEX)) {
+		uint32_t offset = v->first_driver_param + 4;  /* driver params after UBOs */
+		if (v->constlen >= offset) {
+			uint32_t vertex_params[4] = {
+				info->indexed ? info->index_bias : info->start,
+				0,
+				0,
+				0
+			};
+
+			fd_wfi(ctx, ring);
+			ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
+					ARRAY_SIZE(vertex_params), vertex_params, NULL);
+		}
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 5365d5687f1..ef16d7b2f6e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -224,6 +224,10 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);
 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
 
+struct fd_ringbuffer;
+void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		const struct pipe_draw_info *info, uint32_t dirty);
+
 static inline const char *
 ir3_shader_stage(struct ir3_shader *shader)
 {

From 810763deb514c3fec41c3e95761de34e6211d291 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 24 Jul 2015 16:42:10 -0400
Subject: [PATCH 0669/1208] freedreno/ir3: drop unused create_input() arg

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 94d7b7f054a..bdbaf8970cf 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -706,16 +706,13 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 }
 
 static struct ir3_instruction *
-create_input(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned n)
+create_input(struct ir3_block *block, unsigned n)
 {
 	struct ir3_instruction *in;
 
 	in = ir3_instr_create(block, -1, OPC_META_INPUT);
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
-	if (instr)
-		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
 
 	return in;
 }
@@ -747,7 +744,7 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
 
 	compile_assert(ctx, !ctx->frag_coord[comp]);
 
-	ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);
+	ctx->frag_coord[comp] = create_input(ctx->block, 0);
 
 	switch (comp) {
 	case 0: /* .x */
@@ -786,7 +783,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 	case 0: /* .x */
 		compile_assert(ctx, !ctx->frag_face);
 
-		ctx->frag_face = create_input(block, NULL, 0);
+		ctx->frag_face = create_input(block, 0);
 		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 
 		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
@@ -1428,7 +1425,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_vertex_id_zero_base:
 		if (!ctx->vertex_id) {
-			ctx->vertex_id = create_input(ctx->block, NULL, 0);
+			ctx->vertex_id = create_input(ctx->block, 0);
 			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
 					ctx->vertex_id);
 		}
@@ -1436,7 +1433,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_instance_id:
 		if (!ctx->instance_id) {
-			ctx->instance_id = create_input(ctx->block, NULL, 0);
+			ctx->instance_id = create_input(ctx->block, 0);
 			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
 					ctx->instance_id);
 		}
@@ -2054,7 +2051,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 						so->inputs[n].inloc + i - 8, use_ldlv);
 			}
 		} else {
-			instr = create_input(ctx->block, NULL, idx);
+			instr = create_input(ctx->block, idx);
 		}
 
 		ctx->ir->inputs[idx] = instr;
@@ -2267,13 +2264,13 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 	so->pos_regid = regid;
 
 	/* r0.x */
-	instr = create_input(ctx->in_block, NULL, ir->ninputs);
+	instr = create_input(ctx->in_block, ir->ninputs);
 	instr->regs[0]->num = regid++;
 	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[1]->instr = instr;
 
 	/* r0.y */
-	instr = create_input(ctx->in_block, NULL, ir->ninputs);
+	instr = create_input(ctx->in_block, ir->ninputs);
 	instr->regs[0]->num = regid++;
 	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[2]->instr = instr;

From 1b1ef6b4573ab9f21abd5fb374bc74d03390146d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 24 Jul 2015 17:06:01 -0400
Subject: [PATCH 0670/1208] freedreno/ir3: add stream-output support to cmdline
 compiler

A bit hard-coded configuration at the moment, but sufficient for now.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/ir3_cmdline.c       | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index 68f08486075..ede29f445dc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -93,8 +93,7 @@ static void print_usage(void)
 	printf("    --saturate-s MASK - bitmask of samplers to saturate S coord\n");
 	printf("    --saturate-t MASK - bitmask of samplers to saturate T coord\n");
 	printf("    --saturate-r MASK - bitmask of samplers to saturate R coord\n");
-	printf("    --nocp            - disable copy propagation\n");
-	printf("    --nir             - use NIR compiler\n");
+	printf("    --stream-out      - enable stream-out (aka transform feedback)\n");
 	printf("    --help            - show this message\n");
 }
 
@@ -114,6 +113,9 @@ int main(int argc, char **argv)
 
 	fd_mesa_debug |= FD_DBG_DISASM;
 
+	memset(&s, 0, sizeof(s));
+	memset(&v, 0, sizeof(v));
+
 	/* cmdline args which impact shader variant get spit out in a
 	 * comment on the first line..  a quick/dirty way to preserve
 	 * that info so when ir3test recompiles the shader with a new
@@ -170,6 +172,24 @@ int main(int argc, char **argv)
 			continue;
 		}
 
+		if (!strcmp(argv[n], "--stream-out")) {
+			struct pipe_stream_output_info *so = &s.stream_output;
+			debug_printf(" %s", argv[n]);
+			/* TODO more dynamic config based on number of outputs, etc
+			 * rather than just hard-code for first output:
+			 */
+			so->num_outputs = 1;
+			so->stride[0] = 4;
+			so->output[0].register_index = 0;
+			so->output[0].start_component = 0;
+			so->output[0].num_components = 4;
+			so->output[0].output_buffer = 0;
+			so->output[0].dst_offset = 2;
+			so->output[0].stream = 0;
+			n++;
+			continue;
+		}
+
 		if (!strcmp(argv[n], "--help")) {
 			print_usage();
 			return 0;
@@ -193,10 +213,8 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
-	memset(&s, 0, sizeof(s));
 	s.tokens = toks;
 
-	memset(&v, 0, sizeof(v));
 	v.key = key;
 	v.shader = &s;
 

From 65d36a109a7dd333c15180a0f30ad919eb01d78f Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 24 Jul 2015 17:07:23 -0400
Subject: [PATCH 0671/1208] freedreno/a3xx+a4xx: add support for vtxcnt
 semantic

This will be used for stream-out (transform-feedback)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 22 ++++++++++++++-----
 .../drivers/freedreno/a3xx/fd3_program.c      |  4 ----
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 14 ++++++++----
 .../drivers/freedreno/ir3/ir3_shader.h        |  5 +++++
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 9032366b748..49b3d97cce8 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -393,7 +393,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -401,14 +403,17 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask)
 			last = i;
 	}
 
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
-	if (vtx->vtx->num_elements == 0 &&
-		vertex_regid == regid(63, 0) &&
-		instance_regid == regid(63, 0))
+	if ((vtx->vtx->num_elements == 0) &&
+			(vertex_regid == regid(63, 0)) &&
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -421,8 +426,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
-				vertex_regid != regid(63, 0) ||
-				instance_regid != regid(63, 0);
+					(vertex_regid != regid(63, 0)) ||
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
 
@@ -461,6 +467,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
 			A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A3XX_VFD_CONTROL_1_REGID4INST(instance_regid));
+
+	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
+	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
+			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid));
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 9bf42f5b931..7cd4885ab4f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -449,10 +449,6 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	}
 
-	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
-	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
-			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
-
 	if (vpbuffer == BUFFER)
 		emit_shader(ring, vp);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 4462a82777f..62a1e9ee955 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -251,7 +251,9 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd4_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -259,6 +261,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if ((i < vtx->vtx->num_elements) && vp->inputs[i].compmask)
 			last = i;
 	}
@@ -266,7 +270,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
 	if ((vtx->vtx->num_elements == 0) &&
 			(vertex_regid == regid(63, 0)) &&
-			(instance_regid == regid(63, 0)))
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -280,7 +285,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
 					(vertex_regid != regid(63, 0)) ||
-					(instance_regid != regid(63, 0));
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
 			uint32_t off = vb->buffer_offset + elem->src_offset;
@@ -321,7 +327,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A4XX_VFD_CONTROL_1_REGID4INST(instance_regid));
 	OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_2 */
-	OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(regid(63, 0)));
+	OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid));
 	OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_4 */
 
 	/* cache invalidate, otherwise vertex fetch could see
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index ef16d7b2f6e..f0af4478109 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -34,6 +34,11 @@
 #include "ir3.h"
 #include "disasm.h"
 
+/* internal semantic used for passing vtxcnt to vertex shader to
+ * implement transform feedback:
+ */
+#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0)
+
 typedef uint16_t ir3_semantic;  /* semantic name + index */
 static inline ir3_semantic
 ir3_semantic_name(uint8_t name, uint16_t index)

From bda1354aac9d32e236048af4d353d5530f644c34 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 26 Jul 2015 13:30:26 -0400
Subject: [PATCH 0672/1208] freedreno: add resource tracking support for
 written buffers

With stream-out (transform-feedback) we have the case where resources
are *written* by the gpu, which needs basically the same tracking to
figure out when rendering must be flushed.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_context.c     |  3 ++-
 .../drivers/freedreno/freedreno_draw.c        | 19 +++++++++++--------
 .../drivers/freedreno/freedreno_resource.c    |  5 +++--
 .../drivers/freedreno/freedreno_resource.h    |  2 +-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 127fb5f5aa0..02613dcdbdb 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -130,8 +130,9 @@ fd_context_render(struct pipe_context *pctx)
 
 	/* go through all the used resources and clear their reading flag */
 	LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) {
-		assert(rsc->reading);
+		assert(rsc->reading || rsc->writing);
 		rsc->reading = false;
+		rsc->writing = false;
 		list_delinit(&rsc->list);
 	}
 
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index c9e317c7dc9..ae75b3efdcc 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -40,7 +40,7 @@
 #include "freedreno_util.h"
 
 static void
-resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
+resource_used(struct fd_context *ctx, struct pipe_resource *prsc, boolean reading)
 {
 	struct fd_resource *rsc;
 
@@ -48,7 +48,10 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
 		return;
 
 	rsc = fd_resource(prsc);
-	rsc->reading = true;
+	if (reading)
+		rsc->reading = true;
+	else
+		rsc->writing = true;
 	list_delinit(&rsc->list);
 	list_addtail(&rsc->list, &ctx->used_resources);
 }
@@ -120,26 +123,26 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	/* Skip over buffer 0, that is sent along with the command stream */
 	for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-		resource_reading(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer);
-		resource_reading(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer);
+		resource_used(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer, true);
+		resource_used(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer, true);
 	}
 
 	/* Mark VBOs as being read */
 	for (i = 0; i < ctx->vtx.vertexbuf.count; i++) {
 		assert(!ctx->vtx.vertexbuf.vb[i].user_buffer);
-		resource_reading(ctx, ctx->vtx.vertexbuf.vb[i].buffer);
+		resource_used(ctx, ctx->vtx.vertexbuf.vb[i].buffer, true);
 	}
 
 	/* Mark index buffer as being read */
-	resource_reading(ctx, ctx->indexbuf.buffer);
+	resource_used(ctx, ctx->indexbuf.buffer, true);
 
 	/* Mark textures as being read */
 	for (i = 0; i < ctx->verttex.num_textures; i++)
 		if (ctx->verttex.textures[i])
-			resource_reading(ctx, ctx->verttex.textures[i]->texture);
+			resource_used(ctx, ctx->verttex.textures[i]->texture, true);
 	for (i = 0; i < ctx->fragtex.num_textures; i++)
 		if (ctx->fragtex.textures[i])
-			resource_reading(ctx, ctx->fragtex.textures[i]->texture);
+			resource_used(ctx, ctx->fragtex.textures[i]->texture, true);
 
 	ctx->num_draws++;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 20495779d50..de3cb64ed5b 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -97,7 +97,7 @@ realloc_bo(struct fd_resource *rsc, uint32_t size)
 
 	rsc->bo = fd_bo_new(screen->dev, size, flags);
 	rsc->timestamp = 0;
-	rsc->dirty = rsc->reading = false;
+	rsc->dirty = rsc->reading = rsc->writing = false;
 	list_delinit(&rsc->list);
 	util_range_set_empty(&rsc->valid_buffer_range);
 }
@@ -239,7 +239,8 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 		 * resource and we're trying to write to it, flush the renders.
 		 */
 		if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty) ||
-			((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading))
+			((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading) ||
+			((ptrans->usage & PIPE_TRANSFER_READ) && rsc->writing))
 			fd_context_render(pctx);
 
 		/* The GPU keeps track of how the various bo's are being used, and
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index 9833b30e17f..e7f127edca4 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -68,7 +68,7 @@ struct fd_resource {
 	uint32_t layer_size;
 	struct fd_resource_slice slices[MAX_MIP_LEVELS];
 	uint32_t timestamp;
-	bool dirty, reading;
+	bool dirty, reading, writing;
 	/* buffer range that has been initialized */
 	struct util_range valid_buffer_range;
 

From be8a8ebe578267ab24e343c3c1347936a221468e Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 25 Jul 2015 10:56:39 -0400
Subject: [PATCH 0673/1208] freedreno: add transform-feedback state

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_context.h     | 16 +++++
 .../drivers/freedreno/freedreno_draw.c        | 15 ++++-
 .../drivers/freedreno/freedreno_resource.c    |  2 +
 .../drivers/freedreno/freedreno_state.c       | 65 +++++++++++++++++++
 4 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index bc5267aa96e..cc585af1b3f 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -82,6 +82,20 @@ struct fd_vertex_stateobj {
 	unsigned num_elements;
 };
 
+struct fd_streamout_stateobj {
+	struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+	unsigned num_targets;
+	/* Track offset from vtxcnt for streamout data.  This counter
+	 * is just incremented by # of vertices on each draw until
+	 * reset or new streamout buffer bound.
+	 *
+	 * When we eventually have GS, the CPU won't actually know the
+	 * number of vertices per draw, so I think we'll have to do
+	 * something more clever.
+	 */
+	unsigned offsets[PIPE_MAX_SO_BUFFERS];
+};
+
 /* group together the vertex and vertexbuf state.. for ease of passing
  * around, and because various internal operations (gmem<->mem, etc)
  * need their own vertex state:
@@ -319,6 +333,7 @@ struct fd_context {
 		FD_DIRTY_VTXBUF      = (1 << 15),
 		FD_DIRTY_INDEXBUF    = (1 << 16),
 		FD_DIRTY_SCISSOR     = (1 << 17),
+		FD_DIRTY_STREAMOUT   = (1 << 18),
 	} dirty;
 
 	struct pipe_blend_state *blend;
@@ -339,6 +354,7 @@ struct fd_context {
 	struct pipe_viewport_state viewport;
 	struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
 	struct pipe_index_buffer indexbuf;
+	struct fd_streamout_stateobj streamout;
 
 	/* GMEM/tile handling fxns: */
 	void (*emit_tile_init)(struct fd_context *ctx);
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index ae75b3efdcc..f88654063fa 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -62,7 +62,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	struct fd_context *ctx = fd_context(pctx);
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
-	unsigned i, buffers = 0;
+	unsigned i, prims, buffers = 0;
 
 	/* if we supported transform feedback, we'd have to disable this: */
 	if (((scissor->maxx - scissor->minx) *
@@ -144,11 +144,17 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 		if (ctx->fragtex.textures[i])
 			resource_used(ctx, ctx->fragtex.textures[i]->texture, true);
 
+	/* Mark streamout buffers as being read.. actually they are written.. */
+	for (i = 0; i < ctx->streamout.num_targets; i++)
+		if (ctx->streamout.targets[i])
+			resource_used(ctx, ctx->streamout.targets[i]->buffer, false);
+
 	ctx->num_draws++;
 
+	prims = u_reduced_prims_for_vertices(info->mode, info->count);
+
 	ctx->stats.draw_calls++;
-	ctx->stats.prims_emitted +=
-		u_reduced_prims_for_vertices(info->mode, info->count);
+	ctx->stats.prims_emitted += prims;
 
 	/* any buffers that haven't been cleared yet, we need to restore: */
 	ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
@@ -162,6 +168,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW);
 	ctx->draw_vbo(ctx, info);
 
+	for (i = 0; i < ctx->streamout.num_targets; i++)
+		ctx->streamout.offsets[i] += prims;
+
 	/* if an app (or, well, piglit test) does many thousands of draws
 	 * without flush (or anything which implicitly flushes, like
 	 * changing render targets), we can exceed the ringbuffer size.
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index de3cb64ed5b..d649925af48 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -647,6 +647,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx)
 	util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb);
 	util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx);
 	util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp);
+	util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets,
+			ctx->streamout.targets);
 	util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer);
 	util_blitter_save_viewport(ctx->blitter, &ctx->viewport);
 	util_blitter_save_scissor(ctx->blitter, &ctx->scissor);
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 77aa4f21d3b..7bf8bdb4507 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
 	ctx->dirty |= FD_DIRTY_VTXSTATE;
 }
 
+static struct pipe_stream_output_target *
+fd_create_stream_output_target(struct pipe_context *pctx,
+		struct pipe_resource *prsc, unsigned buffer_offset,
+		unsigned buffer_size)
+{
+	struct pipe_stream_output_target *target;
+
+	target = CALLOC_STRUCT(pipe_stream_output_target);
+	if (!target)
+		return NULL;
+
+	pipe_reference_init(&target->reference, 1);
+	pipe_resource_reference(&target->buffer, prsc);
+
+	target->context = pctx;
+	target->buffer_offset = buffer_offset;
+	target->buffer_size = buffer_size;
+
+	return target;
+}
+
+static void
+fd_stream_output_target_destroy(struct pipe_context *pctx,
+		struct pipe_stream_output_target *target)
+{
+	pipe_resource_reference(&target->buffer, NULL);
+	FREE(target);
+}
+
+static void
+fd_set_stream_output_targets(struct pipe_context *pctx,
+		unsigned num_targets, struct pipe_stream_output_target **targets,
+		const unsigned *offsets)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	unsigned i;
+
+	debug_assert(num_targets <= ARRAY_SIZE(so->targets));
+
+	for (i = 0; i < num_targets; i++) {
+		boolean changed = targets[i] != so->targets[i];
+		boolean append = (offsets[i] == (unsigned)-1);
+
+		if (!changed && append)
+			continue;
+
+		so->offsets[i] = 0;
+
+		pipe_so_target_reference(&so->targets[i], targets[i]);
+	}
+
+	for (; i < so->num_targets; i++) {
+		pipe_so_target_reference(&so->targets[i], NULL);
+	}
+
+	so->num_targets = num_targets;
+
+	ctx->dirty |= FD_DIRTY_STREAMOUT;
+}
+
 void
 fd_state_init(struct pipe_context *pctx)
 {
@@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx)
 	pctx->create_vertex_elements_state = fd_vertex_state_create;
 	pctx->delete_vertex_elements_state = fd_vertex_state_delete;
 	pctx->bind_vertex_elements_state = fd_vertex_state_bind;
+
+	pctx->create_stream_output_target = fd_create_stream_output_target;
+	pctx->stream_output_target_destroy = fd_stream_output_target_destroy;
+	pctx->set_stream_output_targets = fd_set_stream_output_targets;
 }

From a240748de52f2e469e91b60d29ae872828a594d7 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 25 Jul 2015 12:48:18 -0400
Subject: [PATCH 0674/1208] freedreno/ir3: cleanup driver-param stuff

Add 'enum ir3_driver_param' to track driver-param slots, and a
create_driver_param() helper to avoid having the knowledge about
where driver params are placed in const regs spread throughout
the code as we add additional driver-params.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 31 +++++++++++++++----
 .../drivers/freedreno/ir3/ir3_shader.c        |  5 +--
 .../drivers/freedreno/ir3/ir3_shader.h        |  5 +++
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bdbaf8970cf..e013abedce6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -261,13 +261,26 @@ compile_init(struct ir3_compiler *compiler,
 
 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
-	/* one (vec4) slot for vertex id base: */
-	if (so->type == SHADER_VERTEX)
-		so->first_immediate++;
+	/* Layout of constant registers:
+	 *
+	 *    num_uniform * vec4  -  user consts
+	 *    4 * vec4            -  UBO addresses
+	 *    if (vertex shader) {
+	 *        1 * vec4        -  driver params (IR3_DP_*)
+	 *    }
+	 *
+	 * TODO this could be made more dynamic, to at least skip sections
+	 * that we don't need..
+	 */
 
 	/* reserve 4 (vec4) slots for ubo base addresses: */
 	so->first_immediate += 4;
 
+	if (so->type == SHADER_VERTEX) {
+		/* one (vec4) slot for driver params (see ir3_driver_param): */
+		so->first_immediate++;
+	}
+
 	return ctx;
 }
 
@@ -811,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 	}
 }
 
+static struct ir3_instruction *
+create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
+{
+	/* first four vec4 sysval's reserved for UBOs: */
+	unsigned r = regid(ctx->so->first_driver_param + 4, dp);
+	return create_uniform(ctx, r);
+}
+
 /* helper for instructions that produce multiple consecutive scalar
  * outputs which need to have a split/fanout meta instruction inserted
  */
@@ -1415,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_base_vertex:
 		if (!ctx->basevertex) {
-			/* first four vec4 sysval's reserved for UBOs: */
-			unsigned r = regid(ctx->so->first_driver_param + 4, 0);
-			ctx->basevertex = create_uniform(ctx, r);
+			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
 			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
 					ctx->basevertex);
 		}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 75425e91378..166eb007dbb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -548,10 +548,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		uint32_t offset = v->first_driver_param + 4;  /* driver params after UBOs */
 		if (v->constlen >= offset) {
 			uint32_t vertex_params[4] = {
-				info->indexed ? info->index_bias : info->start,
-				0,
-				0,
-				0
+				[IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
 			};
 
 			fd_wfi(ctx, ring);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index f0af4478109..4cb25205324 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -34,6 +34,11 @@
 #include "ir3.h"
 #include "disasm.h"
 
+/* driver param indices: */
+enum ir3_driver_param {
+	IR3_DP_VTXID_BASE = 0,
+};
+
 /* internal semantic used for passing vtxcnt to vertex shader to
  * implement transform feedback:
  */

From 020301baccc77e5753ead1e890c0cf24a9675517 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 25 Jul 2015 13:48:07 -0400
Subject: [PATCH 0675/1208] freedreno/ir3: add support for store instructions

For store instructions, the "dst" register is a read register, not a
written register.  (Ie. it is the address to store to.)  Lets not
confuse register allocation, scheduling, etc, with these details.
Instead just leave a dummy instr->regs[0], and take "dst" from
instr->regs[1] and srcs following.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c       | 24 +++++++++++++++----
 src/gallium/drivers/freedreno/ir3/ir3.h       | 21 ++++++++++++++++
 .../drivers/freedreno/ir3/ir3_legalize.c      |  4 ++--
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 6d19a29275b..b24825cff85 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -499,12 +499,28 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr,
 static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 		struct ir3_info *info)
 {
-	struct ir3_register *dst  = instr->regs[0];
-	struct ir3_register *src1 = instr->regs[1];
-	struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+	struct ir3_register *dst, *src1, *src2;
 	instr_cat6_t *cat6 = ptr;
 
-	iassert(instr->regs_count >= 2);
+	/* the "dst" for a store instruction is (from the perspective
+	 * of data flow in the shader, ie. register use/def, etc) in
+	 * fact a register that is read by the instruction, rather
+	 * than written:
+	 */
+	if (is_store(instr)) {
+		iassert(instr->regs_count >= 3);
+
+		dst  = instr->regs[1];
+		src1 = instr->regs[2];
+		src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+	} else {
+		iassert(instr->regs_count >= 2);
+
+		dst  = instr->regs[0];
+		src1 = instr->regs[1];
+		src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+	}
+
 
 	/* TODO we need a more comprehensive list about which instructions
 	 * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c3b61a0fe01..e68170dec58 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -554,6 +554,26 @@ is_store(struct ir3_instruction *instr)
 	return false;
 }
 
+static inline bool is_load(struct ir3_instruction *instr)
+{
+	if (is_mem(instr)) {
+		switch (instr->opc) {
+		case OPC_LDG:
+		case OPC_LDL:
+		case OPC_LDP:
+		case OPC_L2G:
+		case OPC_LDLW:
+		case OPC_LDC_4:
+		case OPC_LDLV:
+		/* probably some others too.. */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
 	/* in some cases, ldlv is used to fetch varying without
@@ -1043,6 +1063,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 /* cat6 instructions: */
 INSTR2(6, LDLV)
 INSTR2(6, LDG)
+INSTR3(6, STG)
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index f4a4223ae17..e94293f6d6b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			 */
 			ctx->has_samp = true;
 			regmask_set(&needs_sy, n->regs[0]);
-		} else if (is_mem(n)) {
+		} else if (is_load(n)) {
 			regmask_set(&needs_sy, n->regs[0]);
 		}
 
 		/* both tex/sfu appear to not always immediately consume
 		 * their src register(s):
 		 */
-		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+		if (is_tex(n) || is_sfu(n) || is_load(n)) {
 			foreach_src(reg, n) {
 				if (reg_gpr(reg))
 					regmask_set(&needs_ss_war, reg);

From 96d4db683f90f02e72d34ece544de7eedfa873ee Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 25 Jul 2015 13:51:16 -0400
Subject: [PATCH 0676/1208] freedreno/ir3: track "keeps" in ir

Previously we had a fixed array to track kills, since they don't
generate an SSA value, and then cheated by stuffing them in the
outputs array before sending things through depth/sched/etc.  But
store instructions will need similar treatment.  So convert this
over to a more general array of instructions that must be kept
and fix up the places that were previously relying on kills being
in the output array.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.h       |  6 +++++
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 27 +++----------------
 src/gallium/drivers/freedreno/ir3/ir3_cp.c    |  4 +++
 src/gallium/drivers/freedreno/ir3/ir3_depth.c |  3 +++
 4 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index e68170dec58..12f2ebe18db 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -369,6 +369,12 @@ struct ir3 {
 	unsigned predicates_count, predicates_sz;
 	struct ir3_instruction **predicates;
 
+	/* Track instructions which do not write a register but other-
+	 * wise must not be discarded (such as kill, stg, etc)
+	 */
+	unsigned keeps_count, keeps_sz;
+	struct ir3_instruction **keeps;
+
 	/* List of blocks: */
 	struct list_head block_list;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index e013abedce6..a4b27854433 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -117,10 +117,6 @@ struct ir3_compile {
 	/* for looking up which system value is which */
 	unsigned sysval_semantics[8];
 
-	/* list of kill instructions: */
-	struct ir3_instruction *kill[16];
-	unsigned int kill_count;
-
 	/* set if we encounter something we can't handle yet, so we
 	 * can bail cleanly and fallback to TGSI compiler f/e
 	 */
@@ -1481,7 +1477,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		kill = ir3_KILL(b, cond, 0);
 		array_insert(ctx->ir->predicates, kill);
 
-		ctx->kill[ctx->kill_count++] = kill;
+		array_insert(ctx->ir->keeps, kill);
 		ctx->so->has_kill = true;
 
 		break;
@@ -2165,13 +2161,9 @@ emit_instructions(struct ir3_compile *ctx)
 	ninputs  = exec_list_length(&ctx->s->inputs) * 4;
 	noutputs = exec_list_length(&ctx->s->outputs) * 4;
 
-	/* we need to allocate big enough outputs array so that
-	 * we can stuff the kill's at the end.  Likewise for vtx
-	 * shaders, we need to leave room for sysvals:
+	/* or vtx shaders, we need to leave room for sysvals:
 	 */
-	if (ctx->so->type == SHADER_FRAGMENT) {
-		noutputs += ARRAY_SIZE(ctx->kill);
-	} else if (ctx->so->type == SHADER_VERTEX) {
+	if (ctx->so->type == SHADER_VERTEX) {
 		ninputs += 8;
 	}
 
@@ -2182,9 +2174,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
-	if (ctx->so->type == SHADER_FRAGMENT) {
-		ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
-	} else if (ctx->so->type == SHADER_VERTEX) {
+	if (ctx->so->type == SHADER_VERTEX) {
 		ctx->ir->ninputs -= 8;
 	}
 
@@ -2380,15 +2370,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		}
 	}
 
-	/* at this point, we want the kill's in the outputs array too,
-	 * so that they get scheduled (since they have no dst).. we've
-	 * already ensured that the array is big enough in push_block():
-	 */
-	if (so->type == SHADER_FRAGMENT) {
-		for (i = 0; i < ctx->kill_count; i++)
-			ir->outputs[ir->noutputs++] = ctx->kill[i];
-	}
-
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("BEFORE CP:\n");
 		ir3_print(ir);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index f4c825b2ab6..be4e4e81109 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir)
 		}
 	}
 
+	for (unsigned i = 0; i < ir->keeps_count; i++) {
+		ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
+	}
+
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		if (block->condition)
 			block->condition = instr_cp(block->condition, NULL);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 0f346b26f4a..97df0c2ac99 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir)
 		if (ir->outputs[i])
 			ir3_instr_depth(ir->outputs[i]);
 
+	for (i = 0; i < ir->keeps_count; i++)
+		ir3_instr_depth(ir->keeps[i]);
+
 	/* We also need to account for if-condition: */
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		if (block->condition)

From 98a4b111fbb9e3ae45e907ddd4d2407e5ab669ec Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 25 Jul 2015 12:53:23 -0400
Subject: [PATCH 0677/1208] freedreno/ir3: add transform-feedback support

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_screen.c      |  11 ++
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 130 ++++++++++++++++++
 .../drivers/freedreno/ir3/ir3_shader.c        |  92 ++++++++++++-
 .../drivers/freedreno/ir3/ir3_shader.h        |   1 +
 4 files changed, 230 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b28d315cc12..97e4161ede2 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
 	/* Stream output. */
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return PIPE_MAX_SO_BUFFERS;
+		return 0;
 	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 1;
+		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 16;    /* should only be shader out limit? */
+		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+		if (is_a3xx(screen) || is_a4xx(screen))
+			return 16;    /* should only be shader out limit? */
 		return 0;
 
 	/* Geometry shader output, unsupported. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index a4b27854433..53faf16ae30 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler,
 	 *    4 * vec4            -  UBO addresses
 	 *    if (vertex shader) {
 	 *        1 * vec4        -  driver params (IR3_DP_*)
+	 *        1 * vec4        -  stream-out addresses
 	 *    }
 	 *
 	 * TODO this could be made more dynamic, to at least skip sections
@@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler,
 	if (so->type == SHADER_VERTEX) {
 		/* one (vec4) slot for driver params (see ir3_driver_param): */
 		so->first_immediate++;
+		/* one (vec4) slot for stream-output base addresses: */
+		so->first_immediate++;
 	}
 
 	return ctx;
@@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
 	}
 }
 
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ */
+static void
+emit_stream_out(struct ir3_compile *ctx)
+{
+	struct ir3_shader_variant *v = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	struct pipe_stream_output_info *strmout =
+			&ctx->so->shader->stream_output;
+	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
+
+	/* create vtxcnt input in input block at top of shader,
+	 * so that it is seen as live over the entire duration
+	 * of the shader:
+	 */
+	vtxcnt = create_input(ctx->in_block, 0);
+	add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+
+	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+	/* at this point, we are at the original 'end' block,
+	 * re-purpose this block to stream-out condition, then
+	 * append stream-out block and new-end block
+	 */
+	orig_end_block = ctx->block;
+
+	stream_out_block = ir3_block_create(ir);
+	list_addtail(&stream_out_block->node, &ir->block_list);
+
+	new_end_block = ir3_block_create(ir);
+	list_addtail(&new_end_block->node, &ir->block_list);
+
+	orig_end_block->successors[0] = stream_out_block;
+	orig_end_block->successors[1] = new_end_block;
+	stream_out_block->successors[0] = new_end_block;
+
+	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->cat2.condition = IR3_COND_LT;
+
+	/* condition goes on previous block to the conditional,
+	 * since it is used to pick which of the two successor
+	 * paths to take:
+	 */
+	orig_end_block->condition = cond;
+
+	/* switch to stream_out_block to generate the stream-out
+	 * instructions:
+	 */
+	ctx->block = stream_out_block;
+
+	/* Calculate base addresses based on vtxcnt.  Instructions
+	 * generated for bases not used in following loop will be
+	 * stripped out in the backend.
+	 */
+	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+		unsigned stride = strmout->stride[i];
+		struct ir3_instruction *base, *off;
+
+		base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+
+		/* 24-bit should be enough: */
+		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+				create_immed(ctx->block, stride * 4), 0);
+
+		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+	}
+
+	/* Generate the per-output store instructions: */
+	for (unsigned i = 0; i < strmout->num_outputs; i++) {
+		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+			unsigned c = j + strmout->output[i].start_component;
+			struct ir3_instruction *base, *out, *stg;
+
+			base = bases[strmout->output[i].output_buffer];
+			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+			stg = ir3_STG(ctx->block, base, 0, out, 0,
+					create_immed(ctx->block, 1), 0);
+			stg->cat6.type = TYPE_U32;
+			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+			array_insert(ctx->ir->keeps, stg);
+		}
+	}
+
+	/* and finally switch to the new_end_block: */
+	ctx->block = new_end_block;
+}
+
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
@@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 	 * into which we emit the 'end' instruction.
 	 */
 	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+	/* If stream-out (aka transform-feedback) enabled, emit the
+	 * stream-out instructions, followed by a new empty block (into
+	 * which the 'end' instruction lands).
+	 *
+	 * NOTE: it is done in this order, rather than inserting before
+	 * we emit end_block, because NIR guarantees that all blocks
+	 * flow into end_block, and that end_block has no successors.
+	 * So by re-purposing end_block as the first block of stream-
+	 * out, we guarantee that all exit paths flow into the stream-
+	 * out instructions.
+	 */
+	if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+			!ctx->so->key.binning_pass) {
+		debug_assert(ctx->so->type == SHADER_VERTEX);
+		emit_stream_out(ctx);
+	}
+
 	ir3_END(ctx->block);
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 166eb007dbb..312174c0c6d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -466,10 +466,10 @@ static void
 emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		struct fd_constbuf_stateobj *constbuf)
 {
-	if (v->constlen > v->first_driver_param) {
+	uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
+	if (v->constlen > offset) {
 		struct fd_context *ctx = fd_context(v->shader->pctx);
-		uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
-		uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4;
+		uint32_t params = MIN2(4, v->constlen - offset) * 4;
 		uint32_t offsets[params];
 		struct fd_bo *bos[params];
 
@@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
 	}
 }
 
+/* emit stream-out buffers: */
+static void
+emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	uint32_t offset = v->first_driver_param + 5;  /* streamout addresses after driver-params*/
+	if (v->constlen > offset) {
+		struct fd_context *ctx = fd_context(v->shader->pctx);
+		struct fd_streamout_stateobj *so = &ctx->streamout;
+		struct pipe_stream_output_info *info = &v->shader->stream_output;
+		uint32_t params = 4;
+		uint32_t offsets[params];
+		struct fd_bo *bos[params];
+
+		for (uint32_t i = 0; i < params; i++) {
+			struct pipe_stream_output_target *target = so->targets[i];
+
+			if (target) {
+				offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
+						target->buffer_offset;
+				bos[i] = fd_resource(target->buffer)->bo;
+			} else {
+				offsets[i] = 0;
+				bos[i] = NULL;
+			}
+		}
+
+		fd_wfi(ctx, ring);
+		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, bos, offsets);
+	}
+}
+
+static uint32_t
+max_tf_vtx(struct ir3_shader_variant *v)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	struct pipe_stream_output_info *info = &v->shader->stream_output;
+	uint32_t maxvtxcnt = 0x7fffffff;
+
+	if (v->key.binning_pass)
+		return 0;
+	if (v->shader->stream_output.num_outputs == 0)
+		return 0;
+	if (so->num_targets == 0)
+		return 0;
+
+	/* offset to write to is:
+	 *
+	 *   total_vtxcnt = vtxcnt + offsets[i]
+	 *   offset = total_vtxcnt * stride[i]
+	 *
+	 *   offset =   vtxcnt * stride[i]       ; calculated in shader
+	 *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
+	 *
+	 * assuming for each vtx, each target buffer will have data written
+	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
+	 *
+	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
+	 *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
+	 *
+	 * but shader is actually doing a less-than (rather than less-than-
+	 * equal) check, so we can drop the -stride[i].
+	 *
+	 * TODO is assumption about `offset + stride[i]` legit?
+	 */
+	for (unsigned i = 0; i < so->num_targets; i++) {
+		struct pipe_stream_output_target *target = so->targets[i];
+		unsigned stride = info->stride[i] * 4;   /* convert dwords->bytes */
+		if (target) {
+			uint32_t max = target->buffer_size / stride;
+			maxvtxcnt = MIN2(maxvtxcnt, max);
+		}
+	}
+
+	return maxvtxcnt;
+}
+
 void
 ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		const struct pipe_draw_info *info, uint32_t dirty)
@@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		uint32_t offset = v->first_driver_param + 4;  /* driver params after UBOs */
 		if (v->constlen >= offset) {
 			uint32_t vertex_params[4] = {
-				[IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
+				[IR3_DP_VTXID_BASE] = info->indexed ?
+						info->index_bias : info->start,
+				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
 			};
 
 			fd_wfi(ctx, ring);
 			ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
 					ARRAY_SIZE(vertex_params), vertex_params, NULL);
+
+			/* if needed, emit stream-out buffer addresses: */
+			if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
+				emit_tfbos(v, ring);
+			}
 		}
 	}
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 4cb25205324..c0fd44d4ed1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -37,6 +37,7 @@
 /* driver param indices: */
 enum ir3_driver_param {
 	IR3_DP_VTXID_BASE = 0,
+	IR3_DP_VTXCNT_MAX = 1,
 };
 
 /* internal semantic used for passing vtxcnt to vertex shader to

From 7850774f2118ae87c7e6a4f6c17751e405edfb34 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 22 Jul 2015 12:14:40 -0700
Subject: [PATCH 0678/1208] vc4: Add support for ARB_draw_elements_base_vertex.

Gallium exposes it unconditionally, so do our best to support it.  It
fails on the negative index cases, but those seem unlikely to be used in
the wild.
---
 src/gallium/drivers/vc4/vc4_draw.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index fc3c2321abb..1c7f3b16be3 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -201,7 +201,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 struct pipe_vertex_buffer *vb =
                         &vertexbuf->vb[elem->vertex_buffer_index];
                 struct vc4_resource *rsc = vc4_resource(vb->buffer);
-                uint32_t offset = vb->buffer_offset + elem->src_offset;
+                uint32_t offset = (vb->buffer_offset +
+                                   elem->src_offset +
+                                   vb->stride * info->index_bias);
                 uint32_t vb_size = rsc->bo->size - offset;
                 uint32_t elem_size =
                         util_format_get_blocksize(elem->src_format);

From ab63610a3603ae1e40a36d238b5938621bb9e8cc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 28 Jul 2015 02:00:20 -0400
Subject: [PATCH 0679/1208] nvc0/ir: fix barrier emission

immediate arguments require a flag to be set for each one

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 3ed815bad39..f607f3ba3ec 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1451,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       ImmediateValue *imm = i->getSrc(0)->asImm();
       assert(imm);
       code[0] |= imm->reg.data.u32 << 20;
+      code[1] |= 0x8000;
    }
 
    // thread count
@@ -1461,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       assert(imm);
       code[0] |= imm->reg.data.u32 << 26;
       code[1] |= imm->reg.data.u32 >> 6;
+      code[1] |= 0x4000;
    }
 
    if (i->srcExists(2) && (i->predSrc != 2)) {

From 313940b03cf7c857143b9e3ec0ab969ce4472c83 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 28 Jul 2015 02:37:51 -0400
Subject: [PATCH 0680/1208] nvc0/ir: trim out barrier sync for non-compute
 shaders

It seems like they're never necessary, and actively cause harm. This
fixes some of the barrier-related piglits.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp       | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 710f53de1c4..c632e30afae 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
       } else
       if (i->isNop()) {
          bb->remove(i);
+      } else
+      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
+          prog->getType() != Program::TYPE_COMPUTE) {
+         // It seems like barriers are never required for tessellation since
+         // the warp size is 32, and there are always at most 32 tcs threads.
+         bb->remove(i);
       } else {
          // TODO: Move this to before register allocation for operations that
          // need the $c register !

From fd865d56d2229d8c5d7ea893ac1dba525d88e647 Mon Sep 17 00:00:00 2001
From: Chad Versace <chad.versace@intel.com>
Date: Tue, 23 Jun 2015 15:48:17 -0700
Subject: [PATCH 0681/1208] egl: Add support for DRM_FORMAT_R8, RG88, and GR88
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Kodi/XBMC developers want to transcode NV12 to RGB with OpenGL shaders,
importing the two source planes through EGL_EXT_image_dma_buf_import. That
requires importing the Y plane as an R8 EGLImage and the UV plane as either an
RG88 or GR88 EGLImage.

This patch teaches the driver-independent part of EGL about the new
formats. Real driver support is left for follow-up patches.

The new formats landed in airlied's kernel branch 'drm-next' on July 24.

Tested-by: Peter Frühberger <peter.fruehberger@gmail.com>
Signed-off-by: Chad Versace <chad.versace@intel.com>
---
 src/egl/drivers/dri2/egl_dri2.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index e5aa396b08a..e3afab4fee8 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -54,6 +54,22 @@
 #include "egl_dri2.h"
 #include "../util/u_atomic.h"
 
+/* The kernel header drm_fourcc.h defines the DRM formats below.  We duplicate
+ * some of the definitions here so that building Mesa won't bleeding-edge
+ * kernel headers.
+ */
+#ifndef DRM_FORMAT_R8
+#define DRM_FORMAT_R8            fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
+#endif
+
+#ifndef DRM_FORMAT_RG88
+#define DRM_FORMAT_RG88          fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
+#endif
+
+#ifndef DRM_FORMAT_GR88
+#define DRM_FORMAT_GR88          fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
+#endif
+
 const __DRIuseInvalidateExtension use_invalidate = {
    .base = { __DRI_USE_INVALIDATE, 1 }
 };
@@ -1727,6 +1743,9 @@ dri2_check_dma_buf_format(const _EGLImageAttribs *attrs)
    unsigned i, plane_n;
 
    switch (attrs->DMABufFourCC.Value) {
+   case DRM_FORMAT_R8:
+   case DRM_FORMAT_RG88:
+   case DRM_FORMAT_GR88:
    case DRM_FORMAT_RGB332:
    case DRM_FORMAT_BGR233:
    case DRM_FORMAT_XRGB4444:

From 56f1f47eda881d6281e9c7531bc17e72b25d9bb9 Mon Sep 17 00:00:00 2001
From: Chad Versace <chad.versace@intel.com>
Date: Tue, 23 Jun 2015 15:48:40 -0700
Subject: [PATCH 0682/1208] i965: Support importing R8 and GR88 dma_bufs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EGL_EXT_image_dma_buf_import now supports those formats.

Tests:
  - Tested by Piglit ext_image_dma_buf_import-transcode-nv12-as-r8-gr88.
  - Tested by Peter in Kodi/XBMC to obtain 60fps NV12 transcode at 4K.

Tested-by: Peter Frühberger <peter.fruehberger@gmail.com>
Signed-off-by: Chad Versace <chad.versace@intel.com>
---
 include/GL/internal/dri_interface.h      | 9 +++++++--
 src/mesa/drivers/dri/i965/intel_screen.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index f17b7b1e080..e7cf50df83f 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1094,12 +1094,15 @@ struct __DRIdri2ExtensionRec {
 
 
 /**
- * Four CC formats that matches with WL_DRM_FORMAT_* from wayland_drm.h
- * and GBM_FORMAT_* from gbm.h, used with createImageFromNames.
+ * Four CC formats that matches with WL_DRM_FORMAT_* from wayland_drm.h,
+ * GBM_FORMAT_* from gbm.h, and DRM_FORMAT_* from drm_fourcc.h. Used with
+ * createImageFromNames.
  *
  * \since 5
  */
 
+#define __DRI_IMAGE_FOURCC_R8		0x20203852
+#define __DRI_IMAGE_FOURCC_GR88		0x38385247
 #define __DRI_IMAGE_FOURCC_RGB565	0x36314752
 #define __DRI_IMAGE_FOURCC_ARGB8888	0x34325241
 #define __DRI_IMAGE_FOURCC_XRGB8888	0x34325258
@@ -1134,6 +1137,8 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_COMPONENTS_Y_U_V	0x3003
 #define __DRI_IMAGE_COMPONENTS_Y_UV	0x3004
 #define __DRI_IMAGE_COMPONENTS_Y_XUXV	0x3005
+#define __DRI_IMAGE_COMPONENTS_R	0x3006
+#define __DRI_IMAGE_COMPONENTS_RG	0x3007
 
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 65a17664188..147fa1ea49e 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -229,6 +229,12 @@ static struct intel_image_format intel_image_formats[] = {
    { __DRI_IMAGE_FOURCC_RGB565, __DRI_IMAGE_COMPONENTS_RGB, 1,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_RGB565, 2 } } },
 
+   { __DRI_IMAGE_FOURCC_R8, __DRI_IMAGE_COMPONENTS_R, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, } },
+
+   { __DRI_IMAGE_FOURCC_GR88, __DRI_IMAGE_COMPONENTS_RG, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, } },
+
    { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
        { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 },

From aa25a2c1ba2ea14efdab405707f15dace323cd48 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 17 Jul 2015 04:44:18 +0100
Subject: [PATCH 0683/1208] radeonsi: add support for interpolateAt functions
 (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is part of ARB_gpu_shader5, and this passes
all the piglit tests currently available.

v2: use macros from the fine derivs commit.
add comments.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                             |   2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 241 ++++++++++++++++++++++-
 2 files changed, 241 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 15bb57f441a..258a6fb94e2 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -107,7 +107,7 @@ GL 4.0, GLSL 4.00:
   - Geometry shader instancing                         DONE (r600, radeonsi, llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
-  - Interpolation functions                            DONE (r600)
+  - Interpolation functions                            DONE (r600, radeonsi)
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 27b3c72d1c9..fa31f734ff2 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2960,6 +2960,234 @@ static void si_llvm_emit_ddxy(
 	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
 }
 
+/*
+ * this takes an I,J coordinate pair,
+ * and works out the X and Y derivatives.
+ * it returns DDX(I), DDX(J), DDY(I), DDY(J).
+ */
+static LLVMValueRef si_llvm_emit_ddxy_interp(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef interp_ij)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct lp_build_context *base = &bld_base->base;
+	LLVMValueRef indices[2];
+	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
+	LLVMValueRef tl, tr, bl, result[4];
+	LLVMTypeRef i32;
+	unsigned c;
+
+	i32 = LLVMInt32TypeInContext(gallivm->context);
+
+	indices[0] = bld_base->uint_bld.zero;
+	indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+				     NULL, 0, LLVMReadNoneAttribute);
+	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				 indices, 2, "");
+
+	temp = LLVMBuildAnd(gallivm->builder, indices[1],
+			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
+
+	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
+			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");
+
+	indices[1] = temp;
+	load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				  indices, 2, "");
+
+	indices[1] = temp2;
+	load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				  indices, 2, "");
+
+	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
+				  lp_build_const_int32(gallivm, 1), "");
+	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				   indices, 2, "");
+
+	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
+				  lp_build_const_int32(gallivm, 2), "");
+	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				   indices, 2, "");
+
+	for (c = 0; c < 2; ++c) {
+		LLVMValueRef store_val;
+		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
+
+		store_val = LLVMBuildExtractElement(gallivm->builder,
+						    interp_ij, c_ll, "");
+		LLVMBuildStore(gallivm->builder,
+			       store_val,
+			       store_ptr);
+
+		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+
+		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
+		tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, "");
+
+		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
+
+		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+
+		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
+		bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, "");
+
+		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
+	}
+
+	return lp_build_gather_values(gallivm, result, 4);
+}
+
+static void interp_fetch_args(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
+		/* offset is in second src, first two channels */
+		emit_data->args[0] = lp_build_emit_fetch(bld_base,
+							 emit_data->inst, 1,
+							 0);
+		emit_data->args[1] = lp_build_emit_fetch(bld_base,
+							 emit_data->inst, 1,
+							 1);
+		emit_data->arg_count = 2;
+	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
+		LLVMValueRef sample_position;
+		LLVMValueRef sample_id;
+		LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
+
+		/* fetch sample ID, then fetch its sample position,
+		 * and place into first two channels.
+		 */
+		sample_id = lp_build_emit_fetch(bld_base,
+						emit_data->inst, 1, 0);
+		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
+					     LLVMInt32TypeInContext(gallivm->context),
+					     "");
+		sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id);
+
+		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
+							     sample_position,
+							     lp_build_const_int32(gallivm, 0), "");
+
+		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
+		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
+							     sample_position,
+							     lp_build_const_int32(gallivm, 1), "");
+		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
+		emit_data->arg_count = 2;
+	}
+}
+
+static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef interp_param;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const char *intr_name;
+	int input_index;
+	int chan;
+	int i;
+	LLVMValueRef attr_number;
+	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
+	LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
+	int interp_param_idx;
+	unsigned location;
+
+	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
+	input_index = inst->Src[0].Register.Index;
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
+	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
+		location = TGSI_INTERPOLATE_LOC_CENTER;
+	else
+		location = TGSI_INTERPOLATE_LOC_CENTROID;
+
+	interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index],
+						     location);
+	if (interp_param_idx == -1)
+		return;
+	else if (interp_param_idx)
+		interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx);
+	else
+		interp_param = NULL;
+
+	attr_number = lp_build_const_int32(gallivm,
+					   shader->ps_input_param_offset[input_index]);
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
+	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
+		LLVMValueRef ij_out[2];
+		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
+
+		/*
+		 * take the I then J parameters, and the DDX/Y for it, and
+		 * calculate the IJ inputs for the interpolator.
+		 * temp1 = ddx * offset/sample.x + I;
+		 * interp_param.I = ddy * offset/sample.y + temp1;
+		 * temp1 = ddx * offset/sample.x + J;
+		 * interp_param.J = ddy * offset/sample.y + temp1;
+		 */
+		for (i = 0; i < 2; i++) {
+			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
+			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
+			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
+								      ddxy_out, ix_ll, "");
+			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
+								      ddxy_out, iy_ll, "");
+			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
+									 interp_param, ix_ll, "");
+			LLVMValueRef temp1, temp2;
+
+			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
+						     LLVMFloatTypeInContext(gallivm->context), "");
+
+			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
+
+			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
+
+			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
+
+			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
+
+			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
+						     temp2,
+						     LLVMIntTypeInContext(gallivm->context, 32), "");
+		}
+		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
+	}
+
+	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
+	for (chan = 0; chan < 2; chan++) {
+		LLVMValueRef args[4];
+		LLVMValueRef llvm_chan;
+		unsigned schan;
+
+		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
+		llvm_chan = lp_build_const_int32(gallivm, schan);
+
+		args[0] = llvm_chan;
+		args[1] = attr_number;
+		args[2] = params;
+		args[3] = interp_param;
+
+		emit_data->output[chan] =
+			build_intrinsic(gallivm->builder, intr_name,
+					input_type, args, args[3] ? 4 : 3,
+					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+	}
+}
+
 /* Emit one vertex from the geometry shader */
 static void si_llvm_emit_vertex(
 	const struct lp_build_tgsi_action *action,
@@ -3073,6 +3301,11 @@ static const struct lp_build_tgsi_action txq_action = {
 	.intr_name = "llvm.SI.resinfo"
 };
 
+static const struct lp_build_tgsi_action interp_action = {
+	.fetch_args = interp_fetch_args,
+	.emit = build_interp_intrinsic,
+};
+
 static void create_meta_data(struct si_shader_context *si_shader_ctx)
 {
 	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -3269,7 +3502,9 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0))
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
 		si_shader_ctx->lds =
 			LLVMAddGlobalInAddressSpace(gallivm->module,
 						    LLVMArrayType(i32, 64),
@@ -3747,6 +3982,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
+	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
+	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
+	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
+
 	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;

From 82546729e3533c9a5ec0392585a60833bd93acca Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Mon, 27 Jul 2015 11:01:47 +1000
Subject: [PATCH 0684/1208] r600,radeonsi: GL_ARB_conditional_render_inverted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By using 'Tobias Klausmann' piglit test-suite patch. We obtain
a full 12/12 passes using this patch. By 'faking' to claim
support for this extension we obtain 7 fails and 5 passes.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Tested-by: Furkan Alaca <falaca@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 docs/GL3.txt                            |  2 +-
 docs/relnotes/10.7.0.html               |  1 +
 src/gallium/drivers/r600/r600_pipe.c    |  2 +-
 src/gallium/drivers/radeon/r600_query.c | 22 +++++++++++++---------
 src/gallium/drivers/radeonsi/si_pipe.c  |  2 +-
 5 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 258a6fb94e2..d438403b54a 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -189,7 +189,7 @@ GL 4.5, GLSL 4.50:
 
   GL_ARB_ES3_1_compatibility                           not started
   GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, llvmpipe, softpipe)
+  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_cull_distance                                 in progress (Tobias)
   GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_direct_state_access                           DONE (all drivers)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index afef5256b75..2df18c0532d 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
+<li>GL_ARB_conditional_render_inverted on r600, radeonsi</li>
 <li>GL_ARB_derivative_control on radeonsi</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index e8459288106..e755784209a 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -268,6 +268,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_SAMPLE_SHADING:
 	case PIPE_CAP_CLIP_HALFZ:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 		return 1;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -332,7 +333,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index a1d8241c513..6bf0271e14f 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -290,6 +290,13 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 					int operation, bool flag_wait)
 {
 	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	uint32_t op = PRED_OP(operation);
+
+	/* if true then invert, see GL_ARB_conditional_render_inverted */
+	if (ctx->current_render_cond_cond)
+		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
+	else
+		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
 
 	if (operation == PREDICATION_OP_CLEAR) {
 		ctx->need_gfx_cs_space(&ctx->b, 3, FALSE);
@@ -300,24 +307,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 	} else {
 		struct r600_query_buffer *qbuf;
 		unsigned count;
-		uint32_t op;
-
 		/* Find how many results there are. */
 		count = 0;
 		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
 			count += qbuf->results_end / query->result_size;
 		}
-
+	
 		ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE);
-
-		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
-				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
-
+	
+		op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+	
 		/* emit predicate packets for all data blocks */
 		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
 			unsigned results_base = 0;
 			uint64_t va = qbuf->buf->gpu_address;
-
+	
 			while (results_base < qbuf->results_end) {
 				radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
 				radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL);
@@ -325,7 +329,7 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 				r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
 						RADEON_PRIO_MIN);
 				results_base += query->result_size;
-
+	
 				/* set CONTINUE bit for all packets except the first */
 				op |= PREDICATION_CONTINUE;
 			}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ebe1f5a68f1..808b9bcdc84 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -250,6 +250,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -290,7 +291,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 		return 0;

From 768b4a25b95b95989dae3ff2f5a06172a2f4ab85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 23 Jul 2015 21:57:19 +0200
Subject: [PATCH 0685/1208] st/mesa: fix GLSL 1.30 texture shadow functions
 with the GL_ALPHA depth mode (v2)

Fixes piglit:
    spec@glsl-1.30@execution@fs-texture-sampler2dshadow-10
    spec@glsl-1.30@execution@fs-texture-sampler2dshadow-11

v2: use st_shader_stage_to_ptarget

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_atom_texture.c   | 74 +++++++++++++++-------
 src/mesa/state_tracker/st_context.h        | 23 +++++++
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 27 +-------
 3 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index e80f9893ad5..4422d9a1a52 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -103,7 +103,8 @@ swizzle_swizzle(unsigned swizzle1, unsigned swizzle2)
  */
 static unsigned
 compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
-                               enum pipe_format actualFormat)
+                               enum pipe_format actualFormat,
+                               unsigned glsl_version)
 {
    switch (baseFormat) {
    case GL_RGBA:
@@ -157,8 +158,26 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
       case GL_INTENSITY:
          return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
       case GL_ALPHA:
-         return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
-                              SWIZZLE_ZERO, SWIZZLE_X);
+         /* The texture(sampler*Shadow) functions from GLSL 1.30 ignore
+          * the depth mode and return float, while older shadow* functions
+          * and ARB_fp instructions return vec4 according to the depth mode.
+          *
+          * The problem with the GLSL 1.30 functions is that GL_ALPHA forces
+          * them to return 0, breaking them completely.
+          *
+          * A proper fix would increase code complexity and that's not worth
+          * it for a rarely used feature such as the GL_ALPHA depth mode
+          * in GL3. Therefore, change GL_ALPHA to GL_INTENSITY for all
+          * shaders that use GLSL 1.30 or later.
+          *
+          * BTW, it's required that sampler views are updated when
+          * shaders change (check_sampler_swizzle takes care of that).
+          */
+         if (glsl_version && glsl_version >= 130)
+            return SWIZZLE_XXXX;
+         else
+            return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
+                                 SWIZZLE_ZERO, SWIZZLE_X);
       case GL_RED:
          return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO,
                               SWIZZLE_ZERO, SWIZZLE_ONE);
@@ -174,7 +193,8 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
 
 
 static unsigned
-get_texture_format_swizzle(const struct st_texture_object *stObj)
+get_texture_format_swizzle(const struct st_texture_object *stObj,
+                           unsigned glsl_version)
 {
    GLenum baseFormat = _mesa_texture_base_format(&stObj->base);
    unsigned tex_swizzle;
@@ -182,7 +202,8 @@ get_texture_format_swizzle(const struct st_texture_object *stObj)
    if (baseFormat != GL_NONE) {
       tex_swizzle = compute_texture_format_swizzle(baseFormat,
                                                    stObj->base.DepthMode,
-                                                   stObj->pt->format);
+                                                   stObj->pt->format,
+                                                   glsl_version);
    }
    else {
       tex_swizzle = SWIZZLE_XYZW;
@@ -201,9 +222,9 @@ get_texture_format_swizzle(const struct st_texture_object *stObj)
  */
 static boolean
 check_sampler_swizzle(const struct st_texture_object *stObj,
-		      struct pipe_sampler_view *sv)
+		      struct pipe_sampler_view *sv, unsigned glsl_version)
 {
-   unsigned swizzle = get_texture_format_swizzle(stObj);
+   unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version);
 
    return ((sv->swizzle_r != GET_SWZ(swizzle, 0)) ||
            (sv->swizzle_g != GET_SWZ(swizzle, 1)) ||
@@ -232,10 +253,11 @@ static unsigned last_layer(struct st_texture_object *stObj)
 static struct pipe_sampler_view *
 st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 					  struct st_texture_object *stObj,
-					  enum pipe_format format)
+					  enum pipe_format format,
+                                          unsigned glsl_version)
 {
    struct pipe_sampler_view templ;
-   unsigned swizzle = get_texture_format_swizzle(stObj);
+   unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version);
 
    u_sampler_view_default_template(&templ,
                                    stObj->pt,
@@ -282,7 +304,8 @@ st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 static struct pipe_sampler_view *
 st_get_texture_sampler_view_from_stobj(struct st_context *st,
                                        struct st_texture_object *stObj,
-				       enum pipe_format format)
+				       enum pipe_format format,
+                                       unsigned glsl_version)
 {
    struct pipe_sampler_view **sv;
    const struct st_texture_image *firstImage;
@@ -304,7 +327,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
 
    /* if sampler view has changed dereference it */
    if (*sv) {
-      if (check_sampler_swizzle(stObj, *sv) ||
+      if (check_sampler_swizzle(stObj, *sv, glsl_version) ||
 	  (format != (*sv)->format) ||
           gl_target_to_pipe(stObj->base.Target) != (*sv)->target ||
           stObj->base.MinLevel + stObj->base.BaseLevel != (*sv)->u.tex.first_level ||
@@ -316,7 +339,8 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
    }
 
    if (!*sv) {
-      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, format);
+      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj,
+                                                      format, glsl_version);
 
    } else if ((*sv)->context != st->pipe) {
       /* Recreate view in correct context, use existing view as template */
@@ -332,7 +356,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
 static GLboolean
 update_single_texture(struct st_context *st,
                       struct pipe_sampler_view **sampler_view,
-		      GLuint texUnit)
+		      GLuint texUnit, unsigned glsl_version)
 {
    struct gl_context *ctx = st->ctx;
    const struct gl_sampler_object *samp;
@@ -372,8 +396,9 @@ update_single_texture(struct st_context *st,
       }
    }
 
-   *sampler_view = st_get_texture_sampler_view_from_stobj(st, stObj,
-                                                          view_format);
+   *sampler_view =
+      st_get_texture_sampler_view_from_stobj(st, stObj, view_format,
+                                             glsl_version);
    return GL_TRUE;
 }
 
@@ -381,7 +406,7 @@ update_single_texture(struct st_context *st,
 
 static void
 update_textures(struct st_context *st,
-                unsigned shader_stage,
+                gl_shader_stage mesa_shader,
                 const struct gl_program *prog,
                 unsigned max_units,
                 struct pipe_sampler_view **sampler_views,
@@ -390,6 +415,10 @@ update_textures(struct st_context *st,
    const GLuint old_max = *num_textures;
    GLbitfield samplers_used = prog->SamplersUsed;
    GLuint unit;
+   struct gl_shader_program *shader =
+      st->ctx->_Shader->CurrentProgram[mesa_shader];
+   unsigned glsl_version = shader ? shader->Version : 0;
+   unsigned shader_stage = st_shader_stage_to_ptarget(mesa_shader);
 
    if (samplers_used == 0x0 && old_max == 0)
       return;
@@ -404,7 +433,8 @@ update_textures(struct st_context *st,
          const GLuint texUnit = prog->SamplerUnits[unit];
          GLboolean retval;
 
-         retval = update_single_texture(st, &sampler_view, texUnit);
+         retval = update_single_texture(st, &sampler_view, texUnit,
+                                        glsl_version);
          if (retval == GL_FALSE)
             continue;
 
@@ -433,7 +463,7 @@ update_vertex_textures(struct st_context *st)
 
    if (ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits > 0) {
       update_textures(st,
-                      PIPE_SHADER_VERTEX,
+                      MESA_SHADER_VERTEX,
                       &ctx->VertexProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_VERTEX],
@@ -448,7 +478,7 @@ update_fragment_textures(struct st_context *st)
    const struct gl_context *ctx = st->ctx;
 
    update_textures(st,
-                   PIPE_SHADER_FRAGMENT,
+                   MESA_SHADER_FRAGMENT,
                    &ctx->FragmentProgram._Current->Base,
                    ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
                    st->state.sampler_views[PIPE_SHADER_FRAGMENT],
@@ -463,7 +493,7 @@ update_geometry_textures(struct st_context *st)
 
    if (ctx->GeometryProgram._Current) {
       update_textures(st,
-                      PIPE_SHADER_GEOMETRY,
+                      MESA_SHADER_GEOMETRY,
                       &ctx->GeometryProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_GEOMETRY],
@@ -479,7 +509,7 @@ update_tessctrl_textures(struct st_context *st)
 
    if (ctx->TessCtrlProgram._Current) {
       update_textures(st,
-                      PIPE_SHADER_TESS_CTRL,
+                      MESA_SHADER_TESS_CTRL,
                       &ctx->TessCtrlProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_TESS_CTRL],
@@ -495,7 +525,7 @@ update_tesseval_textures(struct st_context *st)
 
    if (ctx->TessEvalProgram._Current) {
       update_textures(st,
-                      PIPE_SHADER_TESS_EVAL,
+                      MESA_SHADER_TESS_EVAL,
                       &ctx->TessEvalProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_TESS_EVAL],
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index a68f881c81c..48c9b6f0bb9 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -279,6 +279,29 @@ st_fb_orientation(const struct gl_framebuffer *fb)
 }
 
 
+static inline unsigned
+st_shader_stage_to_ptarget(gl_shader_stage stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:
+      return PIPE_SHADER_VERTEX;
+   case MESA_SHADER_FRAGMENT:
+      return PIPE_SHADER_FRAGMENT;
+   case MESA_SHADER_GEOMETRY:
+      return PIPE_SHADER_GEOMETRY;
+   case MESA_SHADER_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case MESA_SHADER_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
+   case MESA_SHADER_COMPUTE:
+      return PIPE_SHADER_COMPUTE;
+   }
+
+   assert(!"should not be reached");
+   return PIPE_SHADER_VERTEX;
+}
+
+
 /** clear-alloc a struct-sized object, with casting */
 #define ST_CALLOC_STRUCT(T)   (struct T *) calloc(1, sizeof(struct T))
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 905d661a437..4c6e48aa222 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5693,29 +5693,6 @@ out:
 /* ----------------------------- End TGSI code ------------------------------ */
 
 
-static unsigned
-shader_stage_to_ptarget(gl_shader_stage stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      return PIPE_SHADER_VERTEX;
-   case MESA_SHADER_FRAGMENT:
-      return PIPE_SHADER_FRAGMENT;
-   case MESA_SHADER_GEOMETRY:
-      return PIPE_SHADER_GEOMETRY;
-   case MESA_SHADER_TESS_CTRL:
-      return PIPE_SHADER_TESS_CTRL;
-   case MESA_SHADER_TESS_EVAL:
-      return PIPE_SHADER_TESS_EVAL;
-   case MESA_SHADER_COMPUTE:
-      return PIPE_SHADER_COMPUTE;
-   }
-
-   assert(!"should not be reached");
-   return PIPE_SHADER_VERTEX;
-}
-
-
 /**
  * Convert a shader's GLSL IR into a Mesa gl_program, although without
  * generating Mesa IR.
@@ -5732,7 +5709,7 @@ get_mesa_program(struct gl_context *ctx,
    struct gl_shader_compiler_options *options =
          &ctx->Const.ShaderCompilerOptions[_mesa_shader_enum_to_shader_stage(shader->Type)];
    struct pipe_screen *pscreen = ctx->st->pipe->screen;
-   unsigned ptarget = shader_stage_to_ptarget(shader->Stage);
+   unsigned ptarget = st_shader_stage_to_ptarget(shader->Stage);
 
    validate_ir_tree(shader->ir);
 
@@ -5921,7 +5898,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       gl_shader_stage stage = _mesa_shader_enum_to_shader_stage(prog->_LinkedShaders[i]->Type);
       const struct gl_shader_compiler_options *options =
             &ctx->Const.ShaderCompilerOptions[stage];
-      unsigned ptarget = shader_stage_to_ptarget(stage);
+      unsigned ptarget = st_shader_stage_to_ptarget(stage);
       bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
                                                    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
       bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,

From 6ca3ff982a9e6a54286158b457d479715be5ab17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 20:25:18 +0200
Subject: [PATCH 0686/1208] st/mesa: add shader dumping for shader-db

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 66 ++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 4c6e48aa222..6f007273c73 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5877,6 +5877,71 @@ get_mesa_program(struct gl_context *ctx,
 
 extern "C" {
 
+static void
+st_dump_program_for_shader_db(struct gl_context *ctx,
+                              struct gl_shader_program *prog)
+{
+   /* Dump only successfully compiled and linked shaders to the specified
+    * file. This is for shader-db.
+    *
+    * These options allow some pre-processing of shaders while dumping,
+    * because some apps have ill-formed shaders.
+    */
+   const char *dump_filename = os_get_option("ST_DUMP_SHADERS");
+   const char *insert_directives = os_get_option("ST_DUMP_INSERT");
+
+   if (dump_filename && prog->Name != 0) {
+      FILE *f = fopen(dump_filename, "a");
+
+      if (f) {
+         for (unsigned i = 0; i < prog->NumShaders; i++) {
+            const struct gl_shader *sh = prog->Shaders[i];
+            const char *source;
+            bool skip_version = false;
+
+            if (!sh)
+               continue;
+
+            source = sh->Source;
+
+            /* This string mustn't be changed. shader-db uses it to find
+             * where the shader begins.
+             */
+            fprintf(f, "GLSL %s shader %d source for linked program %d:\n",
+                    _mesa_shader_stage_to_string(sh->Stage),
+                    i, prog->Name);
+
+            /* Dump the forced version if set. */
+            if (ctx->Const.ForceGLSLVersion) {
+               fprintf(f, "#version %i\n", ctx->Const.ForceGLSLVersion);
+               skip_version = true;
+            }
+
+            /* Insert directives (optional). */
+            if (insert_directives) {
+               if (!ctx->Const.ForceGLSLVersion && prog->Version)
+                  fprintf(f, "#version %i\n", prog->Version);
+               fprintf(f, "%s\n", insert_directives);
+               skip_version = true;
+            }
+
+            if (skip_version && strncmp(source, "#version ", 9) == 0) {
+               const char *next_line = strstr(source, "\n");
+
+               if (next_line)
+                  source = next_line + 1;
+               else
+                  continue;
+            }
+
+            fprintf(f, "%s", source);
+            fprintf(f, "\n");
+         }
+         fclose(f);
+      }
+   }
+}
+
 /**
  * Link a shader.
  * Called via ctx->Driver.LinkShader()
@@ -5997,6 +6062,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       _mesa_reference_program(ctx, &linked_prog, NULL);
    }
 
+   st_dump_program_for_shader_db(ctx, prog);
    return GL_TRUE;
 }
 

From 72f31c63d7b73abcdf47bc303d09987f299aff7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 17:26:10 +0200
Subject: [PATCH 0687/1208] st/mesa: remove st_finalize_textures atom

It only checks fragment textures and ignores other shaders, which makes it
incomplete, and textures are already finalized in update_single_texture.

There are no piglit regressions.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_atom.c         |  1 -
 src/mesa/state_tracker/st_atom.h         |  1 -
 src/mesa/state_tracker/st_atom_texture.c | 45 ------------------------
 3 files changed, 47 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 5fc1a77ce60..43dbadd4a7e 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -46,7 +46,6 @@ static const struct st_tracked_state *atoms[] =
    &st_update_depth_stencil_alpha,
    &st_update_clip,
 
-   &st_finalize_textures,
    &st_update_fp,
    &st_update_gp,
    &st_update_tep,
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 5735ca6930d..a24842baa4f 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -68,7 +68,6 @@ extern const struct st_tracked_state st_update_vertex_texture;
 extern const struct st_tracked_state st_update_geometry_texture;
 extern const struct st_tracked_state st_update_tessctrl_texture;
 extern const struct st_tracked_state st_update_tesseval_texture;
-extern const struct st_tracked_state st_finalize_textures;
 extern const struct st_tracked_state st_update_fs_constants;
 extern const struct st_tracked_state st_update_gs_constants;
 extern const struct st_tracked_state st_update_tes_constants;
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index 4422d9a1a52..31e0f6ba06c 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -582,48 +582,3 @@ const struct st_tracked_state st_update_tesseval_texture = {
    },
    update_tesseval_textures				/* update */
 };
-
-
-
-static void
-finalize_textures(struct st_context *st)
-{
-   struct gl_context *ctx = st->ctx;
-   struct gl_fragment_program *fprog = ctx->FragmentProgram._Current;
-   const GLboolean prev_missing_textures = st->missing_textures;
-   GLuint su;
-
-   st->missing_textures = GL_FALSE;
-
-   for (su = 0; su < ctx->Const.MaxTextureCoordUnits; su++) {
-      if (fprog->Base.SamplersUsed & (1 << su)) {
-         const GLuint texUnit = fprog->Base.SamplerUnits[su];
-         struct gl_texture_object *texObj
-            = ctx->Texture.Unit[texUnit]._Current;
-
-         if (texObj) {
-            GLboolean retval;
-
-            retval = st_finalize_texture(ctx, st->pipe, texObj);
-            if (!retval) {
-               /* out of mem */
-               st->missing_textures = GL_TRUE;
-               continue;
-            }
-         }
-      }
-   }
-
-   if (prev_missing_textures != st->missing_textures)
-      st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
-}
-
-
-const struct st_tracked_state st_finalize_textures = {
-   "st_finalize_textures",		/* name */
-   {					/* dirty */
-      _NEW_TEXTURE,			/* mesa */
-      0,				/* st */
-   },
-   finalize_textures			/* update */
-};

From 5142564734bd68f165b02e29e384ebbcf91cce38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 28 Jul 2015 20:41:16 +0200
Subject: [PATCH 0688/1208] st/mesa: remove st_context::missing textures and
 get_passthrough_fs

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_atom_shader.c | 29 ++-----------------------
 src/mesa/state_tracker/st_context.h     |  1 -
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index d27882def7b..fee15a980f3 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -49,24 +49,6 @@
 #include "st_program.h"
 
 
-/**
- * Return pointer to a pass-through fragment shader.
- * This shader is used when a texture is missing/incomplete.
- */
-static void *
-get_passthrough_fs(struct st_context *st)
-{
-   if (!st->passthrough_fs) {
-      st->passthrough_fs =
-         util_make_fragment_passthrough_shader(st->pipe, TGSI_SEMANTIC_COLOR,
-                                               TGSI_INTERPOLATE_PERSPECTIVE,
-                                               TRUE);
-   }
-
-   return st->passthrough_fs;
-}
-
-
 /**
  * Update fragment program state/atom.  This involves translating the
  * Mesa fragment program into a gallium fragment program and binding it.
@@ -96,15 +78,8 @@ update_fp( struct st_context *st )
 
    st_reference_fragprog(st, &st->fp, stfp);
 
-   if (st->missing_textures) {
-      /* use a pass-through frag shader that uses no textures */
-      void *fs = get_passthrough_fs(st);
-      cso_set_fragment_shader_handle(st->cso_context, fs);
-   }
-   else {
-      cso_set_fragment_shader_handle(st->cso_context,
-                                     st->fp_variant->driver_shader);
-   }
+   cso_set_fragment_shader_handle(st->cso_context,
+                                  st->fp_variant->driver_shader);
 }
 
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 48c9b6f0bb9..81d5480431a 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -140,7 +140,6 @@ struct st_context
 
    struct st_state_flags dirty;
 
-   GLboolean missing_textures;
    GLboolean vertdata_edgeflags;
    GLboolean edgeflag_culls_prims;
 

From bf4019a1c89755af94218055e86544f7823dc4ac Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 9 Jul 2015 16:33:59 +1000
Subject: [PATCH 0689/1208] radeon: add streamout status 1-3 queries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for queries against the non-0 vertex streams.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeon/r600_query.c   | 18 ++++++++++++++++--
 src/gallium/drivers/radeon/r600d_common.h |  3 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 6bf0271e14f..909d5029bfa 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -54,6 +54,8 @@ struct r600_query {
 	uint64_t end_result;
 	/* Fence for GPU_FINISHED. */
 	struct pipe_fence_handle *fence;
+	/* For transform feedback: which stream the query is for */
+	unsigned stream;
 };
 
 
@@ -155,6 +157,17 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
 	}
 }
 
+static unsigned event_type_for_stream(struct r600_query *query)
+{
+	switch (query->stream) {
+	default:
+	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
+	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
+	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
+	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
+	}
+}
+
 static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
 {
 	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
@@ -189,7 +202,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	case PIPE_QUERY_SO_STATISTICS:
 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32UL) & 0xFF);
 		break;
@@ -246,7 +259,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		va += query->buffer.results_end + query->result_size/2;
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32UL) & 0xFF);
 		break;
@@ -371,6 +384,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
 		query->result_size = 32;
 		query->num_cs_dw = 6;
+		query->stream = index;
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		/* 11 values on EG, 8 on R600. */
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index 74c8d8782a6..5a56a5454be 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -66,6 +66,9 @@
 #define PKT3_SET_SH_REG                        0x76 /* SI and later */
 #define PKT3_SET_UCONFIG_REG                   0x79 /* CIK and later */
 
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS1      0x1 /* EG and later */
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS2      0x2 /* EG and later */
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS3      0x3 /* EG and later */
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
 #define EVENT_TYPE_ZPASS_DONE                  0x15

From 736c6f3cfc2c69e3c29268d4ebb7110dd36ac97f Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 23 Jul 2015 17:26:56 -0700
Subject: [PATCH 0690/1208] meta/copy_image: Stash off the scissor

The meta CopyImageSubData path uses BlitFramebuffers to do the actual copy.
The only thing that can affect BlitFramebuffers other than the currently
bound framebuffers is the scissor so we need to save that off and reset it.
If we don't do this, applications that use a scissor together with
CopyImageSubData will get accidentally scissored copies.

Tested-by: Markus Wick <markus at selfnet.de>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/drivers/common/meta_copy_image.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c
index 1729766f78d..149ed18503c 100644
--- a/src/mesa/drivers/common/meta_copy_image.c
+++ b/src/mesa/drivers/common/meta_copy_image.c
@@ -138,8 +138,8 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx,
          goto cleanup;
    }
 
-   /* We really only need to stash the bound framebuffers. */
-   _mesa_meta_begin(ctx, 0);
+   /* We really only need to stash the bound framebuffers and scissor. */
+   _mesa_meta_begin(ctx, MESA_META_SCISSOR);
 
    _mesa_GenFramebuffers(2, fbos);
    _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, fbos[0]);

From b0193adbe9403545b0d9f7c7f24a1c30f1491a48 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 27 Jul 2015 23:15:39 -0700
Subject: [PATCH 0691/1208] vc4: Fix bus errors on dumping CL on hardware.

The kernel can't fixup unaligned float traps for us, so deref as a
uint32_t first.
---
 src/gallium/drivers/vc4/vc4_cl_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 64de79cc830..e153a243090 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset)
         void *f = cl + offset;
 
         fprintf(stderr, "0x%08x 0x%08x:      %f (0x%08x)\n",
-                offset, hw_offset, *(float *)f, *(uint32_t *)f);
+                offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f);
 }
 
 static void

From 95faf2c6397ce231e94176d18cf8fd2c3265bb8a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 27 Jul 2015 23:23:57 -0700
Subject: [PATCH 0692/1208] vc4: Fix raster surface shadow updates under DRI2.

Glamor asks GBM for the handle of the BO, then flinks it itself.  We
were marking the bo non-private in the flink and dmabuf (DRI3) paths,
but not the GEM handle path.  As a result, non-pageflipping DRI2
swapbuffers (EGL apps, in particular) were never updating the texture.
---
 src/gallium/drivers/vc4/vc4_screen.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index c069454c2d1..a20818da4d2 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -490,6 +490,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen,
 {
         whandle->stride = stride;
 
+        /* If we're passing some reference to our BO out to some other part of
+         * the system, then we can't do any optimizations about only us being
+         * the ones seeing it (like BO caching or shadow update avoidance).
+         */
+        bo->private = false;
+
         switch (whandle->type) {
         case DRM_API_HANDLE_TYPE_SHARED:
                 return vc4_bo_flink(bo, &whandle->handle);

From 601733da6708722ceedd35afc7727c28779012f7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 00:05:33 -0700
Subject: [PATCH 0693/1208] vc4: Drop NV shader reloc validation.

It wasn't validating enough, and we don't need the packet.
---
 src/gallium/drivers/vc4/kernel/vc4_drv.h      |   1 -
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 191 ++++++------------
 2 files changed, 66 insertions(+), 126 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 1fd8aa9fb28..8dc3c11d2d6 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -72,7 +72,6 @@ struct vc4_exec_info {
 	 * command lists.
 	 */
 	struct vc4_shader_state {
-		uint8_t packet;
 		uint32_t addr;
 		/* Maximum vertex index referenced by any primitive using this
 		 * shader state.
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 1d457b80d52..b3d46212fc8 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -329,7 +329,6 @@ validate_gl_shader_state(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
-	exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE;
 	exec->shader_state[i].addr = *(uint32_t *)untrusted;
 	exec->shader_state[i].max_index = 0;
 
@@ -347,31 +346,6 @@ validate_gl_shader_state(VALIDATE_ARGS)
 	return 0;
 }
 
-static int
-validate_nv_shader_state(VALIDATE_ARGS)
-{
-	uint32_t i = exec->shader_state_count++;
-
-	if (i >= exec->shader_state_size) {
-		DRM_ERROR("More requests for shader states than declared\n");
-		return -EINVAL;
-	}
-
-	exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE;
-	exec->shader_state[i].addr = *(uint32_t *)untrusted;
-
-	if (exec->shader_state[i].addr & 15) {
-		DRM_ERROR("NV shader state address 0x%08x misaligned\n",
-			  exec->shader_state[i].addr);
-		return -EINVAL;
-	}
-
-	*(uint32_t *)validated = (exec->shader_state[i].addr +
-				  exec->shader_rec_p);
-
-	return 0;
-}
-
 static int
 validate_tile_binning_config(VALIDATE_ARGS)
 {
@@ -488,7 +462,7 @@ static const struct cmd_info {
 	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
 
 	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
-	VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state),
+	/* We don't support validating NV shader states. */
 
 	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
@@ -758,51 +732,29 @@ reloc_tex(struct vc4_exec_info *exec,
 }
 
 static int
-validate_shader_rec(struct drm_device *dev,
-		    struct vc4_exec_info *exec,
-		    struct vc4_shader_state *state)
+validate_gl_shader_rec(struct drm_device *dev,
+		       struct vc4_exec_info *exec,
+		       struct vc4_shader_state *state)
 {
 	uint32_t *src_handles;
 	void *pkt_u, *pkt_v;
-	enum shader_rec_reloc_type {
-		RELOC_CODE,
-		RELOC_VBO,
+	static const uint32_t shader_reloc_offsets[] = {
+		4, /* fs */
+		16, /* vs */
+		28, /* cs */
 	};
-	struct shader_rec_reloc {
-		enum shader_rec_reloc_type type;
-		uint32_t offset;
-	};
-	static const struct shader_rec_reloc gl_relocs[] = {
-		{ RELOC_CODE, 4 },  /* fs */
-		{ RELOC_CODE, 16 }, /* vs */
-		{ RELOC_CODE, 28 }, /* cs */
-	};
-	static const struct shader_rec_reloc nv_relocs[] = {
-		{ RELOC_CODE, 4 }, /* fs */
-		{ RELOC_VBO, 12 }
-	};
-	const struct shader_rec_reloc *relocs;
-	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
-	uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size;
+	uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets);
+	struct drm_gem_cma_object *bo[shader_reloc_count + 8];
+	uint32_t nr_attributes, nr_relocs, packet_size;
 	int i;
 	struct vc4_validated_shader_info *validated_shader = NULL;
 
-	if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
-		relocs = nv_relocs;
-		nr_fixed_relocs = ARRAY_SIZE(nv_relocs);
-
-		packet_size = 16;
-	} else {
-		relocs = gl_relocs;
-		nr_fixed_relocs = ARRAY_SIZE(gl_relocs);
-
-		nr_attributes = state->addr & 0x7;
-		if (nr_attributes == 0)
-			nr_attributes = 8;
-		packet_size = gl_shader_rec_size(state->addr);
-	}
-	nr_relocs = nr_fixed_relocs + nr_attributes;
+	nr_attributes = state->addr & 0x7;
+	if (nr_attributes == 0)
+		nr_attributes = 8;
+	packet_size = gl_shader_rec_size(state->addr);
 
+	nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes;
 	if (nr_relocs * 4 > exec->shader_rec_size) {
 		DRM_ERROR("overflowed shader recs reading %d handles "
 			  "from %d bytes left\n",
@@ -832,21 +784,17 @@ validate_shader_rec(struct drm_device *dev,
 	exec->shader_rec_v += roundup(packet_size, 16);
 	exec->shader_rec_size -= packet_size;
 
-	for (i = 0; i < nr_relocs; i++) {
-		enum vc4_bo_mode mode;
-
-		if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE)
-			mode = VC4_MODE_SHADER;
-		else
-			mode = VC4_MODE_RENDER;
-
-		if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) {
+	for (i = 0; i < shader_reloc_count; i++) {
+		if (!vc4_use_bo(exec, src_handles[i], VC4_MODE_SHADER, &bo[i]))
+			return false;
+	}
+	for (i = shader_reloc_count; i < nr_relocs; i++) {
+		if (!vc4_use_bo(exec, src_handles[i], VC4_MODE_RENDER, &bo[i]))
 			return false;
-		}
 	}
 
-	for (i = 0; i < nr_fixed_relocs; i++) {
-		uint32_t o = relocs[i].offset;
+	for (i = 0; i < shader_reloc_count; i++) {
+		uint32_t o = shader_reloc_offsets[i];
 		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
 		uint32_t *texture_handles_u;
 		void *uniform_data_u;
@@ -854,58 +802,51 @@ validate_shader_rec(struct drm_device *dev,
 
 		*(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
 
-		switch (relocs[i].type) {
-		case RELOC_CODE:
-			if (src_offset != 0) {
-				DRM_ERROR("Shaders must be at offset 0 of "
-					  "the BO.\n");
-				goto fail;
-			}
-
-			kfree(validated_shader);
-			validated_shader = vc4_validate_shader(bo[i]);
-			if (!validated_shader)
-				goto fail;
-
-			if (validated_shader->uniforms_src_size >
-			    exec->uniforms_size) {
-				DRM_ERROR("Uniforms src buffer overflow\n");
-				goto fail;
-			}
-
-			texture_handles_u = exec->uniforms_u;
-			uniform_data_u = (texture_handles_u +
-					  validated_shader->num_texture_samples);
-
-			memcpy(exec->uniforms_v, uniform_data_u,
-			       validated_shader->uniforms_size);
-
-			for (tex = 0;
-			     tex < validated_shader->num_texture_samples;
-			     tex++) {
-				if (!reloc_tex(exec,
-					       uniform_data_u,
-					       &validated_shader->texture_samples[tex],
-					       texture_handles_u[tex])) {
-					goto fail;
-				}
-			}
-
-			*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
-
-			exec->uniforms_u += validated_shader->uniforms_src_size;
-			exec->uniforms_v += validated_shader->uniforms_size;
-			exec->uniforms_p += validated_shader->uniforms_size;
-
-			break;
-
-		case RELOC_VBO:
-			break;
+		if (src_offset != 0) {
+			DRM_ERROR("Shaders must be at offset 0 of "
+				  "the BO.\n");
+			goto fail;
 		}
+
+		kfree(validated_shader);
+		validated_shader = vc4_validate_shader(bo[i]);
+		if (!validated_shader)
+			goto fail;
+
+		if (validated_shader->uniforms_src_size >
+		    exec->uniforms_size) {
+			DRM_ERROR("Uniforms src buffer overflow\n");
+			goto fail;
+		}
+
+		texture_handles_u = exec->uniforms_u;
+		uniform_data_u = (texture_handles_u +
+				  validated_shader->num_texture_samples);
+
+		memcpy(exec->uniforms_v, uniform_data_u,
+		       validated_shader->uniforms_size);
+
+		for (tex = 0;
+		     tex < validated_shader->num_texture_samples;
+		     tex++) {
+			if (!reloc_tex(exec,
+				       uniform_data_u,
+				       &validated_shader->texture_samples[tex],
+				       texture_handles_u[tex])) {
+				goto fail;
+			}
+		}
+
+		*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
+
+		exec->uniforms_u += validated_shader->uniforms_src_size;
+		exec->uniforms_v += validated_shader->uniforms_size;
+		exec->uniforms_p += validated_shader->uniforms_size;
 	}
 
 	for (i = 0; i < nr_attributes; i++) {
-		struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i];
+		struct drm_gem_cma_object *vbo =
+			bo[ARRAY_SIZE(shader_reloc_offsets) + i];
 		uint32_t o = 36 + i * 8;
 		uint32_t offset = *(uint32_t *)(pkt_u + o + 0);
 		uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1;
@@ -952,7 +893,7 @@ vc4_validate_shader_recs(struct drm_device *dev,
 	int ret = 0;
 
 	for (i = 0; i < exec->shader_state_count; i++) {
-		ret = validate_shader_rec(dev, exec, &exec->shader_state[i]);
+		ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]);
 		if (ret)
 			return ret;
 	}

From cbb7477e8a796211b664ff7e47334cb1b642556d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 00:29:31 -0700
Subject: [PATCH 0694/1208] vc4: Ensure that the bin CL is properly capped by
 increment/flush.

We don't want anything to appear after we've kicked off the render (and
thus job flush), since that might then get written out to the tile
allocation state.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/kernel/vc4_drv.h      |  4 ++
 src/gallium/drivers/vc4/kernel/vc4_gem.c      |  2 +
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 56 ++++++++++---------
 3 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 8dc3c11d2d6..5c8617939c8 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -87,6 +87,7 @@ struct vc4_exec_info {
 	bool found_tile_binning_mode_config_packet;
 	bool found_start_tile_binning_packet;
 	bool found_increment_semaphore_packet;
+	bool found_flush;
 	uint8_t bin_tiles_x, bin_tiles_y;
 	struct drm_gem_cma_object *tile_bo;
 	uint32_t tile_alloc_offset;
@@ -98,6 +99,9 @@ struct vc4_exec_info {
 	uint32_t ct0ca, ct0ea;
 	uint32_t ct1ca, ct1ea;
 
+	/* Pointer to the unvalidated bin CL (if present). */
+	void *bin_u;
+
 	/* Pointers to the shader recs.  These paddr gets incremented as CL
 	 * packets are relocated in validate_gl_shader_state, and the vaddrs
 	 * (u and v) get incremented and size decremented as the shader recs
diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c
index e4b7fea5968..93f9ec7ed9b 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_gem.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c
@@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 
 	exec->ct0ca = exec->exec_bo->paddr + bin_offset;
 
+	exec->bin_u = bin;
+
 	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
 	exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
 	exec->shader_rec_size = args->shader_rec_size;
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index b3d46212fc8..c57ebecbb24 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -132,6 +132,15 @@ vc4_use_handle(struct vc4_exec_info *exec,
 			  mode, obj);
 }
 
+static bool
+validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos)
+{
+	/* Note that the untrusted pointer passed to these functions is
+	 * incremented past the packet byte.
+	 */
+	return (untrusted - 1 == exec->bin_u + pos);
+}
+
 static uint32_t
 gl_shader_rec_size(uint32_t pointer_bits)
 {
@@ -201,14 +210,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
 	return true;
 }
 
+
 static int
-validate_flush_all(VALIDATE_ARGS)
+validate_flush(VALIDATE_ARGS)
 {
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("VC4_PACKET_FLUSH_ALL after "
-			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
+	if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) {
+		DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n");
+		return false;
 	}
+	exec->found_flush = true;
 
 	return 0;
 }
@@ -233,17 +243,13 @@ validate_start_tile_binning(VALIDATE_ARGS)
 static int
 validate_increment_semaphore(VALIDATE_ARGS)
 {
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n");
+	if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) {
+		DRM_ERROR("Bin CL must end with "
+			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
 		return -EINVAL;
 	}
 	exec->found_increment_semaphore_packet = true;
 
-	/* Once we've found the semaphore increment, there should be one FLUSH
-	 * then the end of the command list.  The FLUSH actually triggers the
-	 * increment, so we only need to make sure there
-	 */
-
 	return 0;
 }
 
@@ -257,11 +263,6 @@ validate_indexed_prim_list(VALIDATE_ARGS)
 	uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1;
 	struct vc4_shader_state *shader_state;
 
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
-	}
-
 	/* Check overflow condition */
 	if (exec->shader_state_count == 0) {
 		DRM_ERROR("shader state must precede primitives\n");
@@ -295,11 +296,6 @@ validate_gl_array_primitive(VALIDATE_ARGS)
 	uint32_t max_index;
 	struct vc4_shader_state *shader_state;
 
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
-	}
-
 	/* Check overflow condition */
 	if (exec->shader_state_count == 0) {
 		DRM_ERROR("shader state must precede primitives\n");
@@ -447,8 +443,8 @@ static const struct cmd_info {
 } cmd_info[] = {
 	VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
 	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
 
@@ -554,8 +550,16 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		return -EINVAL;
 	}
 
-	if (!exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+	/* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH.  The
+	 * semaphore is used to trigger the render CL to start up, and the
+	 * FLUSH is what caps the bin lists with
+	 * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main
+	 * render CL when they get called to) and actually triggers the queued
+	 * semaphore increment.
+	 */
+	if (!exec->found_increment_semaphore_packet || !exec->found_flush) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + "
+			  "VC4_PACKET_FLUSH\n");
 		return -EINVAL;
 	}
 

From 22954db71cd1d8d9ef6e5a16f568e4b3c7845777 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 09:51:37 -0700
Subject: [PATCH 0695/1208] vc4: Make the object be the return value from
 vc4_use_bo().

Drops 40 bytes of code from validation.
---
 src/gallium/drivers/vc4/kernel/vc4_drv.h      |  7 ++--
 .../drivers/vc4/kernel/vc4_render_cl.c        |  6 ++--
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 35 ++++++++++---------
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 5c8617939c8..127a366fb3c 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -171,10 +171,9 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
 
-bool vc4_use_bo(struct vc4_exec_info *exec,
-		uint32_t hindex,
-		enum vc4_bo_mode mode,
-		struct drm_gem_cma_object **obj);
+struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec,
+				      uint32_t hindex,
+				      enum vc4_bo_mode mode);
 
 int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
 
diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
index f55ffe5a8db..a0681042155 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+	*obj = vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER);
+	if (!*obj)
 		return -EINVAL;
 
 	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
@@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+	*obj = vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER);
+	if (!*obj)
 		return -EINVAL;
 
 	if (tiling > VC4_TILING_FORMAT_LT) {
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index c57ebecbb24..321e8115f8f 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -94,19 +94,19 @@ size_is_lt(uint32_t width, uint32_t height, int cpp)
 		height <= 4 * utile_height(cpp));
 }
 
-bool
+struct drm_gem_cma_object *
 vc4_use_bo(struct vc4_exec_info *exec,
 	   uint32_t hindex,
-	   enum vc4_bo_mode mode,
-	   struct drm_gem_cma_object **obj)
+	   enum vc4_bo_mode mode)
 {
-	*obj = NULL;
+	struct drm_gem_cma_object *obj;
 
 	if (hindex >= exec->bo_count) {
 		DRM_ERROR("BO index %d greater than BO count %d\n",
 			  hindex, exec->bo_count);
-		return false;
+		return NULL;
 	}
+	obj = exec->bo[hindex].bo;
 
 	if (exec->bo[hindex].mode != mode) {
 		if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
@@ -114,22 +114,19 @@ vc4_use_bo(struct vc4_exec_info *exec,
 		} else {
 			DRM_ERROR("BO index %d reused with mode %d vs %d\n",
 				  hindex, exec->bo[hindex].mode, mode);
-			return false;
+			return NULL;
 		}
 	}
 
-	*obj = exec->bo[hindex].bo;
-	return true;
+	return obj;
 }
 
-static bool
+static struct drm_gem_cma_object *
 vc4_use_handle(struct vc4_exec_info *exec,
 	       uint32_t gem_handles_packet_index,
-	       enum vc4_bo_mode mode,
-	       struct drm_gem_cma_object **obj)
+	       enum vc4_bo_mode mode)
 {
-	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index],
-			  mode, obj);
+	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], mode);
 }
 
 static bool
@@ -273,7 +270,8 @@ validate_indexed_prim_list(VALIDATE_ARGS)
 	if (max_index > shader_state->max_index)
 		shader_state->max_index = max_index;
 
-	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib))
+	ib = vc4_use_handle(exec, 0, VC4_MODE_RENDER);
+	if (!ib)
 		return -EINVAL;
 
 	if (offset > ib->base.size ||
@@ -590,7 +588,8 @@ reloc_tex(struct vc4_exec_info *exec,
 	uint32_t cube_map_stride = 0;
 	enum vc4_texture_data_type type;
 
-	if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+	tex = vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER);
+	if (!tex)
 		return false;
 
 	if (sample->is_direct) {
@@ -789,11 +788,13 @@ validate_gl_shader_rec(struct drm_device *dev,
 	exec->shader_rec_size -= packet_size;
 
 	for (i = 0; i < shader_reloc_count; i++) {
-		if (!vc4_use_bo(exec, src_handles[i], VC4_MODE_SHADER, &bo[i]))
+		bo[i] = vc4_use_bo(exec, src_handles[i], VC4_MODE_SHADER);
+		if (!bo[i])
 			return false;
 	}
 	for (i = shader_reloc_count; i < nr_relocs; i++) {
-		if (!vc4_use_bo(exec, src_handles[i], VC4_MODE_RENDER, &bo[i]))
+		bo[i] = vc4_use_bo(exec, src_handles[i], VC4_MODE_RENDER);
+		if (!bo[i])
 			return false;
 	}
 

From 044f7bbda077ea7029fb1004183b29127307bd84 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 10:11:08 -0700
Subject: [PATCH 0696/1208] vc4: Keep the validated shader around for the
 simulator execution.

This more closely matches the kernel behavior on shader validation now.
---
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 19 ++++++-------------
 src/gallium/drivers/vc4/vc4_simulator.c       | 10 ++++++++++
 .../drivers/vc4/vc4_simulator_validate.h      |  1 +
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 321e8115f8f..49bb64838ea 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -750,7 +750,6 @@ validate_gl_shader_rec(struct drm_device *dev,
 	struct drm_gem_cma_object *bo[shader_reloc_count + 8];
 	uint32_t nr_attributes, nr_relocs, packet_size;
 	int i;
-	struct vc4_validated_shader_info *validated_shader = NULL;
 
 	nr_attributes = state->addr & 0x7;
 	if (nr_attributes == 0)
@@ -799,6 +798,7 @@ validate_gl_shader_rec(struct drm_device *dev,
 	}
 
 	for (i = 0; i < shader_reloc_count; i++) {
+		struct vc4_validated_shader_info *validated_shader;
 		uint32_t o = shader_reloc_offsets[i];
 		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
 		uint32_t *texture_handles_u;
@@ -810,18 +810,17 @@ validate_gl_shader_rec(struct drm_device *dev,
 		if (src_offset != 0) {
 			DRM_ERROR("Shaders must be at offset 0 of "
 				  "the BO.\n");
-			goto fail;
+			return -EINVAL;
 		}
 
-		kfree(validated_shader);
-		validated_shader = vc4_validate_shader(bo[i]);
+		validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader;
 		if (!validated_shader)
-			goto fail;
+			return -EINVAL;
 
 		if (validated_shader->uniforms_src_size >
 		    exec->uniforms_size) {
 			DRM_ERROR("Uniforms src buffer overflow\n");
-			goto fail;
+			return -EINVAL;
 		}
 
 		texture_handles_u = exec->uniforms_u;
@@ -838,7 +837,7 @@ validate_gl_shader_rec(struct drm_device *dev,
 				       uniform_data_u,
 				       &validated_shader->texture_samples[tex],
 				       texture_handles_u[tex])) {
-				goto fail;
+				return -EINVAL;
 			}
 		}
 
@@ -881,13 +880,7 @@ validate_gl_shader_rec(struct drm_device *dev,
 		*(uint32_t *)(pkt_v + o) = vbo->paddr + offset;
 	}
 
-	kfree(validated_shader);
-
 	return 0;
-
-fail:
-	kfree(validated_shader);
-	return -EINVAL;
 }
 
 int
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index b58013dd2ee..4097dce28a7 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -79,6 +79,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
                 struct vc4_bo *bo = bos[i];
                 struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
 
+                struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
 #if 0
                 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
 #endif
@@ -87,6 +88,15 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
                 memcpy(obj->vaddr, bo->map, bo->size);
 
                 exec->bo[i].bo = obj;
+
+                /* The kernel does this validation at shader create ioctl
+                 * time.
+                 */
+                if (strcmp(bo->name, "code") == 0) {
+                        drm_bo->validated_shader = vc4_validate_shader(obj);
+                        if (!drm_bo->validated_shader)
+                                abort();
+                }
         }
         return 0;
 }
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 2bb36b253bb..68ace0216aa 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -78,6 +78,7 @@ struct drm_gem_cma_object {
 struct drm_vc4_bo {
         struct drm_gem_cma_object base;
         struct vc4_bo *bo;
+        struct vc4_validated_shader_info *validated_shader;
         struct list_head unref_head;
 };
 

From 1f5e070dd7ddd344a913f2f5daddebb4c51abb8a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 10:20:10 -0700
Subject: [PATCH 0697/1208] vc4: Simplify vc4_use_bo and make sure it's not a
 shader.

Since the conversion to keeping validated shaders around for the BO's
lifetime, we haven't been checking that rendering doesn't happen to
shaders.  Make vc4_use_bo check that always, and just don't use it for the
VC4_MODE_SHADER case (so now modes are unused)
---
 src/gallium/drivers/vc4/kernel/vc4_drv.h      | 16 +-------
 .../drivers/vc4/kernel/vc4_render_cl.c        |  4 +-
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 39 +++++++++----------
 src/gallium/drivers/vc4/vc4_simulator.c       |  6 +--
 4 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 127a366fb3c..ffc973735ae 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -26,17 +26,6 @@
 
 #include "vc4_simulator_validate.h"
 
-enum vc4_bo_mode {
-	VC4_MODE_UNDECIDED,
-	VC4_MODE_RENDER,
-	VC4_MODE_SHADER,
-};
-
-struct vc4_bo_exec_state {
-	struct drm_gem_cma_object *bo;
-	enum vc4_bo_mode mode;
-};
-
 struct vc4_exec_info {
 	/* Sequence number for this bin/render job. */
 	uint64_t seqno;
@@ -47,7 +36,7 @@ struct vc4_exec_info {
 	/* This is the array of BOs that were looked up at the start of exec.
 	 * Command validation will use indices into this array.
 	 */
-	struct vc4_bo_exec_state *bo;
+	struct drm_gem_cma_object **bo;
 	uint32_t bo_count;
 
 	/* List of other BOs used in the job that need to be released
@@ -172,8 +161,7 @@ struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
 
 struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec,
-				      uint32_t hindex,
-				      enum vc4_bo_mode mode);
+				      uint32_t hindex);
 
 int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
 
diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
index a0681042155..b827eb7e9e1 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -286,7 +286,7 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	*obj = vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER);
+	*obj = vc4_use_bo(exec, surf->hindex);
 	if (!*obj)
 		return -EINVAL;
 
@@ -366,7 +366,7 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	*obj = vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER);
+	*obj = vc4_use_bo(exec, surf->hindex);
 	if (!*obj)
 		return -EINVAL;
 
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 49bb64838ea..e81dd9935ca 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -95,38 +95,32 @@ size_is_lt(uint32_t width, uint32_t height, int cpp)
 }
 
 struct drm_gem_cma_object *
-vc4_use_bo(struct vc4_exec_info *exec,
-	   uint32_t hindex,
-	   enum vc4_bo_mode mode)
+vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex)
 {
 	struct drm_gem_cma_object *obj;
+	struct drm_vc4_bo *bo;
 
 	if (hindex >= exec->bo_count) {
 		DRM_ERROR("BO index %d greater than BO count %d\n",
 			  hindex, exec->bo_count);
 		return NULL;
 	}
-	obj = exec->bo[hindex].bo;
+	obj = exec->bo[hindex];
+	bo = to_vc4_bo(&obj->base);
 
-	if (exec->bo[hindex].mode != mode) {
-		if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
-			exec->bo[hindex].mode = mode;
-		} else {
-			DRM_ERROR("BO index %d reused with mode %d vs %d\n",
-				  hindex, exec->bo[hindex].mode, mode);
-			return NULL;
-		}
+	if (bo->validated_shader) {
+		DRM_ERROR("Trying to use shader BO as something other than "
+			  "a shader\n");
+		return NULL;
 	}
 
 	return obj;
 }
 
 static struct drm_gem_cma_object *
-vc4_use_handle(struct vc4_exec_info *exec,
-	       uint32_t gem_handles_packet_index,
-	       enum vc4_bo_mode mode)
+vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index)
 {
-	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], mode);
+	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]);
 }
 
 static bool
@@ -270,7 +264,7 @@ validate_indexed_prim_list(VALIDATE_ARGS)
 	if (max_index > shader_state->max_index)
 		shader_state->max_index = max_index;
 
-	ib = vc4_use_handle(exec, 0, VC4_MODE_RENDER);
+	ib = vc4_use_handle(exec, 0);
 	if (!ib)
 		return -EINVAL;
 
@@ -588,7 +582,7 @@ reloc_tex(struct vc4_exec_info *exec,
 	uint32_t cube_map_stride = 0;
 	enum vc4_texture_data_type type;
 
-	tex = vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER);
+	tex = vc4_use_bo(exec, texture_handle_index);
 	if (!tex)
 		return false;
 
@@ -787,12 +781,17 @@ validate_gl_shader_rec(struct drm_device *dev,
 	exec->shader_rec_size -= packet_size;
 
 	for (i = 0; i < shader_reloc_count; i++) {
-		bo[i] = vc4_use_bo(exec, src_handles[i], VC4_MODE_SHADER);
+		if (src_handles[i] > exec->bo_count) {
+			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
+			return false;
+		}
+
+		bo[i] = exec->bo[src_handles[i]];
 		if (!bo[i])
 			return false;
 	}
 	for (i = shader_reloc_count; i < nr_relocs; i++) {
-		bo[i] = vc4_use_bo(exec, src_handles[i], VC4_MODE_RENDER);
+		bo[i] = vc4_use_bo(exec, src_handles[i]);
 		if (!bo[i])
 			return false;
 	}
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 4097dce28a7..7cfd236349d 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -74,7 +74,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
         struct vc4_bo **bos = vc4->bo_pointers.base;
 
         exec->bo_count = args->bo_handle_count;
-        exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state));
+        exec->bo = calloc(exec->bo_count, sizeof(void *));
         for (int i = 0; i < exec->bo_count; i++) {
                 struct vc4_bo *bo = bos[i];
                 struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
@@ -87,7 +87,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
                 vc4_bo_map(bo);
                 memcpy(obj->vaddr, bo->map, bo->size);
 
-                exec->bo[i].bo = obj;
+                exec->bo[i] = obj;
 
                 /* The kernel does this validation at shader create ioctl
                  * time.
@@ -105,7 +105,7 @@ static int
 vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
 {
         for (int i = 0; i < exec->bo_count; i++) {
-                struct drm_gem_cma_object *obj = exec->bo[i].bo;
+                struct drm_gem_cma_object *obj = exec->bo[i];
                 struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo;
 
                 memcpy(bo->map, obj->vaddr, bo->size);

From aefec4fa226d06e4b414170739be18dd24d3eed7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 19:59:45 -0700
Subject: [PATCH 0698/1208] vc4: Drop unused vpm_offset value.

It's been dead since we started doing VS/CS attr offset setup during
shader compile.
---
 src/gallium/drivers/vc4/vc4_draw.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 1c7f3b16be3..ff749fdd0d1 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -195,7 +195,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         uint32_t max_index = 0xffff;
-        uint32_t vpm_offset = 0;
         for (int i = 0; i < vtx->num_elements; i++) {
                 struct pipe_vertex_element *elem = &vtx->pipe[i];
                 struct pipe_vertex_buffer *vb =
@@ -214,8 +213,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]);
                 cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]);
 
-                vpm_offset += align(elem_size, 4);
-
                 if (vb->stride > 0) {
                         max_index = MIN2(max_index,
                                          (vb_size - elem_size) / vb->stride);

From 2e04492a142102823dfb8fc8599cfd417b84c97a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 28 Jul 2015 11:00:58 -0700
Subject: [PATCH 0699/1208] vc4: Skip re-emitting the shader_rec if it's
 unchanged.

It's a bunch of work for us to emit it (and its uniforms), more work for
the kernel to validate it, and additional work for the CLE to read
it. Improves es2gears framerate by about 50%.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/vc4_context.h  |  16 +++-
 src/gallium/drivers/vc4/vc4_draw.c     | 112 ++++++++++++++++---------
 src/gallium/drivers/vc4/vc4_program.c  |  17 +++-
 src/gallium/drivers/vc4/vc4_resource.c |   8 ++
 src/gallium/drivers/vc4/vc4_uniforms.c |  48 +++++++++++
 5 files changed, 158 insertions(+), 43 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 7faf5223630..30fb285eefe 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -67,7 +67,9 @@
 #define VC4_DIRTY_CLIP          (1 << 20)
 #define VC4_DIRTY_UNCOMPILED_VS (1 << 21)
 #define VC4_DIRTY_UNCOMPILED_FS (1 << 22)
-#define VC4_DIRTY_COMPILED_FS   (1 << 24)
+#define VC4_DIRTY_COMPILED_CS   (1 << 23)
+#define VC4_DIRTY_COMPILED_VS   (1 << 24)
+#define VC4_DIRTY_COMPILED_FS   (1 << 25)
 
 struct vc4_sampler_view {
         struct pipe_sampler_view base;
@@ -132,6 +134,12 @@ struct vc4_compiled_shader {
         struct vc4_ubo_range *ubo_ranges;
         uint32_t num_ubo_ranges;
         uint32_t ubo_size;
+        /**
+         * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the
+         * uniforms have to be rewritten (and therefore the shader state
+         * reemitted).
+         */
+        uint32_t uniform_dirty_bits;
 
         /** bitmask of which inputs are color inputs, for flat shade handling. */
         uint32_t color_inputs;
@@ -249,6 +257,11 @@ struct vc4_context {
          */
         bool draw_call_queued;
 
+        /** Maximum index buffer valid for the current shader_rec. */
+        uint32_t max_index;
+        /** Last index bias baked into the current shader_rec. */
+        uint32_t last_index_bias;
+
         struct primconvert_context *primconvert;
 
         struct hash_table *fs_cache, *vs_cache;
@@ -360,6 +373,7 @@ void vc4_simulator_init(struct vc4_screen *screen);
 int vc4_simulator_flush(struct vc4_context *vc4,
                         struct drm_vc4_submit_cl *args);
 
+void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader);
 void vc4_write_uniforms(struct vc4_context *vc4,
                         struct vc4_compiled_shader *shader,
                         struct vc4_constbuf_stateobj *cb,
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index ff749fdd0d1..22ae8f27e4a 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -122,49 +122,13 @@ vc4_update_shadow_textures(struct pipe_context *pctx,
 }
 
 static void
-vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
+vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info)
 {
-        struct vc4_context *vc4 = vc4_context(pctx);
-
-        if (info->mode >= PIPE_PRIM_QUADS) {
-                util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf);
-                util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);
-                util_primconvert_draw_vbo(vc4->primconvert, info);
-                perf_debug("Fallback conversion for %d %s vertices\n",
-                           info->count, u_prim_name(info->mode));
-                return;
-        }
-
-        /* Before setting up the draw, do any fixup blits necessary. */
-        vc4_update_shadow_textures(pctx, &vc4->verttex);
-        vc4_update_shadow_textures(pctx, &vc4->fragtex);
-
-        vc4_get_draw_cl_space(vc4);
-
+        /* VC4_DIRTY_VTXSTATE */
         struct vc4_vertex_stateobj *vtx = vc4->vtx;
+        /* VC4_DIRTY_VTXBUF */
         struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
 
-        if (vc4->prim_mode != info->mode) {
-                vc4->prim_mode = info->mode;
-                vc4->dirty |= VC4_DIRTY_PRIM_MODE;
-        }
-
-        vc4_start_draw(vc4);
-        vc4_update_compiled_shaders(vc4, info->mode);
-
-        vc4_emit_state(pctx);
-        vc4->dirty = 0;
-
-        vc4_write_uniforms(vc4, vc4->prog.fs,
-                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
-                           &vc4->fragtex);
-        vc4_write_uniforms(vc4, vc4->prog.vs,
-                           &vc4->constbuf[PIPE_SHADER_VERTEX],
-                           &vc4->verttex);
-        vc4_write_uniforms(vc4, vc4->prog.cs,
-                           &vc4->constbuf[PIPE_SHADER_VERTEX],
-                           &vc4->verttex);
-
         /* The simulator throws a fit if VS or CS don't read an attribute, so
          * we emit a dummy read.
          */
@@ -172,22 +136,27 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         /* Emit the shader record. */
         struct vc4_cl_out *shader_rec =
                 cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit);
+        /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
         cl_u16(&shader_rec,
                VC4_SHADER_FLAG_ENABLE_CLIPPING |
                ((info->mode == PIPE_PRIM_POINTS &&
                  vc4->rasterizer->base.point_size_per_vertex) ?
                 VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
+
+        /* VC4_DIRTY_COMPILED_FS */
         cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */
         cl_u8(&shader_rec, vc4->prog.fs->num_inputs);
         cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
+        /* VC4_DIRTY_COMPILED_VS */
         cl_u16(&shader_rec, 0); /* vs num uniforms */
         cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
         cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
         cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
+        /* VC4_DIRTY_COMPILED_CS */
         cl_u16(&shader_rec, 0); /* cs num uniforms */
         cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
         cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
@@ -200,6 +169,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 struct pipe_vertex_buffer *vb =
                         &vertexbuf->vb[elem->vertex_buffer_index];
                 struct vc4_resource *rsc = vc4_resource(vb->buffer);
+                /* not vc4->dirty tracked: vc4->last_index_bias */
                 uint32_t offset = (vb->buffer_offset +
                                    elem->src_offset +
                                    vb->stride * info->index_bias);
@@ -239,10 +209,72 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
          * attributes.  This field also contains the offset into shader_rec.
          */
         cl_u32(&bcl, num_elements_emit & 0x7);
+        cl_end(&vc4->bcl, bcl);
+
+        vc4_write_uniforms(vc4, vc4->prog.fs,
+                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
+                           &vc4->fragtex);
+        vc4_write_uniforms(vc4, vc4->prog.vs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           &vc4->verttex);
+        vc4_write_uniforms(vc4, vc4->prog.cs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           &vc4->verttex);
+
+        vc4->last_index_bias = info->index_bias;
+        vc4->max_index = max_index;
+}
+
+static void
+vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+
+        if (info->mode >= PIPE_PRIM_QUADS) {
+                util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf);
+                util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);
+                util_primconvert_draw_vbo(vc4->primconvert, info);
+                perf_debug("Fallback conversion for %d %s vertices\n",
+                           info->count, u_prim_name(info->mode));
+                return;
+        }
+
+        /* Before setting up the draw, do any fixup blits necessary. */
+        vc4_update_shadow_textures(pctx, &vc4->verttex);
+        vc4_update_shadow_textures(pctx, &vc4->fragtex);
+
+        vc4_get_draw_cl_space(vc4);
+
+        if (vc4->prim_mode != info->mode) {
+                vc4->prim_mode = info->mode;
+                vc4->dirty |= VC4_DIRTY_PRIM_MODE;
+        }
+
+        vc4_start_draw(vc4);
+        vc4_update_compiled_shaders(vc4, info->mode);
+
+        vc4_emit_state(pctx);
+
+        if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
+                           VC4_DIRTY_VTXSTATE |
+                           VC4_DIRTY_PRIM_MODE |
+                           VC4_DIRTY_RASTERIZER |
+                           VC4_DIRTY_COMPILED_CS |
+                           VC4_DIRTY_COMPILED_VS |
+                           VC4_DIRTY_COMPILED_FS |
+                           vc4->prog.cs->uniform_dirty_bits |
+                           vc4->prog.vs->uniform_dirty_bits |
+                           vc4->prog.fs->uniform_dirty_bits)) ||
+            vc4->last_index_bias != info->index_bias) {
+                vc4_emit_gl_shader_state(vc4, info);
+        }
+
+        vc4->dirty = 0;
 
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
          */
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         if (info->indexed) {
                 uint32_t offset = vc4->indexbuf.offset;
                 uint32_t index_size = vc4->indexbuf.index_size;
@@ -265,7 +297,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                        VC4_INDEX_BUFFER_U8));
                 cl_u32(&bcl, info->count);
                 cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset);
-                cl_u32(&bcl, max_index);
+                cl_u32(&bcl, vc4->max_index);
 
                 if (vc4->indexbuf.index_size == 4)
                         pipe_resource_reference(&prsc, NULL);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 561da1074ce..a35b50cd39b 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2205,6 +2205,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
         memcpy(uinfo->contents, c->uniform_contents,
                count * sizeof(*uinfo->contents));
         uinfo->num_texture_samples = c->num_texture_samples;
+
+        vc4_set_shader_uniform_dirty_flags(shader);
 }
 
 static struct vc4_compiled_shader *
@@ -2440,9 +2442,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
                 (prim_mode == PIPE_PRIM_POINTS &&
                  vc4->rasterizer->base.point_size_per_vertex);
 
-        vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
+        struct vc4_compiled_shader *vs =
+                vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
+        if (vs != vc4->prog.vs) {
+                vc4->prog.vs = vs;
+                vc4->dirty |= VC4_DIRTY_COMPILED_VS;
+        }
+
         key->is_coord = true;
-        vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
+        struct vc4_compiled_shader *cs =
+                vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
+        if (cs != vc4->prog.cs) {
+                vc4->prog.cs = cs;
+                vc4->dirty |= VC4_DIRTY_COMPILED_CS;
+        }
 }
 
 void
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index cab76406055..5d5166fd818 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
 
         if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
                 vc4_resource_bo_alloc(rsc);
+
+                /* If it might be bound as one of our vertex buffers, make
+                 * sure we re-emit vertex buffer state.
+                 */
+                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                        vc4->dirty |= VC4_DIRTY_VTXBUF;
         } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                 if (vc4_cl_references_bo(pctx, rsc->bo)) {
                         if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
@@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                             prsc->height0 == box->height &&
                             prsc->depth0 == box->depth) {
                                 vc4_resource_bo_alloc(rsc);
+                                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                        vc4->dirty |= VC4_DIRTY_VTXBUF;
                         } else {
                                 vc4_flush(pctx);
                         }
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 5613d6b28c0..d4c71376a55 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -284,3 +284,51 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
 
         cl_end(&vc4->uniforms, uniforms);
 }
+
+void
+vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
+{
+        uint32_t dirty = 0;
+
+        for (int i = 0; i < shader->uniforms.count; i++) {
+                switch (shader->uniforms.contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        break;
+                case QUNIFORM_UNIFORM:
+                case QUNIFORM_UBO_ADDR:
+                        dirty |= VC4_DIRTY_CONSTBUF;
+                        break;
+
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        dirty |= VC4_DIRTY_VIEWPORT;
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        dirty |= VC4_DIRTY_CLIP;
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P0:
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                case QUNIFORM_TEXTURE_CONFIG_P2:
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                        dirty |= VC4_DIRTY_TEXSTATE;
+                        break;
+
+                case QUNIFORM_BLEND_CONST_COLOR:
+                        dirty |= VC4_DIRTY_BLEND_COLOR;
+                        break;
+
+                case QUNIFORM_STENCIL:
+                case QUNIFORM_ALPHA_REF:
+                        dirty |= VC4_DIRTY_ZSA;
+                        break;
+                }
+        }
+
+        shader->uniform_dirty_bits = dirty;
+}

From b868971e786b849e70675852a0043538bcce0739 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 27 Jul 2015 13:29:20 +0300
Subject: [PATCH 0700/1208] glsl: move max_index calc to
 assign_attribute_or_color_locations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change function to get all gl_constants for inspection, this is used
by follow-up patch.

v2: rebase, update function documentation

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/linker.cpp | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 8f2c8ee9a05..c8d2e8e9571 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2313,12 +2313,10 @@ find_available_slots(unsigned used_mask, unsigned needed_count)
  * Assign locations for either VS inputs or FS outputs
  *
  * \param prog          Shader program whose variables need locations assigned
+ * \param constants     Driver specific constant values for the program.
  * \param target_index  Selector for the program target to receive location
  *                      assignmnets.  Must be either \c MESA_SHADER_VERTEX or
  *                      \c MESA_SHADER_FRAGMENT.
- * \param max_index     Maximum number of generic locations.  This corresponds
- *                      to either the maximum number of draw buffers or the
- *                      maximum number of generic attributes.
  *
  * \return
  * If locations are successfully assigned, true is returned.  Otherwise an
@@ -2326,9 +2324,17 @@ find_available_slots(unsigned used_mask, unsigned needed_count)
  */
 bool
 assign_attribute_or_color_locations(gl_shader_program *prog,
-				    unsigned target_index,
-				    unsigned max_index)
+                                    struct gl_constants *constants,
+                                    unsigned target_index)
 {
+   /* Maximum number of generic locations.  This corresponds to either the
+    * maximum number of draw buffers or the maximum number of generic
+    * attributes.
+    */
+   unsigned max_index = (target_index == MESA_SHADER_VERTEX) ?
+      constants->Program[target_index].MaxAttribs :
+      MAX2(constants->MaxDrawBuffers, constants->MaxDualSourceDrawBuffers);
+
    /* Mark invalid locations as being used.
     */
    unsigned used_locations = (max_index >= 32)
@@ -3648,12 +3654,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_VERTEX,
-                                            ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs)) {
+   if (!assign_attribute_or_color_locations(prog, &ctx->Const,
+                                            MESA_SHADER_VERTEX)) {
       goto done;
    }
 
-   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_FRAGMENT, MAX2(ctx->Const.MaxDrawBuffers, ctx->Const.MaxDualSourceDrawBuffers))) {
+   if (!assign_attribute_or_color_locations(prog, &ctx->Const,
+                                            MESA_SHADER_FRAGMENT)) {
       goto done;
    }
 

From e17056f5a20beb752a530180fce1aba0e68877b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Fri, 3 Jul 2015 10:19:23 +0300
Subject: [PATCH 0701/1208] glsl: verify location when dual source blending
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same check is made for glBindFragDataLocationIndexed but it was missing
when using layout qualifiers.

Fixes following Piglit test:
	arb_blend_func_extended-output-location

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/linker.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c8d2e8e9571..a7812116944 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2431,6 +2431,25 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
 	 }
       }
 
+      /* From GL4.5 core spec, section 15.2 (Shader Execution):
+       *
+       *     "Output binding assignments will cause LinkProgram to fail:
+       *     ...
+       *     If the program has an active output assigned to a location greater
+       *     than or equal to the value of MAX_DUAL_SOURCE_DRAW_BUFFERS and has
+       *     an active output assigned an index greater than or equal to one;"
+       */
+      if (target_index == MESA_SHADER_FRAGMENT && var->data.index >= 1 &&
+          var->data.location - generic_base >=
+          (int) constants->MaxDualSourceDrawBuffers) {
+         linker_error(prog,
+                      "output location %d >= GL_MAX_DUAL_SOURCE_DRAW_BUFFERS "
+                      "with index %u for %s\n",
+                      var->data.location - generic_base, var->data.index,
+                      var->name);
+         return false;
+      }
+
       const unsigned slots = var->type->count_attribute_slots();
 
       /* From GL4.5 core spec, section 11.1.1 (Vertex Attributes):

From e235ca159f5f6de2bd29616fdda5c02dc69b0d7f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 22 Jul 2015 20:08:23 -0700
Subject: [PATCH 0702/1208] glsl: Fix a bug where LHS swizzles of swizzles were
 too small.

A simple shader such as

   vec4 color;
   color.xy.x = 1.0;

would cause ir_assignment::set_lhs() to generate bogus IR:

   (swiz xy (swiz x (constant float (1.0))))

We were setting the number of components of each new RHS swizzle based
on the highest channel used in the LHS swizzle.  So, .xy.y would
generate (swiz xy (swiz xx ...)), while .xy.x would break.

Our existing Piglit test happened to use .xzy.z, which worked, since
'z' is the third component, resulting in an xxx swizzle.

This patch sets the number of swizzle components based on the size of
the LHS swizzle's inner value, so we always have the correct number
at each step.

Fixes new Piglit tests glsl-vs-swizzle-swizzle-lhs-[23].
Fixes ir_validate assertions in in Metro 2033 Redux.

v2: Move num_components updating completely out of update_rhs_swizzle
    (suggested by Timothy Arceri).  Simplify.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/ir.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 3a40926e911..724861b1e9f 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -63,8 +63,6 @@ update_rhs_swizzle(ir_swizzle_mask &m, unsigned from, unsigned to)
    case 3: m.w = from; break;
    default: assert(!"Should not get here.");
    }
-
-   m.num_components = MAX2(m.num_components, (to + 1));
 }
 
 void
@@ -95,6 +93,7 @@ ir_assignment::set_lhs(ir_rvalue *lhs)
 
 	 write_mask |= (((this->write_mask >> i) & 1) << c);
 	 update_rhs_swizzle(rhs_swiz, i, c);
+         rhs_swiz.num_components = swiz->val->type->vector_elements;
       }
 
       this->write_mask = write_mask;
@@ -114,6 +113,7 @@ ir_assignment::set_lhs(ir_rvalue *lhs)
 	 if (write_mask & (1 << i))
 	    update_rhs_swizzle(rhs_swiz, i, rhs_chan++);
       }
+      rhs_swiz.num_components = rhs_chan;
       this->rhs = new(mem_ctx) ir_swizzle(this->rhs, rhs_swiz);
    }
 

From 055e3a3f87d8be5374902d2ae6fecb0eb5c66714 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 28 Jul 2015 18:45:32 -0700
Subject: [PATCH 0703/1208] i965: Use real stage in "Unsupported form of
 variable indexing" warning.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Other stages can be miserably slow too!

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 703e9aa913e..4186432ae41 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -290,8 +290,9 @@ process_glsl_ir(gl_shader_stage stage,
                                           options->EmitNoIndirectUniform);
 
    if (unlikely(brw->perf_debug && lowered_variable_indexing)) {
-      perf_debug("Unsupported form of variable indexing in FS; falling "
-                 "back to very inefficient code generation\n");
+      perf_debug("Unsupported form of variable indexing in %s; falling "
+                 "back to very inefficient code generation\n",
+                 _mesa_shader_stage_to_abbrev(shader->Stage));
    }
 
    lower_ubo_reference(shader, shader->ir);

From c00d093c8f247c41f9122143c49ffa93865a0ded Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 10 Feb 2015 16:40:40 +0100
Subject: [PATCH 0704/1208] mesa: Return INVALID_ENUM in glClearBufferiv() when
 buffer is not color or stencil

Page 497 of the PDF, section '17.4.3.1 Clearing Individual Buffers' of the
OpenGL 4.5 spec states:

    "An INVALID_ENUM error is generated by ClearBufferiv and
     ClearNamedFramebufferiv if buffer is not COLOR or STENCIL."

Fixes 1 dEQP test:
* dEQP-GLES3.functional.negative_api.buffer.clear_bufferiv

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/clear.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index 8284dcab528..3bfcc5c0e39 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -325,6 +325,18 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
       _mesa_update_state( ctx );
    }
 
+   /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
+    * of the OpenGL 4.5 spec states:
+    *
+    *    "An INVALID_ENUM error is generated by ClearBufferiv and
+    *     ClearNamedFramebufferiv if buffer is not COLOR or STENCIL."
+    */
+   if (buffer == GL_DEPTH || buffer == GL_DEPTH_STENCIL) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glClearBufferiv(buffer=GL_DEPTH || GL_DEPTH_STENCIL)");
+      return;
+   }
+
    switch (buffer) {
    case GL_STENCIL:
       /* Page 264 (page 280 of the PDF) of the OpenGL 3.0 spec says:

From d1bb3b4910e6c02344550b8982aa8442cd7efd29 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Thu, 23 Jul 2015 16:38:32 +0200
Subject: [PATCH 0705/1208] mesa/es3.1: Add ES 3.1 handling to get.c and
 get_hash_generator.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@linux.intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/get.c                 |  5 ++++-
 src/mesa/main/get_hash_generator.py | 12 ++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index ce786915508..ec7eb718ef8 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -1232,10 +1232,13 @@ find_value(const char *func, GLenum pname, void **p, union value *v)
     * value since it's compatible with GLES2 its entry in table_set[] is at the
     * end.
     */
-   STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 2);
+   STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 3);
    if (_mesa_is_gles3(ctx)) {
       api = API_OPENGL_LAST + 1;
    }
+   if (_mesa_is_gles31(ctx)) {
+      api = API_OPENGL_LAST + 2;
+   }
    mask = ARRAY_SIZE(table(api)) - 1;
    hash = (pname * prime_factor);
    while (1) {
diff --git a/src/mesa/main/get_hash_generator.py b/src/mesa/main/get_hash_generator.py
index b200d197341..c777b782442 100644
--- a/src/mesa/main/get_hash_generator.py
+++ b/src/mesa/main/get_hash_generator.py
@@ -44,7 +44,7 @@ prime_factor = 89
 prime_step = 281
 hash_table_size = 1024
 
-gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3"])
+gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3", "GLES31"])
 
 def print_header():
    print "typedef const unsigned short table_t[%d];\n" % (hash_table_size)
@@ -68,6 +68,7 @@ api_enum = [
    'GLES2',
    'GL_CORE',
    'GLES3', # Not in gl_api enum in mtypes.h
+   'GLES31', # Not in gl_api enum in mtypes.h
 ]
 
 def api_index(api):
@@ -167,10 +168,13 @@ def generate_hash_tables(enum_list, enabled_apis, param_descriptors):
 
          for api in valid_apis:
             add_to_hash_table(tables[api], hash_val, len(params))
-            # Also add GLES2 items to the GLES3 hash table
+            # Also add GLES2 items to the GLES3 and GLES31 hash table
             if api == "GLES2":
                add_to_hash_table(tables["GLES3"], hash_val, len(params))
-
+               add_to_hash_table(tables["GLES31"], hash_val, len(params))
+            # Also add GLES3 items to the GLES31 hash table
+            if api == "GLES3":
+               add_to_hash_table(tables["GLES31"], hash_val, len(params))
          params.append(["GL_" + enum_name, param[1]])
 
    sorted_tables={}
@@ -206,7 +210,7 @@ if __name__ == '__main__':
       die("missing descriptor file (-f)\n")
 
    # generate the code for all APIs
-   enabled_apis = set(["GLES", "GLES2", "GLES3", "GL", "GL_CORE"])
+   enabled_apis = set(["GLES", "GLES2", "GLES3", "GLES31", "GL", "GL_CORE"])
 
    try:
       api_desc = gl_XML.parse_GL_API(api_desc_file)

From 9ec50dc6bb192818dde221e561fb6be6c4bd417b Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 27 Jul 2015 15:22:49 +0200
Subject: [PATCH 0706/1208] mesa/es3.1: enable GL_ARB_shader_image_load_store
 for GLES 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/get.c              |  6 ++++++
 src/mesa/main/get_hash_params.py | 17 +++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index ec7eb718ef8..dc0493049b6 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -367,6 +367,12 @@ static const int extra_ARB_draw_indirect_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_shader_image_load_store_es31[] = {
+   EXT(ARB_shader_image_load_store),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 4137e7f33c5..34f95d67769 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -407,6 +407,15 @@ descriptor=[
   [ "TEXTURE_EXTERNAL_OES", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_OES_EGL_image_external" ],
 ]},
 
+# Enums in OpenGL and ES 3.1
+{ "apis": ["GL", "GL_CORE", "GLES31"], "params": [
+# GL_ARB_shader_image_load_store / GLES 3.1
+  [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+]},
+
 # Enums in OpenGL Core profile and ES 3.1
 { "apis": ["GL_CORE", "GLES3"], "params": [
 # GL_ARB_draw_indirect / GLES 3.1
@@ -779,13 +788,9 @@ descriptor=[
   [ "MAX_VERTEX_ATTRIB_BINDINGS", "CONTEXT_ENUM(Const.MaxVertexAttribBindings), NO_EXTRA" ],
 
 # GL_ARB_shader_image_load_store
-  [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store"],
-  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store"],
-  [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store"],
-  [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store"],
+  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store" ],
+  [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ],
   [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"],
-  [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store"],
-  [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store"],
 
 # GL_ARB_compute_shader
   [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader" ],

From cd14fcbca0a1dcecfdbee97a3524123ba87f901d Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Thu, 23 Jul 2015 16:38:34 +0200
Subject: [PATCH 0707/1208] mesa/es3.1: enable GL_ARB_shader_atomic_counters
 for GLES 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/get.c              |  6 ++++++
 src/mesa/main/get_hash_params.py | 20 +++++++++++---------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index dc0493049b6..39fe725041f 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -373,6 +373,12 @@ static const int extra_ARB_shader_image_load_store_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_shader_atomic_counters_es31[] = {
+   EXT(ARB_shader_atomic_counters),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 34f95d67769..1bafc32f858 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -414,6 +414,17 @@ descriptor=[
   [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
   [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
   [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+
+# GL_ARB_shader_atomic_counters / GLES 3.1
+  [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -771,17 +782,8 @@ descriptor=[
   [ "PROGRAM_PIPELINE_BINDING", "LOC_CUSTOM, TYPE_INT, GL_PROGRAM_PIPELINE_BINDING, NO_EXTRA" ],
 
 # GL_ARB_shader_atomic_counters
-  [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters" ],
-  [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ],
   [ "MAX_GEOMETRY_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
   [ "MAX_GEOMETRY_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
-  [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters" ],
 
 # GL_ARB_vertex_attrib_binding
   [ "MAX_VERTEX_ATTRIB_RELATIVE_OFFSET", "CONTEXT_ENUM(Const.MaxVertexAttribRelativeOffset), NO_EXTRA" ],

From c561b2faa80d07eedfe201ffdbb3f7746e33a049 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 27 Jul 2015 15:22:50 +0200
Subject: [PATCH 0708/1208] mesa/es3.1: enable GL_ARB_texture_multisample for
 GLES 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/get.c              |  6 ++++++
 src/mesa/main/get_hash_params.py | 14 ++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 39fe725041f..60c1b1bc0a1 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -379,6 +379,12 @@ static const int extra_ARB_shader_atomic_counters_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_texture_multisample_es31[] = {
+   EXT(ARB_texture_multisample),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 1bafc32f858..2040a01ae0b 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -425,6 +425,14 @@ descriptor=[
   [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
   [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
   [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+
+# GL_ARB_texture_multisample / GLES 3.1
+  [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample_es31" ],
+  [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample_es31" ],
+  [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample_es31" ],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -717,13 +725,7 @@ descriptor=[
   [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
 
 # GL_ARB_texture_multisample / GL 3.2
-  [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample" ],
   [ "TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX, extra_ARB_texture_multisample" ],
-  [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample" ],
-  [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample" ],
-  [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample" ],
-  [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample" ],
-  [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample" ],
 
 # GL 3.0
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],

From 49021e5058130db299ac6843e34c5f5c53e565ad Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 27 Jul 2015 15:22:51 +0200
Subject: [PATCH 0709/1208] mesa/es3.1: enable GL_ARB_texture_gather for GLES
 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/get.c              | 6 ++++++
 src/mesa/main/get_hash_params.py | 6 ++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 60c1b1bc0a1..a443493a33f 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -385,6 +385,12 @@ static const int extra_ARB_texture_multisample_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_texture_gather_es31[] = {
+   EXT(ARB_texture_gather),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 2040a01ae0b..94d856a466b 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -433,6 +433,10 @@ descriptor=[
   [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample_es31" ],
   [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample_es31" ],
   [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample_es31" ],
+
+# GL_ARB_texture_gather / GLES 3.1
+  [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
+  [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -776,8 +780,6 @@ descriptor=[
   [ "TEXTURE_BINDING_CUBE_MAP_ARRAY_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_CUBE_ARRAY_INDEX, extra_ARB_texture_cube_map_array" ],
 
 # GL_ARB_texture_gather
-  [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather"],
-  [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather"],
   [ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"],
 
 # GL_ARB_separate_shader_objects

From 49db765debf9d1a810810935fafc3eef229e1511 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Wed, 29 Jul 2015 10:10:40 +0300
Subject: [PATCH 0710/1208] mesa/es3.1: enable GL_ARB_compute_shader for GLES
 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/get.c              |  6 ++++++
 src/mesa/main/get_hash_params.py | 20 ++++++++++----------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index a443493a33f..072c1a5a49b 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -391,6 +391,12 @@ static const int extra_ARB_texture_gather_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_compute_shader_es31[] = {
+   EXT(ARB_compute_shader),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 94d856a466b..cbcf636ebf4 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -437,6 +437,16 @@ descriptor=[
 # GL_ARB_texture_gather / GLES 3.1
   [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
   [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
+
+# GL_ARB_compute_shader / GLES 3.1
+  [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader_es31" ],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -798,16 +808,6 @@ descriptor=[
   [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ],
   [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"],
 
-# GL_ARB_compute_shader
-  [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader" ],
-
 # GL_ARB_framebuffer_no_attachments
   ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"],
   ["MAX_FRAMEBUFFER_HEIGHT", "CONTEXT_INT(Const.MaxFramebufferHeight), extra_ARB_framebuffer_no_attachments"],

From cb1cfb710c5a30f2e9b9ea1bca9d7ae0f23bcdfc Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Thu, 23 Jul 2015 16:38:38 +0200
Subject: [PATCH 0711/1208] mesa/es3.1: enable GL_ARB_explicit_uniform_location
 for GLES 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/main/get.c              | 6 ++++++
 src/mesa/main/get_hash_params.py | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 072c1a5a49b..5b308e80668 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -397,6 +397,12 @@ static const int extra_ARB_compute_shader_es31[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_explicit_uniform_location_es31[] = {
+   EXT(ARB_explicit_uniform_location),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index cbcf636ebf4..7dc92f10100 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -447,6 +447,9 @@ descriptor=[
   [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader_es31" ],
   [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader_es31" ],
   [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader_es31" ],
+
+# GL_ARB_explicit_uniform_location / GLES 3.1
+  [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location_es31" ],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -538,7 +541,6 @@ descriptor=[
   [ "MAX_LIST_NESTING", "CONST(MAX_LIST_NESTING), NO_EXTRA" ],
   [ "MAX_NAME_STACK_DEPTH", "CONST(MAX_NAME_STACK_DEPTH), NO_EXTRA" ],
   [ "MAX_PIXEL_MAP_TABLE", "CONST(MAX_PIXEL_MAP_TABLE), NO_EXTRA" ],
-  [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location" ],
   [ "NAME_STACK_DEPTH", "CONTEXT_INT(Select.NameStackDepth), NO_EXTRA" ],
   [ "PACK_LSB_FIRST", "CONTEXT_BOOL(Pack.LsbFirst), NO_EXTRA" ],
   [ "PACK_SWAP_BYTES", "CONTEXT_BOOL(Pack.SwapBytes), NO_EXTRA" ],

From 19d88e3f9f621643ba0524ff37e9a33272353941 Mon Sep 17 00:00:00 2001
From: Fabio Pedretti <fabio.ped@libero.it>
Date: Tue, 28 Jul 2015 20:53:25 +0200
Subject: [PATCH 0712/1208] docs: consolidate nvc0 status

---
 docs/GL3.txt | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index d438403b54a..56753fddf4d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -92,11 +92,11 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
   GL_ARB_vertex_type_2_10_10_10_rev                     DONE ()
 
 
-GL 4.0, GLSL 4.00:
+GL 4.0, GLSL 4.00 --- all DONE: nvc0
 
-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
+  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                 DONE (i965, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_gpu_shader5                                   DONE (i965)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600, radeonsi, softpipe)
   - Dynamically uniform UBO array indices              DONE (r600, radeonsi)
@@ -109,26 +109,26 @@ GL 4.0, GLSL 4.00:
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
   - Interpolation functions                            DONE (r600, radeonsi)
   - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (nvc0, radeonsi, llvmpipe, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_subroutine                             DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (nvc0, radeonsi)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_gpu_shader_fp64                               DONE (radeonsi, llvmpipe, softpipe)
+  GL_ARB_sample_shading                                DONE (i965, nv50, r600, radeonsi)
+  GL_ARB_shader_subroutine                             DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                           DONE (radeonsi)
+  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                             DONE (i965, nv50, r600, radeonsi)
+  GL_ARB_transform_feedback2                           DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                           DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
 
 
-GL 4.1, GLSL 4.10:
+GL 4.1, GLSL 4.10 --- all DONE: nvc0
 
-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                             DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
-  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, radeonsi, llvmpipe, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_vertex_attrib_64bit                           DONE (radeonsi, llvmpipe, softpipe)
+  GL_ARB_viewport_array                                DONE (i965, nv50, r600, radeonsi, llvmpipe)
 
 
 GL 4.2, GLSL 4.20:

From 3f0e7c28fe5252f0613b548efd1cbf8e4bc0eb9a Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sun, 26 Jul 2015 01:27:17 +0100
Subject: [PATCH 0713/1208] radeon: move streamout buffer config to streamout
 enable function. (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will be used here later.

v2: update atom sizes
add check for old vs new enabled mask

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.h |  3 +++
 src/gallium/drivers/radeon/r600_streamout.c   | 21 +++++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index d225f2528ab..74d747f730b 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -329,6 +329,9 @@ struct r600_streamout {
 	 * it must be set explicitly when binding a shader. */
 	unsigned			*stride_in_dw;
 
+	/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
+	unsigned			hw_enabled_mask;
+
 	/* The state of VGT_STRMOUT_(CONFIG|EN). */
 	struct r600_atom		enable_atom;
 	bool				streamout_enabled;
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index bc8bf97ef89..815b641579c 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -88,8 +88,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx)
 		12 + /* flush_vgt_streamout */
 		num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */
 
-	begin->num_dw = 12 + /* flush_vgt_streamout */
-			3; /* VGT_STRMOUT_BUFFER_CONFIG */
+	begin->num_dw = 12; /* flush_vgt_streamout */
 
 	if (rctx->chip_class >= SI) {
 		begin->num_dw += num_bufs * 4; /* SET_CONTEXT_REG */
@@ -192,11 +191,6 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 
 	r600_flush_vgt_streamout(rctx);
 
-	r600_write_context_reg(cs, rctx->chip_class >= EVERGREEN ?
-				       R_028B98_VGT_STRMOUT_BUFFER_CONFIG :
-				       R_028B20_VGT_STRMOUT_BUFFER_EN,
-			       rctx->streamout.enabled_mask);
-
 	for (i = 0; i < rctx->streamout.num_targets; i++) {
 		if (!t[i])
 			continue;
@@ -326,6 +320,12 @@ static bool r600_get_strmout_en(struct r600_common_context *rctx)
 static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 				       struct r600_atom *atom)
 {
+	r600_write_context_reg(rctx->rings.gfx.cs,
+			       rctx->chip_class >= EVERGREEN ?
+				       R_028B98_VGT_STRMOUT_BUFFER_CONFIG :
+				       R_028B20_VGT_STRMOUT_BUFFER_EN,
+			       rctx->streamout.hw_enabled_mask);
+
 	r600_write_context_reg(rctx->rings.gfx.cs,
 			       rctx->chip_class >= EVERGREEN ?
 				       R_028B94_VGT_STRMOUT_CONFIG :
@@ -336,9 +336,12 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
 {
 	bool old_strmout_en = r600_get_strmout_en(rctx);
+	unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask;
 
 	rctx->streamout.streamout_enabled = enable;
-	if (old_strmout_en != r600_get_strmout_en(rctx))
+	rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask;
+	if ((old_strmout_en != r600_get_strmout_en(rctx)) ||
+            (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask))
 		rctx->streamout.enable_atom.dirty = true;
 }
 
@@ -365,5 +368,5 @@ void r600_streamout_init(struct r600_common_context *rctx)
 	rctx->b.stream_output_target_destroy = r600_so_target_destroy;
 	rctx->streamout.begin_atom.emit = r600_emit_streamout_begin;
 	rctx->streamout.enable_atom.emit = r600_emit_streamout_enable;
-	rctx->streamout.enable_atom.num_dw = 3;
+	rctx->streamout.enable_atom.num_dw = 6;
 }

From 2294ba9565fbae49f1fc77ca171e9d6aafa34005 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 9 Jul 2015 16:34:59 +1000
Subject: [PATCH 0714/1208] radeon: add support for streams to the common
 streamout code. (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds to the common radeon streamout code, support
for multiple streams.

It updates radeonsi/r600 to set the enabled mask up.

v2: update for changes in previous patch.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_shader.c        |  7 ++++
 src/gallium/drivers/r600/r600_shader.h        |  1 +
 src/gallium/drivers/r600/r600_state_common.c  |  2 ++
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeon/r600_streamout.c   | 34 +++++++++++++------
 .../drivers/radeonsi/si_state_shaders.c       | 18 ++++++++--
 6 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 1a72bf6e77e..dda38f6f905 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -310,6 +310,7 @@ struct r600_shader_ctx {
 	int					gs_next_vertex;
 	struct r600_shader	*gs_for_vs;
 	int					gs_export_gpr_treg;
+	unsigned				enabled_stream_buffers_mask;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -1402,6 +1403,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
 		 * with MEM_STREAM instructions */
 		output.array_size = 0xFFF;
 		output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
+
+		ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
+
 		if (ctx->bc->chip_class >= EVERGREEN) {
 			switch (so->output[i].output_buffer) {
 			case 0:
@@ -1718,6 +1722,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 	gs->gs_copy_shader = cshader;
 
 	ctx.bc->nstack = 1;
+
+	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
 	cshader->shader.ring_item_size = ocnt * 16;
 
 	return r600_bytecode_build(ctx.bc);
@@ -2261,6 +2267,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	    so.num_outputs && !use_llvm)
 		emit_streamout(&ctx, &so);
 
+	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
 	convert_edgeflag_to_int(&ctx);
 
 	if (ring_outputs) {
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index dd359d7e959..5d05c8153d7 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -125,6 +125,7 @@ struct r600_pipe_shader {
 	struct r600_shader_key	key;
 	unsigned		db_shader_control;
 	unsigned		ps_depth_export;
+	unsigned		enabled_stream_buffers_mask;
 };
 
 /* return the table index 0-5 for TGSI_INTERPOLATE_LINEAR/PERSPECTIVE and
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 0c78b50aa0c..455e59aef6c 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1208,6 +1208,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space;
 				rctx->clip_misc_state.atom.dirty = true;
 			}
+			rctx->b.streamout.enabled_stream_buffers_mask = rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask;
 		}
 
 		r600_shader_select(ctx, rctx->vs_shader, &vs_dirty);
@@ -1242,6 +1243,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.clip_disable = rctx->vs_shader->current->shader.vs_position_window_space;
 				rctx->clip_misc_state.atom.dirty = true;
 			}
+			rctx->b.streamout.enabled_stream_buffers_mask = rctx->vs_shader->current->enabled_stream_buffers_mask;
 		}
 	}
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 74d747f730b..fcb758b2086 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -328,6 +328,7 @@ struct r600_streamout {
 	/* External state which comes from the vertex shader,
 	 * it must be set explicitly when binding a shader. */
 	unsigned			*stride_in_dw;
+	unsigned			enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
 
 	/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
 	unsigned			hw_enabled_mask;
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index 815b641579c..026fb30f749 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -320,17 +320,24 @@ static bool r600_get_strmout_en(struct r600_common_context *rctx)
 static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 				       struct r600_atom *atom)
 {
-	r600_write_context_reg(rctx->rings.gfx.cs,
-			       rctx->chip_class >= EVERGREEN ?
-				       R_028B98_VGT_STRMOUT_BUFFER_CONFIG :
-				       R_028B20_VGT_STRMOUT_BUFFER_EN,
-			       rctx->streamout.hw_enabled_mask);
+	unsigned strmout_config_reg = R_028AB0_VGT_STRMOUT_EN;
+	unsigned strmout_config_val = S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx));
+	unsigned strmout_buffer_reg = R_028B20_VGT_STRMOUT_BUFFER_EN;
+	unsigned strmout_buffer_val = rctx->streamout.hw_enabled_mask &
+				      rctx->streamout.enabled_stream_buffers_mask;
 
-	r600_write_context_reg(rctx->rings.gfx.cs,
-			       rctx->chip_class >= EVERGREEN ?
-				       R_028B94_VGT_STRMOUT_CONFIG :
-				       R_028AB0_VGT_STRMOUT_EN,
-			       S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)));
+	if (rctx->chip_class >= EVERGREEN) {
+		strmout_buffer_reg = R_028B98_VGT_STRMOUT_BUFFER_CONFIG;
+
+		strmout_config_reg = R_028B94_VGT_STRMOUT_CONFIG;
+		strmout_config_val |=
+			S_028B94_RAST_STREAM(0) |
+			S_028B94_STREAMOUT_1_EN(r600_get_strmout_en(rctx)) |
+			S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
+			S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
+	}
+	r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val);
+	r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val);
 }
 
 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
@@ -339,7 +346,12 @@ static void r600_set_streamout_enable(struct r600_common_context *rctx, bool ena
 	unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask;
 
 	rctx->streamout.streamout_enabled = enable;
-	rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask;
+
+	rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask |
+					  (rctx->streamout.enabled_mask << 4) |
+					  (rctx->streamout.enabled_mask << 8) |
+					  (rctx->streamout.enabled_mask << 12);
+
 	if ((old_strmout_en != r600_get_strmout_en(rctx)) ||
             (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask))
 		rctx->streamout.enable_atom.dirty = true;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 24afed06fce..18bddfda265 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1245,6 +1245,18 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 	si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
 }
 
+static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader)
+{
+	struct pipe_stream_output_info *so = &shader->so;
+	uint32_t enabled_stream_buffers_mask = 0;
+	int i;
+
+	for (i = 0; i < so->num_outputs; i++)
+		enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
+	sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
+	sctx->b.streamout.stride_in_dw = shader->so.stride;
+}
+
 void si_update_shaders(struct si_context *sctx)
 {
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
@@ -1277,7 +1289,7 @@ void si_update_shaders(struct si_context *sctx)
 		} else {
 			/* TES as VS */
 			si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
-			sctx->b.streamout.stride_in_dw = sctx->tes_shader->so.stride;
+			si_update_so(sctx, sctx->tes_shader);
 		}
 	} else if (sctx->gs_shader) {
 		/* VS as ES */
@@ -1287,7 +1299,7 @@ void si_update_shaders(struct si_context *sctx)
 		/* VS as VS */
 		si_shader_select(ctx, sctx->vs_shader);
 		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
-		sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride;
+		si_update_so(sctx, sctx->vs_shader);
 	}
 
 	/* Update GS. */
@@ -1295,7 +1307,7 @@ void si_update_shaders(struct si_context *sctx)
 		si_shader_select(ctx, sctx->gs_shader);
 		si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4);
-		sctx->b.streamout.stride_in_dw = sctx->gs_shader->so.stride;
+		si_update_so(sctx, sctx->gs_shader);
 
 		if (!sctx->gs_rings)
 			si_init_gs_rings(sctx);

From d0a42b457fb905ce2cc12bb05110ef63656221c9 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 28 Jul 2015 11:25:59 +0300
Subject: [PATCH 0715/1208] i965/fs: Detect multi-register MOVs correctly in
 register_coalesce.

register_coalesce() was considering the exec_size of the MOV
instruction alone to decide whether the register at offset+1 of the
source VGRF was being copied to inst->dst.reg_offset+1 of the
destination VGRF, which is only a valid assumption if the move has a
32-bit execution type.  Use regs_read() instead to find out the number
of registers copied by the instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index a253c81c845..4f00b7f146f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -195,7 +195,7 @@ fs_visitor::register_coalesce()
             continue;
          }
          reg_to_offset[offset] = inst->dst.reg_offset;
-         if (inst->exec_size == 16)
+         if (inst->regs_written > 1)
             reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
          channels_remaining -= inst->regs_written;

From 170200e0fcb0b16d20bff86e1258e0a1b2034c10 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 14:20:32 +0300
Subject: [PATCH 0716/1208] i965/fs: Fix rewrite of the second half of 16-wide
 coalesced registers.

The register coalesce pass wasn't rewriting the destination and
sources of instructions that accessed the second half of a coalesced
register previously copied with a 16-wide MOV instruction.  E.g.:

| ADD (16) vgrf0:f, vgrf0:f, 1.0:f
| MOV (16) vgrf1:f, vgrf0:f
| MOV (8)  vgrf2:f, vgrf0+1:f { sechalf }

would get incorrectly register-coalesced into:

| ADD (16) vgrf1:f, vgrf1:f, 1.0:f
| MOV (8)  vgrf2:f, vgrf0+1:f { sechalf }

The reason is that the mov[i] pointer was being left equal to NULL for
every other register.  The fact that we've made it to the rewrite loop
implies that the whole register will be coalesced, so it doesn't seem
right not to update something that uses it depending on whether mov[i]
is NULL or not.  Fixes an amount of texturing and image_load_store
piglit tests on my SIMD-lowering branch.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_register_coalesce.cpp     | 27 +++++++++----------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 4f00b7f146f..20a54800099 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -228,7 +228,6 @@ fs_visitor::register_coalesce()
          continue;
 
       progress = true;
-      bool was_load_payload = inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD;
 
       for (int i = 0; i < src_size; i++) {
          if (mov[i]) {
@@ -243,20 +242,18 @@ fs_visitor::register_coalesce()
 
       foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
          for (int i = 0; i < src_size; i++) {
-            if (mov[i] || was_load_payload) {
-               if (scan_inst->dst.file == GRF &&
-                   scan_inst->dst.reg == reg_from &&
-                   scan_inst->dst.reg_offset == i) {
-                  scan_inst->dst.reg = reg_to;
-                  scan_inst->dst.reg_offset = reg_to_offset[i];
-               }
-               for (int j = 0; j < scan_inst->sources; j++) {
-                  if (scan_inst->src[j].file == GRF &&
-                      scan_inst->src[j].reg == reg_from &&
-                      scan_inst->src[j].reg_offset == i) {
-                     scan_inst->src[j].reg = reg_to;
-                     scan_inst->src[j].reg_offset = reg_to_offset[i];
-                  }
+            if (scan_inst->dst.file == GRF &&
+                scan_inst->dst.reg == reg_from &&
+                scan_inst->dst.reg_offset == i) {
+               scan_inst->dst.reg = reg_to;
+               scan_inst->dst.reg_offset = reg_to_offset[i];
+            }
+            for (int j = 0; j < scan_inst->sources; j++) {
+               if (scan_inst->src[j].file == GRF &&
+                   scan_inst->src[j].reg == reg_from &&
+                   scan_inst->src[j].reg_offset == i) {
+                  scan_inst->src[j].reg = reg_to;
+                  scan_inst->src[j].reg_offset = reg_to_offset[i];
                }
             }
          }

From 24d74b66883da1955f8c2223367d41470d99df6d Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 28 Jul 2015 12:07:56 +0300
Subject: [PATCH 0717/1208] i965/fs: Simplify instruction rewrite loop in the
 register coalesce pass.

For some reason the loop that rewrites all occurrences of the
coalesced register was iterating over all possible offsets until it
would find one that compares equal to the offset of a source or
destination of any instruction in the program.  Since the mapping
between old and new offsets is already available in the regs_to_offset
array and we know that the whole register has been coalesced we can
just look it up.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_register_coalesce.cpp     | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 20a54800099..72e873857ce 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -241,20 +241,19 @@ fs_visitor::register_coalesce()
       }
 
       foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
-         for (int i = 0; i < src_size; i++) {
-            if (scan_inst->dst.file == GRF &&
-                scan_inst->dst.reg == reg_from &&
-                scan_inst->dst.reg_offset == i) {
-               scan_inst->dst.reg = reg_to;
-               scan_inst->dst.reg_offset = reg_to_offset[i];
-            }
-            for (int j = 0; j < scan_inst->sources; j++) {
-               if (scan_inst->src[j].file == GRF &&
-                   scan_inst->src[j].reg == reg_from &&
-                   scan_inst->src[j].reg_offset == i) {
-                  scan_inst->src[j].reg = reg_to;
-                  scan_inst->src[j].reg_offset = reg_to_offset[i];
-               }
+         if (scan_inst->dst.file == GRF &&
+             scan_inst->dst.reg == reg_from) {
+            scan_inst->dst.reg = reg_to;
+            scan_inst->dst.reg_offset =
+               reg_to_offset[scan_inst->dst.reg_offset];
+         }
+
+         for (int j = 0; j < scan_inst->sources; j++) {
+            if (scan_inst->src[j].file == GRF &&
+                scan_inst->src[j].reg == reg_from) {
+               scan_inst->src[j].reg = reg_to;
+               scan_inst->src[j].reg_offset =
+                  reg_to_offset[scan_inst->src[j].reg_offset];
             }
          }
       }

From fb7eba97d7235d49ac712a21fb51009c86f3bc64 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 21 Jul 2015 17:28:39 +0300
Subject: [PATCH 0718/1208] i965/fs: Factor out source components calculation
 to a separate method.

This cleans up fs_inst::regs_read() slightly by disentangling the
calculation of "components" from the handling of message payload
arguments.  This will also simplify the SIMD lowering and logical send
message lowering passes, because it will avoid expressions like
'regs_read * REG_SIZE / component_size' which are not only ugly, they
may be inaccurate because regs_read rounds up the result to the
closest register multiple so they could give incorrect results when
the component size is lower than one register (e.g. uniforms).  This
didn't seem to be a problem right now because all such expressions
happen to be dealing with per-channel GRFs only currently, but that's
by no means obvious so better be safe than sorry.

v2: Split PIXEL_X/Y and LINTERP into separate case blocks.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp  | 33 +++++++++++++++++++--------
 src/mesa/drivers/dri/i965/brw_ir_fs.h |  1 +
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7f25a21f0d1..7d9b565d7f3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -663,10 +663,29 @@ fs_inst::is_partial_write() const
            !this->dst.is_contiguous());
 }
 
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+   switch (opcode) {
+   case FS_OPCODE_LINTERP:
+      if (i == 0)
+         return 2;
+      else
+         return 1;
+
+   case FS_OPCODE_PIXEL_X:
+   case FS_OPCODE_PIXEL_Y:
+      assert(i == 0);
+      return 2;
+
+   default:
+      return 1;
+   }
+}
+
 int
 fs_inst::regs_read(int arg) const
 {
-   unsigned components = 1;
    switch (opcode) {
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
@@ -688,15 +707,8 @@ fs_inst::regs_read(int arg) const
       break;
 
    case FS_OPCODE_LINTERP:
-      if (arg == 0)
-         return exec_size / 4;
-      else
+      if (arg == 1)
          return 1;
-
-   case FS_OPCODE_PIXEL_X:
-   case FS_OPCODE_PIXEL_Y:
-      if (arg == 0)
-         components = 2;
       break;
 
    case SHADER_OPCODE_LOAD_PAYLOAD:
@@ -720,7 +732,8 @@ fs_inst::regs_read(int arg) const
       return 1;
    case GRF:
    case HW_REG:
-      return DIV_ROUND_UP(components * src[arg].component_size(exec_size),
+      return DIV_ROUND_UP(components_read(arg) *
+                          src[arg].component_size(exec_size),
                           REG_SIZE);
    case MRF:
       unreachable("MRF registers are not allowed as sources");
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 693357f2796..97c6f8b2500 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -201,6 +201,7 @@ public:
    bool is_send_from_grf() const;
    bool is_partial_write() const;
    bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
+   unsigned components_read(unsigned i) const;
    int regs_read(int arg) const;
    bool can_do_source_mods(const struct brw_device_info *devinfo);
    bool has_side_effects() const;

From 1dd3543ac1bebe089bfe3a8ae5efbe3f564e1144 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 17:44:58 +0300
Subject: [PATCH 0719/1208] i965/fs: Add stub lowering pass for logical
 send-message opcodes.

This pass will house ad-hoc lowering code for several send
message-like virtual opcodes that will represent their logically
independent arguments as separate instruction sources rather than as a
single payload blob.  This pass will basically just take the separate
arguments that are supposed to be part of the payload and concatenate
them to construct a message in the form required by the hardware.
Virtual instructions in separate-source form will eventually allow
some simplification of the visitor code and make several
transformations easier like lowering SIMD16 instructions to SIMD8
algorithmically in cases where the hardware doesn't support the former
natively.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 29 +++++++++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_fs.h   |  1 +
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7d9b565d7f3..9d702a6c4b2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3198,6 +3198,30 @@ fs_visitor::lower_integer_multiplication()
    return progress;
 }
 
+bool
+fs_visitor::lower_logical_sends()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
+                                 .group(inst->exec_size, inst->force_sechalf)
+                                 .at(block, inst);
+
+      switch (inst->opcode) {
+      default:
+         continue;
+      }
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3645,9 +3669,12 @@ fs_visitor::optimize()
       backend_shader::dump_instructions(filename);
    }
 
-   bool progress;
+   bool progress = false;
    int iteration = 0;
    int pass_num = 0;
+
+   OPT(lower_logical_sends);
+
    do {
       progress = false;
       pass_num = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 283ee361d17..73559f4f15e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -182,6 +182,7 @@ public:
    void no16(const char *msg);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
+   bool lower_logical_sends();
    bool lower_integer_multiplication();
    bool opt_combine_constants();
 

From 5a5607a16ce7bf5eace2cf4b267af304aef05e90 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 14 Jul 2015 19:32:03 +0300
Subject: [PATCH 0720/1208] i965/fs: Add builder emit method taking a variable
 number of source registers.

And start using it in fs_builder::LOAD_PAYLOAD().  This will be used
to emit logical send message opcodes which have an unusually large
number of arguments.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index c4ee9d484b7..eea1eae0129 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -306,6 +306,17 @@ namespace brw {
          }
       }
 
+      /**
+       * Create and insert an instruction with a variable number of sources
+       * into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
+           unsigned n) const
+      {
+         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
+      }
+
       /**
        * Insert a preallocated instruction into the program.
        */
@@ -518,9 +529,7 @@ namespace brw {
       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                    unsigned sources, unsigned header_size) const
       {
-         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
-                                              dispatch_width(), dst,
-                                              src, sources));
+         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
          inst->header_size = header_size;
          inst->regs_written = header_size +
                               (sources - header_size) * (dispatch_width() / 8);

From 86ae788baefefdb2fa77fe3c242ad2d81c8e834e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 16 Jul 2015 15:58:56 +0300
Subject: [PATCH 0721/1208] i965/fs: Fix return value of fs_inst::regs_read()
 for BAD_FILE.

Typically BAD_FILE sources are used to mark a source as not present
what implies that no registers are read.  This will become much more
frequent with logical send opcodes which have a large number of
sources, many of them optionally used and marked as BAD_FILE when they
aren't applicable.  It will prove to be useful to be able to rely on
the value of regs_read() regardless of whether a source is present or
not.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 9d702a6c4b2..57e4290432e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -727,6 +727,7 @@ fs_inst::regs_read(int arg) const
 
    switch (src[arg].file) {
    case BAD_FILE:
+      return 0;
    case UNIFORM:
    case IMM:
       return 1;

From 8368939e5d94f8d4ae55a1f22a755922ee77132b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 21:15:31 +0300
Subject: [PATCH 0722/1208] i965/fs: Implement pass to lower instructions of
 unsupported SIMD width.

This lowering pass implements an algorithm to expand SIMDN
instructions into a sequence of SIMDM instructions in cases where the
hardware doesn't support the original execution size natively for some
particular instruction.  The most important use-cases are:

 - Lowering send message instructions that don't support SIMD16
   natively into SIMD8 (several texturing, framebuffer write and typed
   surface operations).

 - Lowering messages that don't support SIMD8 natively into SIMD16
   (*cough*gen4*cough*).

 - 64-bit precision operations (e.g. FP64 and 64-bit integer
   multiplication).

 - SIMD32.

The algorithm works by splitting the sources of the original
instruction into chunks of width appropriate for the lowered
instructions, and then interleaving the results component-wise into
the destination of the original instruction.  The pass is controlled
by the get_lowered_simd_width() function that currently just returns
the original execution size making the whole pass a no-op for the
moment until some user is introduced.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>

v2: Reverse order of the source transformations and split_inst emit
    call to make the code a bit easier to understand.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 143 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h   |   1 +
 2 files changed, 144 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 57e4290432e..b8df4b30e24 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3223,6 +3223,148 @@ fs_visitor::lower_logical_sends()
    return progress;
 }
 
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+                       const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   default:
+      return inst->exec_size;
+   }
+}
+
+/**
+ * The \p rows array of registers represents a \p num_rows by \p num_columns
+ * matrix in row-major order, write it in column-major order into the register
+ * passed as destination.  \p stride gives the separation between matrix
+ * elements in the input in fs_builder::dispatch_width() units.
+ */
+static void
+emit_transpose(const fs_builder &bld,
+               const fs_reg &dst, const fs_reg *rows,
+               unsigned num_rows, unsigned num_columns, unsigned stride)
+{
+   fs_reg *const components = new fs_reg[num_rows * num_columns];
+
+   for (unsigned i = 0; i < num_columns; ++i) {
+      for (unsigned j = 0; j < num_rows; ++j)
+         components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+   }
+
+   bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
+
+   delete[] components;
+}
+
+bool
+fs_visitor::lower_simd_width()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+      if (lower_width != inst->exec_size) {
+         /* Builder matching the original instruction. */
+         const fs_builder ibld = bld.at(block, inst)
+                                    .exec_all(inst->force_writemask_all)
+                                    .group(inst->exec_size, inst->force_sechalf);
+
+         /* Split the copies in chunks of the execution width of either the
+          * original or the lowered instruction, whichever is lower.
+          */
+         const unsigned copy_width = MIN2(lower_width, inst->exec_size);
+         const unsigned n = inst->exec_size / copy_width;
+         const unsigned dst_size = inst->regs_written * REG_SIZE /
+            inst->dst.component_size(inst->exec_size);
+         fs_reg dsts[4];
+
+         assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
+                !inst->writes_accumulator && !inst->mlen);
+
+         for (unsigned i = 0; i < n; i++) {
+            /* Emit a copy of the original instruction with the lowered width.
+             * If the EOT flag was set throw it away except for the last
+             * instruction to avoid killing the thread prematurely.
+             */
+            fs_inst split_inst = *inst;
+            split_inst.exec_size = lower_width;
+            split_inst.eot = inst->eot && i == n - 1;
+
+            /* Set exec_all if the lowered width is higher than the original
+             * to avoid breaking the compiler invariant that no control
+             * flow-masked instruction is wider than the shader's
+             * dispatch_width.  Then transform the sources and destination and
+             * emit the lowered instruction.
+             */
+            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
+                                        .group(lower_width, i);
+
+            for (unsigned j = 0; j < inst->sources; j++) {
+               if (inst->src[j].file != BAD_FILE &&
+                   !is_uniform(inst->src[j])) {
+                  /* Get the i-th copy_width-wide chunk of the source. */
+                  const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+                  const unsigned src_size = inst->components_read(j);
+
+                  /* Use a trivial transposition to copy one every n
+                   * copy_width-wide components of the register into a
+                   * temporary passed as source to the lowered instruction.
+                   */
+                  split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
+                  emit_transpose(lbld.group(copy_width, 0),
+                                 split_inst.src[j], &src, 1, src_size, n);
+               }
+            }
+
+            if (inst->regs_written) {
+               /* Allocate enough space to hold the result of the lowered
+                * instruction and fix up the number of registers written.
+                */
+               split_inst.dst = dsts[i] =
+                  lbld.vgrf(inst->dst.type, dst_size);
+               split_inst.regs_written =
+                  DIV_ROUND_UP(inst->regs_written * lower_width,
+                               inst->exec_size);
+            }
+
+            lbld.emit(split_inst);
+         }
+
+         if (inst->regs_written) {
+            /* Distance between useful channels in the temporaries, skipping
+             * garbage if the lowered instruction is wider than the original.
+             */
+            const unsigned m = lower_width / copy_width;
+
+            /* Interleave the components of the result from the lowered
+             * instructions.  We need to set exec_all() when copying more than
+             * one half per component, because LOAD_PAYLOAD (in terms of which
+             * emit_transpose is implemented) can only use the same channel
+             * enable signals for all of its non-header sources.
+             */
+            emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
+                               .group(copy_width, 0),
+                           inst->dst, dsts, n, dst_size, m);
+         }
+
+         inst->remove(block);
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3674,6 +3816,7 @@ fs_visitor::optimize()
    int iteration = 0;
    int pass_num = 0;
 
+   OPT(lower_simd_width);
    OPT(lower_logical_sends);
 
    do {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 73559f4f15e..8467e2cee71 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -184,6 +184,7 @@ public:
    bool lower_load_payload();
    bool lower_logical_sends();
    bool lower_integer_multiplication();
+   bool lower_simd_width();
    bool opt_combine_constants();
 
    void emit_dummy_fs();

From a9f31a032b0a1068a4e2ceed9ed4680ecf13e28b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 16:14:36 +0300
Subject: [PATCH 0723/1208] i965/fs: Define logical framebuffer write opcode.

The logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects its arguments
that make up the payload separately as individual sources, like:

 fb_write_logical null, color0, color1, src0_alpha,
                        src_depth, dst_depth, sample_mask, num_components

This patch defines the opcode and usual instruction boilerplate,
including a placeholder lowering function provided mainly as
self-documentation.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h  | 15 ++++++++++
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 35 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h       |  2 +-
 src/mesa/drivers/dri/i965/brw_shader.cpp |  2 ++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index e6fdc3dd125..9f8d7337047 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -875,6 +875,21 @@ enum opcode {
     * instructions.
     */
    FS_OPCODE_FB_WRITE = 128,
+
+   /**
+    * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as
+    * individual sources instead of as a single payload blob:
+    *
+    * Source 0: [required] Color 0.
+    * Source 1: [optional] Color 1 (for dual source blend messages).
+    * Source 2: [optional] Src0 Alpha.
+    * Source 3: [optional] Source Depth (passthrough from the thread payload).
+    * Source 4: [optional] Destination Depth (gl_FragDepth).
+    * Source 5: [optional] Sample Mask (gl_SampleMask).
+    * Source 6: [required] Number of color components (as a UD immediate).
+    */
+   FS_OPCODE_FB_WRITE_LOGICAL,
+
    FS_OPCODE_BLORP_FB_WRITE,
    FS_OPCODE_REP_FB_WRITE,
    SHADER_OPCODE_RCP,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b8df4b30e24..c481d0b74cb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -678,6 +678,14 @@ fs_inst::components_read(unsigned i) const
       assert(i == 0);
       return 2;
 
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      assert(src[6].file == IMM);
+      /* First/second FB write color. */
+      if (i < 2)
+         return src[6].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
    default:
       return 1;
    }
@@ -3199,6 +3207,25 @@ fs_visitor::lower_integer_multiplication()
    return progress;
 }
 
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+                            const brw_wm_prog_data *prog_data,
+                            const brw_wm_prog_key *key,
+                            const fs_visitor::thread_payload &payload)
+{
+   assert(inst->src[6].file == IMM);
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &color0 = inst->src[0];
+   const fs_reg &color1 = inst->src[1];
+   const fs_reg &src0_alpha = inst->src[2];
+   const fs_reg &src_depth = inst->src[3];
+   const fs_reg &dst_depth = inst->src[4];
+   fs_reg sample_mask = inst->src[5];
+   const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+
+   assert(!"Not implemented");
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -3210,6 +3237,14 @@ fs_visitor::lower_logical_sends()
                                  .at(block, inst);
 
       switch (inst->opcode) {
+      case FS_OPCODE_FB_WRITE_LOGICAL:
+         assert(stage == MESA_SHADER_FRAGMENT);
+         lower_fb_write_logical_send(ibld, inst,
+                                     (const brw_wm_prog_data *)prog_data,
+                                     (const brw_wm_prog_key *)key,
+                                     payload);
+         break;
+
       default:
          continue;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8467e2cee71..3769d9fbfd0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -386,7 +386,7 @@ public:
    fs_reg result;
 
    /** Register numbers for thread payload fields. */
-   struct {
+   struct thread_payload {
       uint8_t source_depth_reg;
       uint8_t source_w_reg;
       uint8_t aa_dest_stencil_reg;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 4186432ae41..8215a2f9bb1 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -537,6 +537,8 @@ brw_instruction_name(enum opcode op)
       return opcode_descs[op].name;
    case FS_OPCODE_FB_WRITE:
       return "fb_write";
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      return "fb_write_logical";
    case FS_OPCODE_BLORP_FB_WRITE:
       return "blorp_fb_write";
    case FS_OPCODE_REP_FB_WRITE:

From fa75f2d56616cba81014d4fc02931dcfaedaf5b9 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:41:34 +0300
Subject: [PATCH 0724/1208] i965/fs: Honour the instruction force_sechalf and
 exec_size fields for FB writes.

We were previously guessing the half based on the EOT flag which seems
rather gross.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 7f5ac6b9665..55bc6db901b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -221,11 +221,11 @@ fs_generator::fire_fb_write(fs_inst *inst,
    if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
    else if (prog_data->dual_src_blend) {
-      if (dispatch_width == 8 || !inst->eot)
+      if (!inst->force_sechalf)
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
       else
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
-   } else if (dispatch_width == 16)
+   } else if (inst->exec_size == 16)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

From f68ec2baf49e37f9ce4fffe95f13177eb7225015 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:40:18 +0300
Subject: [PATCH 0725/1208] i965/fs: Make sure that the type sizes are
 compatible during copy propagation.

It's surprising that we weren't checking for this already.  A future
patch will cause code like the following to be emitted:

 MOV(16) tmp<1>:uw, src
 MOV(8) dst<1>:ud, tmp<8,8,1>:ud

The second MOV comes from the expansion of a LOAD_PAYLOAD header copy,
so I don't have control over its types.  Copy propagation will happily
turn this into:

 MOV(8) dst<1>:ud, src

Which has different semantics.  Fix it by preventing propagation in
cases where a single channel of the instruction would span several
channels of the copy (this requirement could in fact be relaxed if the
copy is just a trivial memcpy, but this case is unusual enough that I
don't think it matters in practice).

I'm deliberately only checking if the type of the instruction is
larger than the original, because the converse case seems to be
handled correctly already in the code below.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 54e9114fe6b..269bdb5b272 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -339,6 +339,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    if (entry->src.stride * inst->src[arg].stride > 4)
       return false;
 
+   /* Bail if the instruction type is larger than the execution type of the
+    * copy, what implies that each channel is reading multiple channels of the
+    * destination of the copy, and simply replacing the sources would give a
+    * program with different semantics.
+    */
+   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+      return false;
+
    /* Bail if the result of composing both strides cannot be expressed
     * as another stride. This avoids, for example, trying to transform
     * this:

From 1ad928ed9f4e7723f709f91d18d17726c92f0b7b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 15 Jul 2015 16:42:57 +0300
Subject: [PATCH 0726/1208] i965/fs: Fix slight layering violation in
 emit_single_fb_writes().

In cases where the color0 argument wasn't being provided,
emit_single_fb_writes() would take the alpha channel directly from the
visitor state instead of taking it from its arguments.  This sort of
hack didn't fit nicely into the logical send-message approach because
all parameters of the instruction have to be visible to the SIMD
lowering pass for it to be able to split them into halves at all.

Fix it by using LOAD_PAYLOAD in fs_visitor::emit_fb_writes() to
provide an actual color0 vector with undefined contents except for the
alpha component to match the previous behavior when no color buffers
are enabled.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 22 +++++++++-----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 47dc9254664..b267e8cf015 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1551,17 +1551,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 
    payload_header_size = length;
 
-   if (color0.file == BAD_FILE) {
-      /* Even if there's no color buffers enabled, we still need to send
-       * alpha out the pipeline to our null renderbuffer to support
-       * alpha-testing, alpha-to-coverage, and so on.
-       */
-      if (this->outputs[0].file != BAD_FILE)
-         setup_color_payload(&sources[length + 3],
-                             offset(this->outputs[0], bld, 3),
-                             1, exec_size, false);
-      length += 4;
-   } else if (color1.file == BAD_FILE) {
+   if (color1.file == BAD_FILE) {
       if (src0_alpha.file != BAD_FILE) {
          setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
          length++;
@@ -1709,7 +1699,15 @@ fs_visitor::emit_fb_writes()
        * alpha out the pipeline to our null renderbuffer to support
        * alpha-testing, alpha-to-coverage, and so on.
        */
-      inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
+      /* FINISHME: Factor out this frequently recurring pattern into a
+       * helper function.
+       */
+      const fs_reg srcs[] = { reg_undef, reg_undef,
+                              reg_undef, offset(this->outputs[0], bld, 3) };
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
+
+      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4,
                                   dispatch_width);
       inst->target = 0;
    }

From 6bd991a1377862e3b1b9c05e835289fff9d6785f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 15 Jul 2015 17:05:27 +0300
Subject: [PATCH 0727/1208] i965/fs: Simplify control flow in
 emit_single_fb_write().

Flatten the if ladder to match the way that the ordering of these
fields is specified in the hardware documentation a bit more closely.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 28 +++++++++++---------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index b267e8cf015..e91e3edffc9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1551,19 +1551,23 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 
    payload_header_size = length;
 
-   if (color1.file == BAD_FILE) {
-      if (src0_alpha.file != BAD_FILE) {
-         setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
-         length++;
-      }
+   if (src0_alpha.file != BAD_FILE) {
+      /* FIXME: This is being passed at the wrong location in the payload and
+       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+       * It's supposed to be immediately before oMask but there seems to be no
+       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+       * requires header sources to form a contiguous segment at the beginning
+       * of the message and src0_alpha has per-channel semantics.
+       */
+      setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
+      length++;
+   }
 
-      setup_color_payload(&sources[length], color0, components,
-                          exec_size, use_2nd_half);
-      length += 4;
-   } else {
-      setup_color_payload(&sources[length], color0, components,
-                          exec_size, use_2nd_half);
-      length += 4;
+   setup_color_payload(&sources[length], color0, components,
+                       exec_size, use_2nd_half);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
       setup_color_payload(&sources[length], color1, components,
                           exec_size, use_2nd_half);
       length += 4;

From b145855df624d0031eb2399503389948ebfcdd26 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 15 Jul 2015 18:49:55 +0300
Subject: [PATCH 0728/1208] i965/fs: Move up prog_data->uses_omask assignment
 up to brw_codegen_wm_prog().

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 --
 src/mesa/drivers/dri/i965/brw_wm.c           | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index e91e3edffc9..1313a3c0df2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1535,8 +1535,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
       length++;
    }
 
-   prog_data->uses_omask =
-      prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    if (prog_data->uses_omask) {
       assert(this->sample_mask.file != BAD_FILE);
       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 592a72927c3..b590b177ece 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -181,7 +181,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
     * so the shader definitely kills pixels.
     */
    prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func;
-
+   prog_data.uses_omask =
+      fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data.computed_depth_mode = computed_depth_mode(&fp->program);
 
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */

From b1abfc49476f0277ddee0df269b56fc3de714c4b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 15 Jul 2015 18:50:59 +0300
Subject: [PATCH 0729/1208] i965/fs: Move up Gen6 no16 check to
 emit_fb_writes().

And update the comment.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 1313a3c0df2..564d69404ba 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1572,15 +1572,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    }
 
    if (source_depth_to_render_target) {
-      if (devinfo->gen == 6) {
-	 /* For outputting oDepth on gen6, SIMD8 writes have to be
-	  * used.  This would require SIMD8 moves of each half to
-	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
-	  * Just bail on doing so for now.
-	  */
-	 no16("Missing support for simd16 depth writes on gen6\n");
-      }
-
       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth.file != BAD_FILE);
@@ -1643,6 +1634,17 @@ fs_visitor::emit_fb_writes()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    fs_inst *inst = NULL;
+
+   if (source_depth_to_render_target && devinfo->gen == 6) {
+      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
+       * would require SIMD8 moves of each half to message regs, e.g. by using
+       * the SIMD lowering pass.  Unfortunately this is more difficult than it
+       * sounds because the SIMD8 single-source message lacks channel selects
+       * for the second and third subspans.
+       */
+      no16("Missing support for simd16 depth writes on gen6\n");
+   }
+
    if (do_dual_src) {
       const fs_builder abld = bld.annotate("FB dual-source write");
 

From 98b0122e0a194e0d6c5d3eb05fd3f29a5286b3b3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 15 Jul 2015 17:31:04 +0300
Subject: [PATCH 0730/1208] i965/fs: Don't attempt to copy the useless half of
 oMask for SIMD8 FB writes.

There's no need to initialize the wrong half of oMask in the payload
when we're doing an 8-wide framebuffer write because it will be
ignored by the hardware anyway.  By doing it this way we can let the
SIMD lowering pass split the sample_mask source as a regular
per-channel source, otherwise we would have to introduce some sort of
per-instruction source query or use fs_inst::header_size for the
lowering pass to be able to find out whether some source is
header-like, and leave the source untouched in that case.

As a bonus this achieves the same purpose as the previous code without
making use of the SET_OMASK pseudo-instruction, which will be removed
in a future commit.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 26 ++++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 564d69404ba..a0dcca0c2b3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1499,6 +1499,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
    int header_size = 2, payload_header_size;
 
    /* We can potentially have a message length of up to 15, so we have to set
@@ -1536,14 +1537,24 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    }
 
    if (prog_data->uses_omask) {
-      assert(this->sample_mask.file != BAD_FILE);
-      /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
-       * it's unsinged single words, one vgrf is always 16-wide.
-       */
       sources[length] = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UW);
-      bld.exec_all().annotate("FB write oMask")
-         .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
+                               BRW_REGISTER_TYPE_UD);
+
+      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+       * relevant.  Since it's unsigned single words one vgrf is always
+       * 16-wide, but only the lower or higher 8 channels will be used by the
+       * hardware when doing a SIMD8 write depending on whether we have
+       * selected the subspans for the first or second half respectively.
+       */
+      fs_reg sample_mask = this->sample_mask;
+      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+      sample_mask.type = BRW_REGISTER_TYPE_UW;
+      sample_mask.stride *= 2;
+
+      ubld.annotate("FB write oMask")
+          .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+                    use_2nd_half),
+               half(sample_mask, use_2nd_half));
       length++;
    }
 
@@ -1590,7 +1601,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    if (payload.dest_depth_reg)
       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
 
-   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
    fs_inst *load;
    fs_inst *write;
    if (devinfo->gen >= 7) {

From cecf738b0fbe8ebafe304c717e847f1d3f41d3ca Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 14:49:27 +0300
Subject: [PATCH 0731/1208] i965/fs: Remove the FS_OPCODE_SET_OMASK
 pseudo-opcode.

This is now unused.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h       |  1 -
 src/mesa/drivers/dri/i965/brw_fs.h            |  4 ---
 .../drivers/dri/i965/brw_fs_generator.cpp     | 35 -------------------
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  2 --
 4 files changed, 42 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 9f8d7337047..2af7413d993 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -984,7 +984,6 @@ enum opcode {
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
-   FS_OPCODE_SET_OMASK,
    FS_OPCODE_SET_SAMPLE_ID,
    FS_OPCODE_SET_SIMD4X2_OFFSET,
    FS_OPCODE_PACK_HALF_2x16_SPLIT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 3769d9fbfd0..dd38940081c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -495,10 +495,6 @@ private:
                                           struct brw_reg msg_data,
                                           unsigned msg_type);
 
-   void generate_set_omask(fs_inst *inst,
-                           struct brw_reg dst,
-                           struct brw_reg sample_mask);
-
    void generate_set_sample_id(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 55bc6db901b..a176fcf9c41 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1364,37 +1364,6 @@ fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
    brw_pop_insn_state(p);
 }
 
-/* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
- * (when mask is passed as a uniform) of register mask before moving it
- * to register dst.
- */
-void
-fs_generator::generate_set_omask(fs_inst *inst,
-                                 struct brw_reg dst,
-                                 struct brw_reg mask)
-{
-   bool stride_8_8_1 =
-    (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
-     mask.width == BRW_WIDTH_8 &&
-     mask.hstride == BRW_HORIZONTAL_STRIDE_1);
-
-   bool stride_0_1_0 = has_scalar_region(mask);
-
-   assert(stride_8_8_1 || stride_0_1_0);
-   assert(dst.type == BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-
-   if (stride_8_8_1) {
-      brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
-   } else if (stride_0_1_0) {
-      brw_MOV(p, dst, retype(mask, dst.type));
-   }
-   brw_pop_insn_state(p);
-}
-
 /* Sets vstride=1, width=4, hstride=0 of register src1 during
  * the ADD instruction.
  */
@@ -2074,10 +2043,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
-      case FS_OPCODE_SET_OMASK:
-         generate_set_omask(inst, dst, src[0]);
-         break;
-
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 8215a2f9bb1..1e660d71cfc 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -664,8 +664,6 @@ brw_instruction_name(enum opcode op)
    case FS_OPCODE_DISCARD_JUMP:
       return "discard_jump";
 
-   case FS_OPCODE_SET_OMASK:
-      return "set_omask";
    case FS_OPCODE_SET_SAMPLE_ID:
       return "set_sample_id";
    case FS_OPCODE_SET_SIMD4X2_OFFSET:

From 633938afd349f2b423146969688c11f1e29ca17a Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 21:19:28 +0300
Subject: [PATCH 0732/1208] i965/fs: Hook up SIMD lowering to unroll FB writes
 of unsupported width.

This shouldn't have any effect because we don't emit logical
framebuffer writes yet.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c481d0b74cb..13cdf31900b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3269,6 +3269,15 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
                        const fs_inst *inst)
 {
    switch (inst->opcode) {
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+       * here.
+       */
+      assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+             inst->exec_size == 8);
+      /* Dual-source FB writes are unsupported in SIMD16 mode. */
+      return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+
    default:
       return inst->exec_size;
    }

From 59e7e6f7a21f13ff8963cf21af2e969f1f7961f5 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 17:59:34 +0300
Subject: [PATCH 0733/1208] i965/fs: Implement lowering of logical framebuffer
 writes.

This does essentially the same thing as
fs_visitor::emit_single_fb_write(), with some slight differences:

 - We don't have to worry about exec_size and use_2nd_half anymore,
   16-wide sources have already been lowered to 8-wide thanks to the
   previous commit and the manual argument unzipping is no longer
   required.

 - The src/dst_depth and sample_mask values are now explicit sources
   of the instruction instead of being taken from the visitor state
   directly.  The same goes for the kill-pixel mask that will be
   passed to the instruction explicitly as predicate.

 - Everything is now done in static functions to improve
   encapsulation.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 137 ++++++++++++++++++++++++++-
 1 file changed, 136 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 13cdf31900b..936f0d19706 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3207,6 +3207,25 @@ fs_visitor::lower_integer_multiplication()
    return progress;
 }
 
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+                    fs_reg *dst, fs_reg color, unsigned components)
+{
+   if (key->clamp_fragment_color) {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+      assert(color.type == BRW_REGISTER_TYPE_F);
+
+      for (unsigned i = 0; i < components; i++)
+         set_saturate(true,
+                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
+
+      color = tmp;
+   }
+
+   for (unsigned i = 0; i < components; i++)
+      dst[i] = offset(color, bld, i);
+}
+
 static void
 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                             const brw_wm_prog_data *prog_data,
@@ -3223,7 +3242,123 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    fs_reg sample_mask = inst->src[5];
    const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
 
-   assert(!"Not implemented");
+   /* We can potentially have a message length of up to 15, so we have to set
+    * base_mrf to either 0 or 1 in order to fit in m0..m15.
+    */
+   fs_reg sources[15];
+   int header_size = 2, payload_header_size;
+   unsigned length = 0;
+
+   /* From the Sandy Bridge PRM, volume 4, page 198:
+    *
+    *     "Dispatched Pixel Enables. One bit per pixel indicating
+    *      which pixels were originally enabled when the thread was
+    *      dispatched. This field is only required for the end-of-
+    *      thread message and on all dual-source messages."
+    */
+   if (devinfo->gen >= 6 &&
+       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+       color1.file == BAD_FILE &&
+       key->nr_color_regions == 1) {
+      header_size = 0;
+   }
+
+   if (header_size != 0) {
+      assert(header_size == 2);
+      /* Allocate 2 registers for a header */
+      length += 2;
+   }
+
+   if (payload.aa_dest_stencil_reg) {
+      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(sources[length],
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+      length++;
+   }
+
+   if (prog_data->uses_omask) {
+      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+                               BRW_REGISTER_TYPE_UD);
+
+      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+       * relevant.  Since it's unsigned single words one vgrf is always
+       * 16-wide, but only the lower or higher 8 channels will be used by the
+       * hardware when doing a SIMD8 write depending on whether we have
+       * selected the subspans for the first or second half respectively.
+       */
+      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+      sample_mask.type = BRW_REGISTER_TYPE_UW;
+      sample_mask.stride *= 2;
+
+      bld.exec_all().annotate("FB write oMask")
+         .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+                   inst->force_sechalf),
+              sample_mask);
+      length++;
+   }
+
+   payload_header_size = length;
+
+   if (src0_alpha.file != BAD_FILE) {
+      /* FIXME: This is being passed at the wrong location in the payload and
+       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+       * It's supposed to be immediately before oMask but there seems to be no
+       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+       * requires header sources to form a contiguous segment at the beginning
+       * of the message and src0_alpha has per-channel semantics.
+       */
+      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+      length++;
+   }
+
+   setup_color_payload(bld, key, &sources[length], color0, components);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
+      setup_color_payload(bld, key, &sources[length], color1, components);
+      length += 4;
+   }
+
+   if (src_depth.file != BAD_FILE) {
+      sources[length] = src_depth;
+      length++;
+   }
+
+   if (dst_depth.file != BAD_FILE) {
+      sources[length] = dst_depth;
+      length++;
+   }
+
+   fs_inst *load;
+   if (devinfo->gen >= 7) {
+      /* Send from the GRF */
+      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
+      payload.reg = bld.shader->alloc.allocate(load->regs_written);
+      load->dst = payload;
+
+      inst->src[0] = payload;
+      inst->resize_sources(1);
+      inst->base_mrf = -1;
+   } else {
+      /* Send from the MRF */
+      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+                              sources, length, payload_header_size);
+
+      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
+       * will do this for us if we just give it a COMPR4 destination.
+       */
+      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+         load->dst.reg |= BRW_MRF_COMPR4;
+
+      inst->resize_sources(0);
+      inst->base_mrf = 1;
+   }
+
+   inst->opcode = FS_OPCODE_FB_WRITE;
+   inst->mlen = load->regs_written;
+   inst->header_size = header_size;
 }
 
 bool

From f18792aa10cedba2034762eade816c4c77ca46c6 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 16 Jul 2015 16:12:48 +0300
Subject: [PATCH 0734/1208] i965/fs: Reimplement emit_single_fb_write() in
 terms of logical framebuffer writes.

The only non-trivial thing it still has to do is figure out where to
take the src/dst depth values from and predicate the instruction if
discard is in use.  The manual SIMD unrolling logic in the dual-source
case goes away because this is now handled transparently by the SIMD
lowering pass.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |   5 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 203 ++-----------------
 2 files changed, 21 insertions(+), 187 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index dd38940081c..bd93a6a3746 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -290,13 +290,10 @@ public:
    bool optimize_frontfacing_ternary(nir_alu_instr *instr,
                                      const fs_reg &result);
 
-   void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-                            unsigned exec_size, bool use_2nd_half);
    void emit_alpha_test();
    fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                  fs_reg color1, fs_reg color2,
-                                 fs_reg src0_alpha, unsigned components,
-                                 unsigned exec_size, bool use_2nd_half = false);
+                                 fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
    void emit_urb_writes();
    void emit_cs_terminate();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index a0dcca0c2b3..ef8e967cfb2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1409,33 +1409,6 @@ fs_visitor::emit_interpolation_setup_gen6()
    }
 }
 
-void
-fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-                                unsigned exec_size, bool use_2nd_half)
-{
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   fs_inst *inst;
-
-   if (key->clamp_fragment_color) {
-      fs_reg tmp = vgrf(glsl_type::vec4_type);
-      assert(color.type == BRW_REGISTER_TYPE_F);
-      for (unsigned i = 0; i < components; i++) {
-         inst = bld.MOV(offset(tmp, bld, i), offset(color, bld, i));
-         inst->saturate = true;
-      }
-      color = tmp;
-   }
-
-   if (exec_size < dispatch_width) {
-      unsigned half_idx = use_2nd_half ? 1 : 0;
-      for (unsigned i = 0; i < components; i++)
-         dst[i] = half(offset(color, bld, i), half_idx);
-   } else {
-      for (unsigned i = 0; i < components; i++)
-         dst[i] = offset(color, bld, i);
-   }
-}
-
 static enum brw_conditional_mod
 cond_for_alpha_func(GLenum func)
 {
@@ -1493,146 +1466,36 @@ fs_visitor::emit_alpha_test()
 fs_inst *
 fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                  fs_reg color0, fs_reg color1,
-                                 fs_reg src0_alpha, unsigned components,
-                                 unsigned exec_size, bool use_2nd_half)
+                                 fs_reg src0_alpha, unsigned components)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
-   int header_size = 2, payload_header_size;
 
-   /* We can potentially have a message length of up to 15, so we have to set
-    * base_mrf to either 0 or 1 in order to fit in m0..m15.
-    */
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
-   int length = 0;
-
-   /* From the Sandy Bridge PRM, volume 4, page 198:
-    *
-    *     "Dispatched Pixel Enables. One bit per pixel indicating
-    *      which pixels were originally enabled when the thread was
-    *      dispatched. This field is only required for the end-of-
-    *      thread message and on all dual-source messages."
-    */
-   if (devinfo->gen >= 6 &&
-       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
-       color1.file == BAD_FILE &&
-       key->nr_color_regions == 1) {
-      header_size = 0;
-   }
-
-   if (header_size != 0) {
-      assert(header_size == 2);
-      /* Allocate 2 registers for a header */
-      length += 2;
-   }
-
-   if (payload.aa_dest_stencil_reg) {
-      sources[length] = fs_reg(GRF, alloc.allocate(1));
-      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
-         .MOV(sources[length],
-              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
-      length++;
-   }
-
-   if (prog_data->uses_omask) {
-      sources[length] = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UD);
-
-      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
-       * relevant.  Since it's unsigned single words one vgrf is always
-       * 16-wide, but only the lower or higher 8 channels will be used by the
-       * hardware when doing a SIMD8 write depending on whether we have
-       * selected the subspans for the first or second half respectively.
-       */
-      fs_reg sample_mask = this->sample_mask;
-      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
-      sample_mask.type = BRW_REGISTER_TYPE_UW;
-      sample_mask.stride *= 2;
-
-      ubld.annotate("FB write oMask")
-          .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
-                    use_2nd_half),
-               half(sample_mask, use_2nd_half));
-      length++;
-   }
-
-   payload_header_size = length;
-
-   if (src0_alpha.file != BAD_FILE) {
-      /* FIXME: This is being passed at the wrong location in the payload and
-       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
-       * It's supposed to be immediately before oMask but there seems to be no
-       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
-       * requires header sources to form a contiguous segment at the beginning
-       * of the message and src0_alpha has per-channel semantics.
-       */
-      setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
-      length++;
-   }
-
-   setup_color_payload(&sources[length], color0, components,
-                       exec_size, use_2nd_half);
-   length += 4;
-
-   if (color1.file != BAD_FILE) {
-      setup_color_payload(&sources[length], color1, components,
-                          exec_size, use_2nd_half);
-      length += 4;
-   }
+   /* Hand over gl_FragDepth or the payload depth. */
+   const fs_reg dst_depth = (payload.dest_depth_reg ?
+                             fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) :
+                             fs_reg());
+   fs_reg src_depth;
 
    if (source_depth_to_render_target) {
-      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-	 /* Hand over gl_FragDepth. */
-	 assert(this->frag_depth.file != BAD_FILE);
-         if (exec_size < dispatch_width) {
-            sources[length] = half(this->frag_depth, use_2nd_half);
-         } else {
-            sources[length] = this->frag_depth;
-         }
-      } else {
-	 /* Pass through the payload depth. */
-         sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
-      }
-      length++;
+      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+         src_depth = frag_depth;
+      else
+         src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
    }
 
-   if (payload.dest_depth_reg)
-      sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
+   const fs_reg sources[] = {
+      color0, color1, src0_alpha, src_depth, dst_depth, sample_mask,
+      fs_reg(components)
+   };
+   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
+                             sources, ARRAY_SIZE(sources));
 
-   fs_inst *load;
-   fs_inst *write;
-   if (devinfo->gen >= 7) {
-      /* Send from the GRF */
-      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
-      load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
-      payload.reg = alloc.allocate(load->regs_written);
-      load->dst = payload;
-      write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
-      write->base_mrf = -1;
-   } else {
-      /* Send from the MRF */
-      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
-                               sources, length, payload_header_size);
-
-      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
-       * will do this for us if we just give it a COMPR4 destination.
-       */
-      if (devinfo->gen < 6 && exec_size == 16)
-         load->dst.reg |= BRW_MRF_COMPR4;
-
-      write = ubld.emit(FS_OPCODE_FB_WRITE);
-      write->exec_size = exec_size;
-      write->base_mrf = 1;
-   }
-
-   write->mlen = load->regs_written;
-   write->header_size = header_size;
    if (prog_data->uses_kill) {
       write->predicate = BRW_PREDICATE_NORMAL;
       write->flag_subreg = 1;
    }
+
    return write;
 }
 
@@ -1659,33 +1522,9 @@ fs_visitor::emit_fb_writes()
       const fs_builder abld = bld.annotate("FB dual-source write");
 
       inst = emit_single_fb_write(abld, this->outputs[0],
-                                  this->dual_src_output, reg_undef, 4, 8);
+                                  this->dual_src_output, reg_undef, 4);
       inst->target = 0;
 
-      /* SIMD16 dual source blending requires to send two SIMD8 dual source
-       * messages, where each message contains color data for 8 pixels. Color
-       * data for the first group of pixels is stored in the "lower" half of
-       * the color registers, so in SIMD16, the previous message did:
-       * m + 0: r0
-       * m + 1: g0
-       * m + 2: b0
-       * m + 3: a0
-       *
-       * Here goes the second message, which packs color data for the
-       * remaining 8 pixels. Color data for these pixels is stored in the
-       * "upper" half of the color registers, so we need to do:
-       * m + 0: r1
-       * m + 1: g1
-       * m + 2: b1
-       * m + 3: a1
-       */
-      if (dispatch_width == 16) {
-         inst = emit_single_fb_write(abld, this->outputs[0],
-                                     this->dual_src_output, reg_undef, 4, 8,
-                                     true);
-         inst->target = 0;
-      }
-
       prog_data->dual_src_blend = true;
    } else {
       for (int target = 0; target < key->nr_color_regions; target++) {
@@ -1702,8 +1541,7 @@ fs_visitor::emit_fb_writes()
 
          inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                      src0_alpha,
-                                     this->output_components[target],
-                                     dispatch_width);
+                                     this->output_components[target]);
          inst->target = target;
       }
    }
@@ -1721,8 +1559,7 @@ fs_visitor::emit_fb_writes()
       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
       bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
 
-      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4,
-                                  dispatch_width);
+      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
       inst->target = 0;
    }
 

From 33deff4f0582d2c073d34d4d6ec8344d2b1fbf7d Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 21 Jul 2015 18:42:27 +0300
Subject: [PATCH 0735/1208] i965/fs: Define logical texture sampling opcodes.

Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:

 tex_logical dst, coordinates, shadow_c, lod, lod2,
                  sample_index, mcs, sampler, offset,
                  num_coordinate_components, num_grad_components

This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h  | 31 ++++++++
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 92 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp | 25 +++++++
 3 files changed, 148 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 2af7413d993..373d6b8f8ab 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -903,18 +903,49 @@ enum opcode {
    SHADER_OPCODE_SIN,
    SHADER_OPCODE_COS,
 
+   /**
+    * Texture sampling opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources:
+    *
+    * Source 0: [optional] Texture coordinates.
+    * Source 1: [optional] Shadow comparitor.
+    * Source 2: [optional] dPdx if the operation takes explicit derivatives,
+    *                      otherwise LOD value.
+    * Source 3: [optional] dPdy if the operation takes explicit derivatives.
+    * Source 4: [optional] Sample index.
+    * Source 5: [optional] MCS data.
+    * Source 6: [required] Texture sampler.
+    * Source 7: [optional] Texel offset.
+    * Source 8: [required] Number of coordinate components (as UD immediate).
+    * Source 9: [required] Number derivative components (as UD immediate).
+    */
    SHADER_OPCODE_TEX,
+   SHADER_OPCODE_TEX_LOGICAL,
    SHADER_OPCODE_TXD,
+   SHADER_OPCODE_TXD_LOGICAL,
    SHADER_OPCODE_TXF,
+   SHADER_OPCODE_TXF_LOGICAL,
    SHADER_OPCODE_TXL,
+   SHADER_OPCODE_TXL_LOGICAL,
    SHADER_OPCODE_TXS,
+   SHADER_OPCODE_TXS_LOGICAL,
    FS_OPCODE_TXB,
+   FS_OPCODE_TXB_LOGICAL,
    SHADER_OPCODE_TXF_CMS,
+   SHADER_OPCODE_TXF_CMS_LOGICAL,
    SHADER_OPCODE_TXF_UMS,
+   SHADER_OPCODE_TXF_UMS_LOGICAL,
    SHADER_OPCODE_TXF_MCS,
+   SHADER_OPCODE_TXF_MCS_LOGICAL,
    SHADER_OPCODE_LOD,
+   SHADER_OPCODE_LOD_LOGICAL,
    SHADER_OPCODE_TG4,
+   SHADER_OPCODE_TG4_LOGICAL,
    SHADER_OPCODE_TG4_OFFSET,
+   SHADER_OPCODE_TG4_OFFSET_LOGICAL,
 
    /**
     * Combines multiple sources of size 1 into a larger virtual GRF.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 936f0d19706..d9096f1b4cf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -686,6 +686,31 @@ fs_inst::components_read(unsigned i) const
       else
          return 1;
 
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      assert(src[8].file == IMM && src[9].file == IMM);
+      /* Texture coordinates. */
+      if (i == 0)
+         return src[8].fixed_hw_reg.dw1.ud;
+      /* Texture derivatives. */
+      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[9].fixed_hw_reg.dw1.ud;
+      /* Texture offset. */
+      else if (i == 7)
+         return 2;
+      else
+         return 1;
+
    default:
       return 1;
    }
@@ -3361,6 +3386,25 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->header_size = header_size;
 }
 
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &coordinate = inst->src[0];
+   const fs_reg &shadow_c = inst->src[1];
+   const fs_reg &lod = inst->src[2];
+   const fs_reg &lod2 = inst->src[3];
+   const fs_reg &sample_index = inst->src[4];
+   const fs_reg &mcs = inst->src[5];
+   const fs_reg &sampler = inst->src[6];
+   const fs_reg &offset_value = inst->src[7];
+   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+
+   assert(!"Not implemented");
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -3380,6 +3424,54 @@ fs_visitor::lower_logical_sends()
                                      payload);
          break;
 
+      case SHADER_OPCODE_TEX_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+         break;
+
+      case SHADER_OPCODE_TXD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+         break;
+
+      case SHADER_OPCODE_TXF_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+         break;
+
+      case SHADER_OPCODE_TXL_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+         break;
+
+      case SHADER_OPCODE_TXS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+         break;
+
+      case FS_OPCODE_TXB_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+         break;
+
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+         break;
+
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+         break;
+
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+         break;
+
+      case SHADER_OPCODE_LOD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+         break;
+
+      case SHADER_OPCODE_TG4_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+         break;
+
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+         break;
+
       default:
          continue;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 1e660d71cfc..9e3a9740d74 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -567,28 +567,53 @@ brw_instruction_name(enum opcode op)
 
    case SHADER_OPCODE_TEX:
       return "tex";
+   case SHADER_OPCODE_TEX_LOGICAL:
+      return "tex_logical";
    case SHADER_OPCODE_TXD:
       return "txd";
+   case SHADER_OPCODE_TXD_LOGICAL:
+      return "txd_logical";
    case SHADER_OPCODE_TXF:
       return "txf";
+   case SHADER_OPCODE_TXF_LOGICAL:
+      return "txf_logical";
    case SHADER_OPCODE_TXL:
       return "txl";
+   case SHADER_OPCODE_TXL_LOGICAL:
+      return "txl_logical";
    case SHADER_OPCODE_TXS:
       return "txs";
+   case SHADER_OPCODE_TXS_LOGICAL:
+      return "txs_logical";
    case FS_OPCODE_TXB:
       return "txb";
+   case FS_OPCODE_TXB_LOGICAL:
+      return "txb_logical";
    case SHADER_OPCODE_TXF_CMS:
       return "txf_cms";
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+      return "txf_cms_logical";
    case SHADER_OPCODE_TXF_UMS:
       return "txf_ums";
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+      return "txf_ums_logical";
    case SHADER_OPCODE_TXF_MCS:
       return "txf_mcs";
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+      return "txf_mcs_logical";
    case SHADER_OPCODE_LOD:
       return "lod";
+   case SHADER_OPCODE_LOD_LOGICAL:
+      return "lod_logical";
    case SHADER_OPCODE_TG4:
       return "tg4";
+   case SHADER_OPCODE_TG4_LOGICAL:
+      return "tg4_logical";
    case SHADER_OPCODE_TG4_OFFSET:
       return "tg4_offset";
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return "tg4_offset_logical";
+
    case SHADER_OPCODE_SHADER_TIME_ADD:
       return "shader_time_add";
 

From 8fbb3d3569e6d353dee6e558eb9fd961b5a8a12c Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:42:20 +0300
Subject: [PATCH 0736/1208] i965/fs: Use exec_size instead of dispatch_width to
 determine the message variant.

dispatch_width is global for a single compilation and doesn't
necessarily match the desired execution width if we had to lower the
original full-width instruction due to hardware limitations.  These
were all inside a Gen4-specific branch so this patch shouldn't have
any effect on more recent hardware.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index a176fcf9c41..811fb738058 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -655,7 +655,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 /* Note that G45 and older determines shadow compare and dispatch width
 	  * from message length for most messages.
 	  */
-         if (dispatch_width == 8) {
+         if (inst->exec_size == 8) {
             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
             if (inst->shadow_compare) {
                assert(inst->mlen == 6);
@@ -674,7 +674,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case FS_OPCODE_TXB:
 	 if (inst->shadow_compare) {
-            assert(dispatch_width == 8);
+            assert(inst->exec_size == 8);
 	    assert(inst->mlen == 6);
 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 	 } else {
@@ -685,7 +685,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case SHADER_OPCODE_TXL:
 	 if (inst->shadow_compare) {
-            assert(dispatch_width == 8);
+            assert(inst->exec_size == 8);
 	    assert(inst->mlen == 6);
 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 	 } else {
@@ -696,7 +696,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case SHADER_OPCODE_TXD:
 	 /* There is no sample_d_c message; comparisons are done manually */
-         assert(dispatch_width == 8);
+         assert(inst->exec_size == 8);
 	 assert(inst->mlen == 7 || inst->mlen == 10);
 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 	 break;

From 44a8cf488e0370d7e5abe363c1fd2d21247a6e32 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:33:04 +0300
Subject: [PATCH 0737/1208] i965/fs: Fix opt_zero_samples() for texturing ops
 not matching dispatch_width.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d9096f1b4cf..5ca476d7bd2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2162,11 +2162,11 @@ fs_visitor::opt_zero_samples()
        *     "Parameter 0 is required except for the sampleinfo message, which
        *      has no parameter 0"
        */
-      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
              load_payload->src[(inst->mlen - inst->header_size) /
-                               (dispatch_width / 8) +
+                               (inst->exec_size / 8) +
                                inst->header_size - 1].is_zero()) {
-         inst->mlen -= dispatch_width / 8;
+         inst->mlen -= inst->exec_size / 8;
          progress = true;
       }
    }

From fc2273a3400963e478582ee1efbfc8cdaae3eae7 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 17 Jul 2015 18:46:21 +0300
Subject: [PATCH 0738/1208] i965/fs: Pass a BAD_FILE header source to
 LOAD_PAYLOAD in emit_texture_gen7().

So that it's left uninitialized by LOAD_PAYLOAD, we only need to
reserve space for it in the message since it will be initialized
implicitly by the generator.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index ef8e967cfb2..09beb4d21c1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -483,7 +483,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        * need to offset the Sampler State Pointer in the header.
        */
       header_size = 1;
-      sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      sources[0] = fs_reg();
       length++;
    }
 

From a69332a31243a7733dab926b765964ba6df827b2 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 24 Jul 2015 16:41:19 +0300
Subject: [PATCH 0739/1208] i965/fs: Fix misleading comment regarding the
 message header in emit_texture_gen7.

This hasn't been overallocating space for the header for a long time.
It still leaves the header uninitialized though until the generator
fixes it.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 09beb4d21c1..5977f6de5a7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -473,8 +473,9 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
        is_high_sampler(devinfo, sampler)) {
       /* For general texture offsets (no txf workaround), we need a header to
-       * put them in.  Note that for SIMD16 we're making space for two actual
-       * hardware registers here, so the emit will have to fix up for this.
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
        *
        * * ir4_tg4 needs to place its channel select in the header,
        * for interaction with ARB_texture_swizzle

From 8be01e3548bdd900b7cadb5c9a77e52b01151cfe Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 18:08:51 +0300
Subject: [PATCH 0740/1208] i965/fs: Implement lowering of logical texturing
 opcodes on Gen7+.

This should be largely equivalent to emit_texture_gen7() except that
we now get i965 sampling opcodes directly rather than
ir_texture_opcode enum values.  The mapping is as follows:

 - ir_tex -> SHADER_OPCODE_TEX
 - ir_txb -> FS_OPCODE_TXB
 - ir_txl -> SHADER_OPCODE_TXL
 - ir_txd -> SHADER_OPCODE_TXD
 - ir_txf -> SHADER_OPCODE_TXF
 - ir_txf_ms -> SHADER_OPCODE_TXF_CMS
 - ir_txs -> SHADER_OPCODE_TXS
 - ir_query_levels -> SHADER_OPCODE_TXS too, the visitor will make
                      sure that the provided lod value is zero in this
                      case.
 - ir_lod -> SHADER_OPCODE_LOD
 - ir_tg4 -> SHADER_OPCODE_TG4_OFFSET if the offset value is not
             immediate, SHADER_OPCODE_TG4 otherwise.

Other than that there are only minor changes and style fixes like the
implementation now being factored out in static functions to improve
encapsulation.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 217 ++++++++++++++++++++++++++-
 1 file changed, 216 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5ca476d7bd2..3149f20ebf9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3386,6 +3386,214 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->header_size = header_size;
 }
 
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+   if (devinfo->gen < 8 && !devinfo->is_haswell)
+      return false;
+
+   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &mcs, const fs_reg &sampler,
+                                fs_reg offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   int reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0, length = 0;
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+       offset_value.file != BAD_FILE ||
+       is_high_sampler(devinfo, sampler)) {
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
+       *
+       * TG4 needs to place its channel select in the header, for interaction
+       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+       * larger sampler numbers we need to offset the Sampler State Pointer in
+       * the header.
+       */
+      header_size = 1;
+      sources[0] = fs_reg();
+      length++;
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      bld.MOV(sources[length], shadow_c);
+      length++;
+   }
+
+   bool coordinate_done = false;
+
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and
+    * hardcode the LOD to 0.
+    */
+   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+       op == SHADER_OPCODE_TEX) {
+      op = SHADER_OPCODE_TXL;
+      lod = fs_reg(0.0f);
+   }
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+      bld.MOV(sources[length], lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXD:
+      /* TXD should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* Load dPdx and the coordinate together:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < grad_components) {
+            bld.MOV(sources[length], lod);
+            lod = offset(lod, bld, 1);
+            length++;
+
+            bld.MOV(sources[length], lod2);
+            lod2 = offset(lod2, bld, 1);
+            length++;
+         }
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXF:
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+       * On Gen9 they are u, v, lod, r
+       */
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+      length++;
+
+      if (devinfo->gen >= 9) {
+         if (coord_components >= 2) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+            coordinate = offset(coordinate, bld, 1);
+         }
+         length++;
+      }
+
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+      length++;
+
+      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+      length++;
+
+      /* Data from the multisample control surface. */
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+      length++;
+
+      /* There is no offsetting for this message; just copy in the integer
+       * texture coordinates.
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TG4_OFFSET:
+      /* gather4_po_c should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
+
+      /* More crazy intermixing */
+      for (unsigned i = 0; i < 2; i++) { /* u, v */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
+         offset_value = offset(offset_value, bld, 1);
+         length++;
+      }
+
+      if (coord_components == 3) { /* r if present */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   default:
+      break;
+   }
+
+   /* Set up the coordinate (except for cases where it was done above) */
+   if (!coordinate_done) {
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+   }
+
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_size;
+   else
+      mlen = length * reg_width;
+
+   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+                                     BRW_REGISTER_TYPE_F);
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+   /* Generate the SEND. */
+   inst->opcode = op;
+   inst->src[0] = src_payload;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = -1;
+   inst->mlen = mlen;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
 static void
 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
 {
@@ -3402,7 +3610,14 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
    const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
 
-   assert(!"Not implemented");
+   if (devinfo->gen >= 7) {
+      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      mcs, sampler, offset_value,
+                                      coord_components, grad_components);
+   } else {
+      assert(!"Not implemented");
+   }
 }
 
 bool

From 03582f95b256e483fc1b0d78bd6a49203a448a23 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 17 Jul 2015 18:50:27 +0300
Subject: [PATCH 0741/1208] i965/fs: Lower SHADER_OPCODE_TXF_UMS/MCS_LOGICAL
 too on Gen7+.

These weren't being handled by emit_texture_gen7() but we can easily
lower them here for consistency with other texturing opcodes.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 3149f20ebf9..bd48decdc6a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3515,12 +3515,18 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       coordinate_done = true;
       break;
    case SHADER_OPCODE_TXF_CMS:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
-      length++;
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+         length++;
+      }
 
-      /* Data from the multisample control surface. */
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
-      length++;
+      if (op == SHADER_OPCODE_TXF_CMS) {
+         /* Data from the multisample control surface. */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         length++;
+      }
 
       /* There is no offsetting for this message; just copy in the integer
        * texture coordinates.

From 501134b9fe02633ca0cdda66a9b670ae38e791f7 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 18 Jul 2015 16:52:06 +0300
Subject: [PATCH 0742/1208] i965/fs: Implement lowering of logical texturing
 opcodes on Gen5-6.

This should be largely equivalent to emit_texture_gen5() except for
slight codestyle changes and the use i965 opcodes instead of the
ir_texture_opcode enum, see "i965/fs: Implement lowering of logical
texturing opcodes on Gen7+." for the mapping between them.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 103 +++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bd48decdc6a..f36933d485d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3386,6 +3386,104 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->header_size = header_size;
 }
 
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &sampler,
+                                const fs_reg &offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+   fs_reg msg_coords = message;
+   unsigned header_size = 0;
+
+   if (offset_value.file != BAD_FILE) {
+      /* The offsets set up by the visitor are in the m1 header, so we can't
+       * go headerless.
+       */
+      header_size = 1;
+      message.reg--;
+   }
+
+   for (unsigned i = 0; i < coord_components; i++) {
+      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+   }
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+      fs_reg msg_shadow = msg_lod;
+      bld.MOV(msg_shadow, shadow_c);
+      msg_lod = offset(msg_shadow, bld, 1);
+      msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXD:
+      /**
+       *  P   =  u,    v,    r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * Load up these values:
+       * - dudx   dudy   dvdx   dvdy   drdx   drdy
+       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+       */
+      msg_end = msg_lod;
+      for (unsigned i = 0; i < grad_components; i++) {
+         bld.MOV(msg_end, lod);
+         lod = offset(lod, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+
+         bld.MOV(msg_end, lod2);
+         lod2 = offset(lod2, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+      }
+      break;
+   case SHADER_OPCODE_TXS:
+      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF:
+      msg_lod = offset(msg_coords, bld, 3);
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+      msg_lod = offset(msg_coords, bld, 3);
+      /* lod */
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+      /* sample index */
+      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
+      break;
+   default:
+      break;
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = message.reg;
+   inst->mlen = msg_end.reg - message.reg;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
 static bool
 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
 {
@@ -3621,6 +3719,11 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
                                       shadow_c, lod, lod2, sample_index,
                                       mcs, sampler, offset_value,
                                       coord_components, grad_components);
+   } else if (devinfo->gen >= 5) {
+      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      sampler, offset_value,
+                                      coord_components, grad_components);
    } else {
       assert(!"Not implemented");
    }

From 2cd466f6c3192015ea1794afc57eb453d7f13818 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 18 Jul 2015 17:09:37 +0300
Subject: [PATCH 0743/1208] i965/fs: Implement lowering of logical texturing
 opcodes on Gen4.

Unlike its Gen5 and Gen7 counterparts this patch isn't a plain
refactor of the previous Gen4 texturing code, it's more of a rewrite
largely based on emit_texture_gen4_simd16().  The reason is that on
the one hand the original emit_texture_gen4() code didn't seem easily
fixable to be SIMD width-invariant and had plenty of clutter to
support SIMD-width workarounds which are no longer required.  On the
other hand emit_texture_gen4_simd16() was missing a number of
SIMD8-only opcodes.  This should generalize both and roughly match
their current behaviour where there is overlap.

Incidentally this will fix the following piglits on Gen4:

    arb_shader_texture_lod.execution.arb_shader_texture_lod-texgrad
    arb_shader_texture_lod.execution.tex-miplevel-selection *gradarb 2d
    arb_shader_texture_lod.execution.tex-miplevel-selection *gradarb 3d
    arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 2d
    arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 2d_projvec4
    arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 3d

Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 108 ++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f36933d485d..77d4a7d7b66 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3386,6 +3386,110 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->header_size = header_size;
 }
 
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+
+   /* g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+    * require all three components to be present and zero if they are unused.
+    */
+   if (coord_components > 0 &&
+       (has_lod || shadow_c.file != BAD_FILE ||
+        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+      for (unsigned i = coord_components; i < 3; i++)
+         bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+
+      msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+      /* TXD unsupported in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* the slots for u and v are always present, but r is optional */
+      if (coord_components < 2)
+         msg_end = offset(msg_end, bld, 2 - coord_components);
+
+      /*  P   = u, v, r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * 1-arg: Does not exist.
+       *
+       * 2-arg: dudx   dvdx   dudy   dvdy
+       *        dPdx.x dPdx.y dPdy.x dPdy.y
+       *        m4     m5     m6     m7
+       *
+       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+       *        m5     m6     m7     m8     m9     m10
+       */
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+      /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
+       * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+       */
+      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+             bld.dispatch_width() == 16);
+
+      const brw_reg_type type =
+         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+      bld.MOV(retype(msg_end, type), lod);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+         /* There's no plain shadow compare message, so we use shadow
+          * compare with a bias of 0.0.
+          */
+         bld.MOV(msg_end, fs_reg(0.0f));
+         msg_end = offset(msg_end, bld, 1);
+      }
+
+      bld.MOV(msg_end, shadow_c);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = msg_begin.reg;
+   inst->mlen = msg_end.reg - msg_begin.reg;
+   inst->header_size = 1;
+}
+
 static void
 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                 fs_reg coordinate,
@@ -3725,7 +3829,9 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
                                       sampler, offset_value,
                                       coord_components, grad_components);
    } else {
-      assert(!"Not implemented");
+      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sampler,
+                                      coord_components, grad_components);
    }
 }
 

From 4be99438e6e40280f9dc071882ce3bfbfabadb4a Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 21:19:52 +0300
Subject: [PATCH 0744/1208] i965/fs: Hook up SIMD lowering to handle texturing
 opcodes of unsupported width.

This should match the set of cases in which we currently call fail()
or no16() from the emit_texture_*() methods and the ones in which
emit_texture_gen4() enables the SIMD16 workaround.

Hint for reviewers: It's not a big deal if I happen to have missed
some case here, it will just lead to an assertion failure down the
road which is easily fixable, however being stricter than necessary
won't cause any visible breakage, it would just decrease performance
silently due to the unnecessary message splitting, so feel free to
double-check that all cases listed here already cause a SIMD8/16
fall-back with the current texturing code -- You may want to skip over
the Gen5-6 cases though if you don't have pencil and paper at hand.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 33 ++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 77d4a7d7b66..6d6de3bc677 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3935,6 +3935,39 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       /* Dual-source FB writes are unsupported in SIMD16 mode. */
       return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
 
+   case SHADER_OPCODE_TXD_LOGICAL:
+      /* TXD is unsupported in SIMD16 mode. */
+      return 8;
+
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+      /* gather4_po_c is unsupported in SIMD16 mode. */
+      const fs_reg &shadow_c = inst->src[1];
+      return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+   }
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL: {
+      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+       * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
+       * mode because the message exceeds the maximum length of 11.
+       */
+      const fs_reg &shadow_c = inst->src[1];
+      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
+         return 16;
+      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
+         return 8;
+      else
+         return inst->exec_size;
+   }
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+       * messages.  Use SIMD16 instead.
+       */
+      if (devinfo->gen == 4)
+         return 16;
+      else
+         return inst->exec_size;
+
    default:
       return inst->exec_size;
    }

From ba78a5007171afaa5f2d76d71be131f01a5b5023 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 16:07:45 +0300
Subject: [PATCH 0745/1208] i965/fs: Reimplement emit_texture() in terms of
 logical send messages.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 66 +++++++++++++++-----
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 5977f6de5a7..98e75d6c6c6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -861,6 +861,14 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       }
    }
 
+   if (op == ir_query_levels) {
+      /* textureQueryLevels() is implemented in terms of TXS so we need to
+       * pass a valid LOD argument.
+       */
+      assert(lod.file == BAD_FILE);
+      lod = fs_reg(0u);
+   }
+
    if (coordinate.file != BAD_FILE) {
       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
@@ -873,26 +881,50 @@ fs_visitor::emit_texture(ir_texture_opcode op,
     * samples, so don't worry about them.
     */
    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
+   const fs_reg srcs[] = {
+      coordinate, shadow_c, lod, lod2,
+      sample_index, mcs, sampler_reg, offset_value,
+      fs_reg(coord_components), fs_reg(grad_components)
+   };
+   enum opcode opcode;
 
-   if (devinfo->gen >= 7) {
-      inst = emit_texture_gen7(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sample_index, mcs, sampler_reg,
-                               offset_value);
-   } else if (devinfo->gen >= 5) {
-      inst = emit_texture_gen5(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sample_index, sampler,
-                               offset_value.file != BAD_FILE);
-   } else if (dispatch_width == 16) {
-      inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
-                                      shadow_c, lod, sampler);
-   } else {
-      inst = emit_texture_gen4(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sampler);
+   switch (op) {
+   case ir_tex:
+      opcode = SHADER_OPCODE_TEX_LOGICAL;
+      break;
+   case ir_txb:
+      opcode = FS_OPCODE_TXB_LOGICAL;
+      break;
+   case ir_txl:
+      opcode = SHADER_OPCODE_TXL_LOGICAL;
+      break;
+   case ir_txd:
+      opcode = SHADER_OPCODE_TXD_LOGICAL;
+      break;
+   case ir_txf:
+      opcode = SHADER_OPCODE_TXF_LOGICAL;
+      break;
+   case ir_txf_ms:
+      opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+      break;
+   case ir_txs:
+   case ir_query_levels:
+      opcode = SHADER_OPCODE_TXS_LOGICAL;
+      break;
+   case ir_lod:
+      opcode = SHADER_OPCODE_LOD_LOGICAL;
+      break;
+   case ir_tg4:
+      opcode = (offset_value.file != BAD_FILE && offset_value.file != IMM ?
+                SHADER_OPCODE_TG4_OFFSET_LOGICAL : SHADER_OPCODE_TG4_LOGICAL);
+      break;
+   default:
+      unreachable("Invalid texture opcode.");
    }
 
+   inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+   inst->regs_written = 4 * dispatch_width / 8;
+
    if (shadow_c.file != BAD_FILE)
       inst->shadow_compare = true;
 

From 59979b133dd16bf46803f87e78677eba944cc757 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 17 Jul 2015 18:23:31 +0300
Subject: [PATCH 0746/1208] i965/fs: Reimplement emit_mcs_fetch() in terms of
 logical sends.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  3 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 36 +++++++-------------
 2 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index bd93a6a3746..2de97eb25e2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -243,7 +243,8 @@ public:
                      uint32_t sampler,
                      fs_reg sampler_reg,
                      int texunit);
-   fs_reg emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler);
+   fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                         const fs_reg &sampler);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
    void resolve_source_modifiers(fs_reg *src);
    void emit_discard_jump();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 98e75d6c6c6..0cfce9f3640 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -797,31 +797,21 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 
 /* Sample from the MCS surface attached to this multisample texture. */
 fs_reg
-fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
+fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                           const fs_reg &sampler)
 {
-   int reg_width = dispatch_width / 8;
-   fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
-                           BRW_REGISTER_TYPE_F);
-   fs_reg dest = vgrf(glsl_type::uvec4_type);
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
+   const fs_reg dest = vgrf(glsl_type::uvec4_type);
+   const fs_reg srcs[] = {
+      coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
+      sampler, fs_reg(), fs_reg(components), fs_reg(0)
+   };
+   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
+                            ARRAY_SIZE(srcs));
 
-   /* parameters are: u, v, r; missing parameters are treated as zero */
-   for (int i = 0; i < components; i++) {
-      sources[i] = vgrf(glsl_type::float_type);
-      bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, bld, 1);
-   }
-
-   bld.LOAD_PAYLOAD(payload, sources, components, 0);
-
-   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
-   inst->base_mrf = -1;
-   inst->mlen = components * reg_width;
-   inst->header_size = 0;
-   inst->regs_written = 4 * reg_width; /* we only care about one reg of
-                                        * response, but the sampler always
-                                        * writes 4/8
-                                        */
+   /* We only care about one reg of response, but the sampler always writes
+    * 4/8.
+    */
+   inst->regs_written = 4 * dispatch_width / 8;
 
    return dest;
 }

From bd0d6a9cce8b28357888bb261fac639e2833c51f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 24 Jul 2015 16:51:14 +0300
Subject: [PATCH 0747/1208] i965/fs: Remove the emit_texture_gen*() fs_visitor
 methods.

This is now dead code.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  21 -
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 608 -------------------
 2 files changed, 629 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 2de97eb25e2..6b75a8dbc3d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -208,27 +208,6 @@ public:
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
    fs_reg rescale_texcoord(fs_reg coordinate, int coord_components,
                            bool is_rect, uint32_t sampler, int texunit);
-   fs_inst *emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              uint32_t sampler);
-   fs_inst *emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
-                                     fs_reg coordinate, int vector_elements,
-                                     fs_reg shadow_c, fs_reg lod,
-                                     uint32_t sampler);
-   fs_inst *emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, uint32_t sampler,
-                              bool has_offset);
-   fs_inst *emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
-                              fs_reg offset_value);
    void emit_texture(ir_texture_opcode op,
                      const glsl_type *dest_type,
                      fs_reg coordinate, int components,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 0cfce9f3640..5f0549c3bc0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -77,614 +77,6 @@ fs_visitor::emit_vs_system_value(int location)
    return reg;
 }
 
-fs_inst *
-fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg dPdy, int grad_components,
-                              uint32_t sampler)
-{
-   int mlen;
-   int base_mrf = 1;
-   bool simd16 = false;
-   fs_reg orig_dst;
-
-   /* g0 header. */
-   mlen = 1;
-
-   if (shadow_c.file != BAD_FILE) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-      }
-
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present.
-       * the unused slots must be zeroed.
-       */
-      for (int i = coord_components; i < 3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
-      }
-      mlen += 3;
-
-      if (op == ir_tex) {
-	 /* There's no plain shadow compare message, so we use shadow
-	  * compare with a bias of 0.0.
-	  */
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
-	 mlen++;
-      } else if (op == ir_txb || op == ir_txl) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), lod);
-	 mlen++;
-      } else {
-         unreachable("Should not get here.");
-      }
-
-      bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c);
-      mlen++;
-   } else if (op == ir_tex) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-      }
-      /* zero the others. */
-      for (int i = coord_components; i<3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
-      }
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
-      mlen += 3;
-   } else if (op == ir_txd) {
-      fs_reg &dPdx = lod;
-
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-      }
-      /* the slots for u and v are always present, but r is optional */
-      mlen += MAX2(coord_components, 2);
-
-      /*  P   = u, v, r
-       * dPdx = dudx, dvdx, drdx
-       * dPdy = dudy, dvdy, drdy
-       *
-       * 1-arg: Does not exist.
-       *
-       * 2-arg: dudx   dvdx   dudy   dvdy
-       *        dPdx.x dPdx.y dPdy.x dPdy.y
-       *        m4     m5     m6     m7
-       *
-       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
-       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
-       *        m5     m6     m7     m8     m9     m10
-       */
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
-	 dPdx = offset(dPdx, bld, 1);
-      }
-      mlen += MAX2(grad_components, 2);
-
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
-	 dPdy = offset(dPdy, bld, 1);
-      }
-      mlen += MAX2(grad_components, 2);
-   } else if (op == ir_txs) {
-      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
-      simd16 = true;
-      bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
-      mlen += 2;
-   } else {
-      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
-       * instructions.  We'll need to do SIMD16 here.
-       */
-      simd16 = true;
-      assert(op == ir_txb || op == ir_txl || op == ir_txf);
-
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
-                 coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-      }
-
-      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
-       * be necessary for TXF (ld), but seems wise to do for all messages.
-       */
-      for (int i = coord_components; i < 3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
-      }
-
-      /* lod/bias appears after u/v/r. */
-      mlen += 6;
-
-      bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod);
-      mlen++;
-
-      /* The unused upper half. */
-      mlen++;
-   }
-
-   if (simd16) {
-      /* Now, since we're doing simd16, the return is 2 interleaved
-       * vec4s where the odd-indexed ones are junk. We'll need to move
-       * this weirdness around to the expected layout.
-       */
-      orig_dst = dst;
-      dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   default:
-      unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
-   inst->header_size = 1;
-   inst->regs_written = simd16 ? 8 : 4;
-
-   if (simd16) {
-      for (int i = 0; i < 4; i++) {
-         bld.MOV(orig_dst, dst);
-	 orig_dst = offset(orig_dst, bld, 1);
-	 dst = offset(dst, bld, 2);
-      }
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
-                                     fs_reg coordinate, int vector_elements,
-                                     fs_reg shadow_c, fs_reg lod,
-                                     uint32_t sampler)
-{
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
-   bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf || op == ir_txs;
-
-   if (has_lod && shadow_c.file != BAD_FILE)
-      no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
-
-   if (op == ir_txd)
-      no16("textureGrad unsupported in SIMD16.");
-
-   /* Copy the coordinates. */
-   for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(message, bld, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, bld, 1);
-   }
-
-   fs_reg msg_end = offset(message, bld, vector_elements);
-
-   /* Messages other than sample and ld require all three components */
-   if (vector_elements > 0 && (has_lod || shadow_c.file != BAD_FILE)) {
-      for (int i = vector_elements; i < 3; i++) {
-         bld.MOV(offset(message, bld, i), fs_reg(0.0f));
-      }
-      msg_end = offset(message, bld, 3);
-   }
-
-   if (has_lod) {
-      fs_reg msg_lod = retype(msg_end, op == ir_txf ?
-                              BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, bld, 1);
-   }
-
-   if (shadow_c.file != BAD_FILE) {
-      fs_reg msg_ref = offset(message, bld, 3 + has_lod);
-      bld.MOV(msg_ref, shadow_c);
-      msg_end = offset(msg_ref, bld, 1);
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB;     break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   default: unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = message.reg - 1;
-   inst->mlen = msg_end.reg - inst->base_mrf;
-   inst->header_size = 1;
-   inst->regs_written = 8;
-
-   return inst;
-}
-
-/* gen5's sampler has slots for u, v, r, array index, then optional
- * parameters like shadow comparitor or LOD bias.  If optional
- * parameters aren't present, those base slots are optional and don't
- * need to be included in the message.
- *
- * We don't fill in the unnecessary slots regardless, which may look
- * surprising in the disassembly.
- */
-fs_inst *
-fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int vector_elements,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, uint32_t sampler,
-                              bool has_offset)
-{
-   int reg_width = dispatch_width / 8;
-   unsigned header_size = 0;
-
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
-   fs_reg msg_coords = message;
-
-   if (has_offset) {
-      /* The offsets set up by the ir_texture visitor are in the
-       * m1 header, so we can't go headerless.
-       */
-      header_size = 1;
-      message.reg--;
-   }
-
-   for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, bld, 1);
-   }
-   fs_reg msg_end = offset(msg_coords, bld, vector_elements);
-   fs_reg msg_lod = offset(msg_coords, bld, 4);
-
-   if (shadow_c.file != BAD_FILE) {
-      fs_reg msg_shadow = msg_lod;
-      bld.MOV(msg_shadow, shadow_c);
-      msg_lod = offset(msg_shadow, bld, 1);
-      msg_end = msg_lod;
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex:
-      opcode = SHADER_OPCODE_TEX;
-      break;
-   case ir_txb:
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, bld, 1);
-
-      opcode = FS_OPCODE_TXB;
-      break;
-   case ir_txl:
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, bld, 1);
-
-      opcode = SHADER_OPCODE_TXL;
-      break;
-   case ir_txd: {
-      /**
-       *  P   =  u,    v,    r
-       * dPdx = dudx, dvdx, drdx
-       * dPdy = dudy, dvdy, drdy
-       *
-       * Load up these values:
-       * - dudx   dudy   dvdx   dvdy   drdx   drdy
-       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
-       */
-      msg_end = msg_lod;
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(msg_end, lod);
-         lod = offset(lod, bld, 1);
-         msg_end = offset(msg_end, bld, 1);
-
-         bld.MOV(msg_end, lod2);
-         lod2 = offset(lod2, bld, 1);
-         msg_end = offset(msg_end, bld, 1);
-      }
-
-      opcode = SHADER_OPCODE_TXD;
-      break;
-   }
-   case ir_txs:
-      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, bld, 1);
-
-      opcode = SHADER_OPCODE_TXS;
-      break;
-   case ir_query_levels:
-      msg_lod = msg_end;
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      msg_end = offset(msg_lod, bld, 1);
-
-      opcode = SHADER_OPCODE_TXS;
-      break;
-   case ir_txf:
-      msg_lod = offset(msg_coords, bld, 3);
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
-      msg_end = offset(msg_lod, bld, 1);
-
-      opcode = SHADER_OPCODE_TXF;
-      break;
-   case ir_txf_ms:
-      msg_lod = offset(msg_coords, bld, 3);
-      /* lod */
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      /* sample index */
-      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
-      msg_end = offset(msg_lod, bld, 2);
-
-      opcode = SHADER_OPCODE_TXF_CMS;
-      break;
-   case ir_lod:
-      opcode = SHADER_OPCODE_LOD;
-      break;
-   case ir_tg4:
-      opcode = SHADER_OPCODE_TG4;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = message.reg;
-   inst->mlen = msg_end.reg - message.reg;
-   inst->header_size = header_size;
-   inst->regs_written = 4 * reg_width;
-
-   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
-      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
-           " disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
-static bool
-is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
-{
-   if (devinfo->gen < 8 && !devinfo->is_haswell)
-      return false;
-
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
-                              fs_reg offset_value)
-{
-   int reg_width = dispatch_width / 8;
-   unsigned header_size = 0;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
-   for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
-      sources[i] = vgrf(glsl_type::float_type);
-   }
-   int length = 0;
-
-   if (op == ir_tg4 || offset_value.file != BAD_FILE ||
-       is_high_sampler(devinfo, sampler)) {
-      /* For general texture offsets (no txf workaround), we need a header to
-       * put them in.  Note that we're only reserving space for it in the
-       * message payload as it will be initialized implicitly by the
-       * generator.
-       *
-       * * ir4_tg4 needs to place its channel select in the header,
-       * for interaction with ARB_texture_swizzle
-       *
-       * The sampler index is only 4-bits, so for larger sampler numbers we
-       * need to offset the Sampler State Pointer in the header.
-       */
-      header_size = 1;
-      sources[0] = fs_reg();
-      length++;
-   }
-
-   if (shadow_c.file != BAD_FILE) {
-      bld.MOV(sources[length], shadow_c);
-      length++;
-   }
-
-   bool has_nonconstant_offset =
-      offset_value.file != BAD_FILE && offset_value.file != IMM;
-   bool coordinate_done = false;
-
-   /* The sampler can only meaningfully compute LOD for fragment shader
-    * messages. For all other stages, we change the opcode to ir_txl and
-    * hardcode the LOD to 0.
-    */
-   if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
-      op = ir_txl;
-      lod = fs_reg(0.0f);
-   }
-
-   /* Set up the LOD info */
-   switch (op) {
-   case ir_tex:
-   case ir_lod:
-      break;
-   case ir_txb:
-      bld.MOV(sources[length], lod);
-      length++;
-      break;
-   case ir_txl:
-      bld.MOV(sources[length], lod);
-      length++;
-      break;
-   case ir_txd: {
-      no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
-
-      /* Load dPdx and the coordinate together:
-       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
-       */
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(sources[length], coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-	 length++;
-
-         /* For cube map array, the coordinate is (u,v,r,ai) but there are
-          * only derivatives for (u, v, r).
-          */
-         if (i < grad_components) {
-            bld.MOV(sources[length], lod);
-            lod = offset(lod, bld, 1);
-            length++;
-
-            bld.MOV(sources[length], lod2);
-            lod2 = offset(lod2, bld, 1);
-            length++;
-         }
-      }
-
-      coordinate_done = true;
-      break;
-   }
-   case ir_txs:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
-      length++;
-      break;
-   case ir_query_levels:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      length++;
-      break;
-   case ir_txf:
-      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
-       * On Gen9 they are u, v, lod, r
-       */
-
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, bld, 1);
-      length++;
-
-      if (devinfo->gen >= 9) {
-         if (coord_components >= 2) {
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-            coordinate = offset(coordinate, bld, 1);
-         }
-         length++;
-      }
-
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
-      length++;
-
-      for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-	 coordinate = offset(coordinate, bld, 1);
-	 length++;
-      }
-
-      coordinate_done = true;
-      break;
-   case ir_txf_ms:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
-      length++;
-
-      /* data from the multisample control surface */
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
-      length++;
-
-      /* there is no offsetting for this message; just copy in the integer
-       * texture coordinates
-       */
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-         coordinate = offset(coordinate, bld, 1);
-         length++;
-      }
-
-      coordinate_done = true;
-      break;
-   case ir_tg4:
-      if (has_nonconstant_offset) {
-         if (shadow_c.file != BAD_FILE)
-            no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
-
-         /* More crazy intermixing */
-         for (int i = 0; i < 2; i++) { /* u, v */
-            bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, bld, 1);
-            length++;
-         }
-
-         for (int i = 0; i < 2; i++) { /* offu, offv */
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
-            offset_value = offset(offset_value, bld, 1);
-            length++;
-         }
-
-         if (coord_components == 3) { /* r if present */
-            bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, bld, 1);
-            length++;
-         }
-
-         coordinate_done = true;
-      }
-      break;
-   }
-
-   /* Set up the coordinate (except for cases where it was done above) */
-   if (!coordinate_done) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(sources[length], coordinate);
-         coordinate = offset(coordinate, bld, 1);
-         length++;
-      }
-   }
-
-   int mlen;
-   if (reg_width == 2)
-      mlen = length * reg_width - header_size;
-   else
-      mlen = length * reg_width;
-
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_F);
-   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
-
-   /* Generate the SEND */
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
-   case ir_lod: opcode = SHADER_OPCODE_LOD; break;
-   case ir_tg4:
-      if (has_nonconstant_offset)
-         opcode = SHADER_OPCODE_TG4_OFFSET;
-      else
-         opcode = SHADER_OPCODE_TG4;
-      break;
-   default:
-      unreachable("not reached");
-   }
-   fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler);
-   inst->base_mrf = -1;
-   inst->mlen = mlen;
-   inst->header_size = header_size;
-   inst->regs_written = 4 * reg_width;
-
-   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
-      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
-           " disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
 fs_reg
 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
                              bool is_rect, uint32_t sampler, int texunit)

From a0c02d2bbb765b0e997ad524d8e51838e529d9c0 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sun, 28 Jun 2015 21:04:17 +0300
Subject: [PATCH 0748/1208] i965: Define the setup_vector_uniform_values()
 backend_visitor interface.

This cleans up the VEC4 implementation of setup_uniform_values()
somewhat and will avoid duplication of the image uniform upload code
by having a common interface to upload a vector of uniforms on either
back-end.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp          | 12 ++++++
 src/mesa/drivers/dri/i965/brw_fs.h            |  4 ++
 src/mesa/drivers/dri/i965/brw_shader.h        |  4 ++
 src/mesa/drivers/dri/i965/brw_vec4.h          |  3 ++
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 40 ++++++++++---------
 5 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 6d6de3bc677..289757f1e28 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -897,6 +897,18 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->param_size = v->param_size;
 }
 
+void
+fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[uniforms++] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[uniforms++] = &zero;
+}
+
 fs_reg *
 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                          bool origin_upper_left)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 6b75a8dbc3d..822cd2736c4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -296,6 +296,10 @@ public:
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
    struct brw_reg interp_reg(int location, int channel);
+
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n);
+
    int implied_mrf_writes(fs_inst *inst);
 
    virtual void dump_instructions();
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index b2c1a0b8d69..925072f1349 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "main/compiler.h"
 #include "glsl/ir.h"
+#include "program/prog_parameter.h"
 
 #ifdef __cplusplus
 #include "brw_ir_allocator.h"
@@ -268,6 +269,9 @@ public:
    void assign_common_binding_table_offsets(uint32_t next_binding_table_offset);
 
    virtual void invalidate_live_intervals() = 0;
+
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n) = 0;
 };
 
 uint32_t brw_texture_offset(int *offsets, unsigned num_components);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 7bf027a0306..bf6183a74a6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -176,9 +176,12 @@ public:
    void fail(const char *msg, ...);
 
    void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n);
    void setup_uniform_values(ir_variable *ir);
    void setup_builtin_uniform_values(ir_variable *ir);
    int setup_uniforms(int payload_reg);
+
    bool reg_allocate_trivial();
    bool reg_allocate();
    void evaluate_spill_costs(float *spill_costs, bool *no_spill);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 6fee798038c..205bca1d1a8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -670,6 +670,21 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    this->type = brw_type_for_base_type(type);
 }
 
+void
+vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
+                                          unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[4 * uniforms + i] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[4 * uniforms + i] = &zero;
+
+   uniform_vector_size[uniforms++] = n;
+}
+
 /* Our support for uniforms is piggy-backed on the struct
  * gl_fragment_program, because that's where the values actually
  * get stored, rather than in some global gl_shader_program uniform
@@ -699,26 +714,13 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
          continue;
       }
 
-      gl_constant_value *components = storage->storage;
-      unsigned vector_count = (MAX2(storage->array_elements, 1) *
-                               storage->type->matrix_columns);
+      const unsigned vector_count = (MAX2(storage->array_elements, 1) *
+                                     storage->type->matrix_columns);
+      const unsigned vector_size = storage->type->vector_elements;
 
-      for (unsigned s = 0; s < vector_count; s++) {
-         assert(uniforms < uniform_array_size);
-         uniform_vector_size[uniforms] = storage->type->vector_elements;
-
-         int i;
-         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
-            stage_prog_data->param[uniforms * 4 + i] = components;
-            components++;
-         }
-         for (; i < 4; i++) {
-            static gl_constant_value zero = { 0.0 };
-            stage_prog_data->param[uniforms * 4 + i] = &zero;
-         }
-
-         uniforms++;
-      }
+      for (unsigned s = 0; s < vector_count; s++)
+         setup_vector_uniform_values(&storage->storage[s * vector_size],
+                                     vector_size);
    }
 }
 

From 3af2623da5167aa686bcb2cff01d27058a507026 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 20 Jul 2015 17:38:15 +0300
Subject: [PATCH 0749/1208] i965: Lift the constness restriction on surface
 indices passed to untyped ops.

v2: Update NIR atomic intrinsic handling too (Ken).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp   | 8 ++------
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp         | 3 +++
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 8 ++------
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp   | 2 ++
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 811fb738058..c86ca043b63 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1989,19 +1989,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud,
                             inst->mlen, !inst->dst.is_null());
-         brw_mark_surface_used(prog_data, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1],
                                   inst->mlen, src[2].dw1.ud);
-         brw_mark_surface_used(prog_data, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 33469d4ad69..f62acbc3a78 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1276,6 +1276,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          default:
             unreachable("Unreachable");
       }
+
+      /* Mark the surface as used. */
+      brw_mark_surface_used(stage_prog_data, surf_index);
       break;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index d2de2f0be25..0483afc8fe0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1465,19 +1465,15 @@ vec4_generator::generate_code(const cfg_t *cfg)
          break;
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
                             !inst->dst.is_null());
-         brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                   src[2].dw1.ud);
-         brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 205bca1d1a8..bf17fe6baa7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2439,6 +2439,8 @@ vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                           src_reg(), src_reg());
    }
+
+   brw_mark_surface_used(stage_prog_data, surf_index);
 }
 
 void

From 7a594a95a930f1658062e4d86d0f37d491b372b3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 21 Jul 2015 18:45:32 +0300
Subject: [PATCH 0750/1208] i965/fs: Define logical typed and untyped surface
 opcodes.

Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects its arguments
separately as individual sources, like:

 typed_surface_write_logical null, coordinates, source, surface,
                                    num_coordinates, num_components

This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mainly as
documentation for their source registers.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h  | 20 +++++
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 93 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp | 16 ++++
 3 files changed, 129 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 373d6b8f8ab..f595366eeac 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -963,13 +963,33 @@ enum opcode {
 
    SHADER_OPCODE_SHADER_TIME_ADD,
 
+   /**
+    * Typed and untyped surface access opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources:
+    *
+    * Source 0: [required] Surface coordinates.
+    * Source 1: [optional] Operation source.
+    * Source 2: [required] Surface index.
+    * Source 3: [required] Number of coordinate components (as UD immediate).
+    * Source 4: [required] Opcode-specific control immediate, same as source 2
+    *                      of the matching non-LOGICAL opcode.
+    */
    SHADER_OPCODE_UNTYPED_ATOMIC,
+   SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+   SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+   SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
 
    SHADER_OPCODE_TYPED_ATOMIC,
+   SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
    SHADER_OPCODE_TYPED_SURFACE_READ,
+   SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+   SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
 
    SHADER_OPCODE_MEMORY_FENCE,
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 289757f1e28..2af7a892ca4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -711,6 +711,49 @@ fs_inst::components_read(unsigned i) const
       else
          return 1;
 
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      assert(src[3].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source (ignored for reads). */
+      else if (i == 1)
+         return 0;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1)
+         return src[4].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1 && op == BRW_AOP_CMPWR)
+         return 2;
+      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+                          op == BRW_AOP_PREDEC))
+         return 0;
+      else
+         return 1;
+   }
+
    default:
       return 1;
    }
@@ -3847,6 +3890,20 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    }
 }
 
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Get the logical send arguments. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const fs_reg &dims = inst->src[3];
+   const fs_reg &arg = inst->src[4];
+
+   assert(!"Not implemented");
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -3914,6 +3971,42 @@ fs_visitor::lower_logical_sends()
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
          break;
 
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    fs_reg());
+         break;
+
       default:
          continue;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 9e3a9740d74..58587b24403 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -619,16 +619,28 @@ brw_instruction_name(enum opcode op)
 
    case SHADER_OPCODE_UNTYPED_ATOMIC:
       return "untyped_atomic";
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+      return "untyped_atomic_logical";
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
       return "untyped_surface_read";
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+      return "untyped_surface_read_logical";
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
       return "untyped_surface_write";
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+      return "untyped_surface_write_logical";
    case SHADER_OPCODE_TYPED_ATOMIC:
       return "typed_atomic";
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+      return "typed_atomic_logical";
    case SHADER_OPCODE_TYPED_SURFACE_READ:
       return "typed_surface_read";
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      return "typed_surface_read_logical";
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
       return "typed_surface_write";
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return "typed_surface_write_logical";
    case SHADER_OPCODE_MEMORY_FENCE:
       return "memory_fence";
 
@@ -1181,10 +1193,14 @@ backend_instruction::has_side_effects() const
 {
    switch (opcode) {
    case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case FS_OPCODE_FB_WRITE:

From 086d29f4d747bbcfe37beeb18ba77fb2cb84dbdc Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 18 Jul 2015 16:16:19 +0300
Subject: [PATCH 0751/1208] i965/fs: Hook up SIMD lowering to unroll surface
 instructions of unsupported width.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2af7a892ca4..7aa77ff6591 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4073,6 +4073,11 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       else
          return inst->exec_size;
 
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8;
+
    default:
       return inst->exec_size;
    }

From 3352724dfa4eb5c93290db92ae99d26d9b89e630 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 14 Jul 2015 18:42:57 +0300
Subject: [PATCH 0752/1208] i965/fs: Implement lowering of logical surface
 instructions.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 63 ++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7aa77ff6591..a9e8dc47cfc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3890,6 +3890,20 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    }
 }
 
+/**
+ * Initialize the header present in some typed and untyped surface
+ * messages.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   fs_builder ubld = bld.exec_all().group(8, 0);
+   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   ubld.MOV(dst, fs_reg(0));
+   ubld.MOV(component(dst, 7), sample_mask);
+   return dst;
+}
+
 static void
 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                            const fs_reg &sample_mask)
@@ -3898,10 +3912,43 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
    const fs_reg &addr = inst->src[0];
    const fs_reg &src = inst->src[1];
    const fs_reg &surface = inst->src[2];
-   const fs_reg &dims = inst->src[3];
+   const UNUSED fs_reg &dims = inst->src[3];
    const fs_reg &arg = inst->src[4];
 
-   assert(!"Not implemented");
+   /* Calculate the total number of components of the payload. */
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+   const unsigned sz = header_sz + addr_sz + src_sz;
+
+   /* Allocate space for the payload. */
+   fs_reg *const components = new fs_reg[sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+   unsigned n = 0;
+
+   /* Construct the payload. */
+   if (header_sz)
+      components[n++] = emit_surface_header(bld, sample_mask);
+
+   for (unsigned i = 0; i < addr_sz; i++)
+      components[n++] = offset(addr, bld, i);
+
+   for (unsigned i = 0; i < src_sz; i++)
+      components[n++] = offset(src, bld, i);
+
+   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+   /* Update the original instruction. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] components;
 }
 
 bool
@@ -3974,37 +4021,37 @@ fs_visitor::lower_logical_sends()
       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
-                                    fs_reg());
+                                    fs_reg(0xffff));
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
          break;
 
       case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_UNTYPED_ATOMIC,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_TYPED_SURFACE_READ,
-                                    fs_reg());
+                                    fs_reg(0xffff));
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_TYPED_SURFACE_WRITE,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
          break;
 
       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_TYPED_ATOMIC,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
          break;
 
       default:

From 03846696ce2deaaaff42b2acd7745b51a7f115f2 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 15:39:03 +0300
Subject: [PATCH 0753/1208] i965/fs: Handle zero-size allocations in
 fs_builder::vgrf().

This will be handy to avoid some ugly ternary operators in the next
patch, like:
 fs_reg reg = (size == 0 ? null_reg_ud() : vgrf(..., size));

Because a zero-size register allocation is guaranteed not to ever be
read or written we can just return the null register.  Another
possibility would be to actually allocate a zero-size VGRF what would
involve defining a zero-size register class in the register allocator
and a considerable amount of churn.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index eea1eae0129..e7d5f8a381d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -160,10 +160,13 @@ namespace brw {
       dst_reg
       vgrf(enum brw_reg_type type, unsigned n = 1) const
       {
-         return dst_reg(GRF, shader->alloc.allocate(
-                           DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
-                                        REG_SIZE)),
-                        type);
+         if (n > 0)
+            return dst_reg(GRF, shader->alloc.allocate(
+                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
+                                           REG_SIZE)),
+                           type);
+         else
+            return retype(null_reg_ud(), type);
       }
 
       /**

From 1aab58f39450213ea2ac43549eefb8acd1e6584a Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 30 Apr 2015 19:31:44 +0300
Subject: [PATCH 0754/1208] i965/fs: Import surface message builder helper
 functions.

Implement helper functions that can be used to construct and send
untyped and typed surface read, write and atomic messages to the
shared dataport unit easily.

v2: Drop VEC4 suport.
v3: Reimplement in terms of logical send opcodes.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources    |   2 +
 .../dri/i965/brw_fs_surface_builder.cpp       | 163 ++++++++++++++++++
 .../drivers/dri/i965/brw_fs_surface_builder.h |  69 ++++++++
 3 files changed, 234 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
 create mode 100644 src/mesa/drivers/dri/i965/brw_fs_surface_builder.h

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 5a33aacbc23..e861c2ccad5 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -60,6 +60,8 @@ i965_FILES = \
 	brw_fs_register_coalesce.cpp \
 	brw_fs_saturate_propagation.cpp \
 	brw_fs_sel_peephole.cpp \
+	brw_fs_surface_builder.cpp \
+	brw_fs_surface_builder.h \
 	brw_fs_vector_splitting.cpp \
 	brw_fs_visitor.cpp \
 	brw_gs.c \
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
new file mode 100644
index 00000000000..cac958d49c1
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs_surface_builder.h"
+#include "brw_fs.h"
+
+using namespace brw;
+
+namespace brw {
+   namespace surface_access {
+      namespace {
+         /**
+          * Generate a logical send opcode for a surface message and return
+          * the result.
+          */
+         fs_reg
+         emit_send(const fs_builder &bld, enum opcode opcode,
+                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
+                   unsigned dims, unsigned arg, unsigned rsize,
+                   brw_predicate pred = BRW_PREDICATE_NONE)
+         {
+            /* Reduce the dynamically uniform surface index to a single
+             * scalar.
+             */
+            const fs_reg usurface = bld.emit_uniformize(surface);
+            const fs_reg srcs[] = {
+               addr, src, usurface, fs_reg(dims), fs_reg(arg)
+            };
+            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
+            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+
+            inst->regs_written = rsize * bld.dispatch_width() / 8;
+            inst->predicate = pred;
+            return dst;
+         }
+      }
+
+      /**
+       * Emit an untyped surface read opcode.  \p dims determines the number
+       * of components of the address and \p size the number of components of
+       * the returned value.
+       */
+      fs_reg
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred)
+      {
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+                          addr, fs_reg(), surface, dims, size, size, pred);
+      }
+
+      /**
+       * Emit an untyped surface write opcode.  \p dims determines the number
+       * of components of the address and \p size the number of components of
+       * the argument.
+       */
+      void
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred)
+      {
+         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, 0, pred);
+      }
+
+      /**
+       * Emit an untyped surface atomic opcode.  \p dims determines the number
+       * of components of the address and \p rsize the number of components of
+       * the returned value (either zero or one).
+       */
+      fs_reg
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred)
+      {
+         /* FINISHME: Factor out this frequently recurring pattern into a
+          * helper function.
+          */
+         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const fs_reg srcs[] = { src0, src1 };
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+                          addr, tmp, surface, dims, op, rsize, pred);
+      }
+
+      /**
+       * Emit a typed surface read opcode.  \p dims determines the number of
+       * components of the address and \p size the number of components of the
+       * returned value.
+       */
+      fs_reg
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size)
+      {
+         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+                          addr, fs_reg(), surface, dims, size, size);
+      }
+
+      /**
+       * Emit a typed surface write opcode.  \p dims determines the number of
+       * components of the address and \p size the number of components of the
+       * argument.
+       */
+      void
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size)
+      {
+         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, 0);
+      }
+
+      /**
+       * Emit a typed surface atomic opcode.  \p dims determines the number of
+       * components of the address and \p rsize the number of components of
+       * the returned value (either zero or one).
+       */
+      fs_reg
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred)
+      {
+         /* FINISHME: Factor out this frequently recurring pattern into a
+          * helper function.
+          */
+         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const fs_reg srcs[] = { src0, src1 };
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+
+         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+                          addr, tmp, surface, dims, op, rsize);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
new file mode 100644
index 00000000000..264314b7724
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
@@ -0,0 +1,69 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_SURFACE_BUILDER_H
+#define BRW_FS_SURFACE_BUILDER_H
+
+#include "brw_fs_builder.h"
+#include "brw_context.h"
+
+namespace brw {
+   namespace surface_access {
+      fs_reg
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+
+      void
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size);
+
+      void
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size);
+
+      fs_reg
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+   }
+}
+#endif

From 854c4d8b37416d3e5593099a8e5441f3cf861173 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 5 May 2015 20:52:58 +0300
Subject: [PATCH 0755/1208] i965/fs: Revisit NIR atomic counter intrinsic
 translation.

Rewrite the NIR atomic counter intrinsics translation code making use
of the recently introduced surface builder.  This will allow the
removal of some of the functionality duplicated between the visitor
and surface builder.

v2: Drop VEC4 suport.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 49 ++++++++++++++++--------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index f62acbc3a78..d4812c55556 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -26,6 +26,7 @@
 #include "glsl/nir/glsl_to_nir.h"
 #include "program/prog_to_nir.h"
 #include "brw_fs.h"
+#include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 
 using namespace brw;
@@ -1257,28 +1258,42 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_atomic_counter_inc:
    case nir_intrinsic_atomic_counter_dec:
    case nir_intrinsic_atomic_counter_read: {
-      unsigned surf_index = prog_data->binding_table.abo_start +
-                            (unsigned) instr->const_index[0];
-      fs_reg offset = fs_reg(get_nir_src(instr->src[0]));
+      using namespace surface_access;
 
+      /* Get the arguments of the atomic intrinsic. */
+      const fs_reg offset = get_nir_src(instr->src[0]);
+      const unsigned surface = (stage_prog_data->binding_table.abo_start +
+                                instr->const_index[0]);
+      fs_reg tmp;
+
+      /* Emit a surface read or atomic op. */
       switch (instr->intrinsic) {
-         case nir_intrinsic_atomic_counter_inc:
-            emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
-                                fs_reg(), fs_reg());
-            break;
-         case nir_intrinsic_atomic_counter_dec:
-            emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
-                                fs_reg(), fs_reg());
-            break;
-         case nir_intrinsic_atomic_counter_read:
-            emit_untyped_surface_read(surf_index, dest, offset);
-            break;
-         default:
-            unreachable("Unreachable");
+      case nir_intrinsic_atomic_counter_read:
+         tmp = surface_access::emit_untyped_read(
+            bld, fs_reg(surface), offset, 1, 1);
+         break;
+
+      case nir_intrinsic_atomic_counter_inc:
+         tmp = surface_access::emit_untyped_atomic(
+            bld, fs_reg(surface), offset, fs_reg(),
+            fs_reg(), 1, 1, BRW_AOP_INC);
+         break;
+
+      case nir_intrinsic_atomic_counter_dec:
+         tmp = surface_access::emit_untyped_atomic(
+            bld, fs_reg(surface), offset, fs_reg(),
+            fs_reg(), 1, 1, BRW_AOP_PREDEC);
+         break;
+
+      default:
+         unreachable("Unreachable");
       }
 
+      /* Assign the result. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
+
       /* Mark the surface as used. */
-      brw_mark_surface_used(stage_prog_data, surf_index);
+      brw_mark_surface_used(stage_prog_data, surface);
       break;
    }
 

From ea0ac53f059c418d5797c495b87020f2ca2ec842 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 29 Jun 2015 16:50:49 +0300
Subject: [PATCH 0756/1208] i965/fs: Drop unused untyped surface read and
 atomic emit methods.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |   7 --
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp     |  13 +--
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 112 -------------------
 3 files changed, 5 insertions(+), 127 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 822cd2736c4..4749c47f224 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -286,13 +286,6 @@ public:
                         int shader_time_subindex,
                         fs_reg value);
 
-   void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                            fs_reg dst, fs_reg offset, fs_reg src0,
-                            fs_reg src1);
-
-   void emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
-                                  fs_reg offset);
-
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
    struct brw_reg interp_reg(int location, int channel);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index d4812c55556..3e8860bafc6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1269,20 +1269,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       /* Emit a surface read or atomic op. */
       switch (instr->intrinsic) {
       case nir_intrinsic_atomic_counter_read:
-         tmp = surface_access::emit_untyped_read(
-            bld, fs_reg(surface), offset, 1, 1);
+         tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
          break;
 
       case nir_intrinsic_atomic_counter_inc:
-         tmp = surface_access::emit_untyped_atomic(
-            bld, fs_reg(surface), offset, fs_reg(),
-            fs_reg(), 1, 1, BRW_AOP_INC);
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_INC);
          break;
 
       case nir_intrinsic_atomic_counter_dec:
-         tmp = surface_access::emit_untyped_atomic(
-            bld, fs_reg(surface), offset, fs_reg(),
-            fs_reg(), 1, 1, BRW_AOP_PREDEC);
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_PREDEC);
          break;
 
       default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 5f0549c3bc0..111db8c4323 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -530,118 +530,6 @@ fs_visitor::try_replace_with_sel()
    return false;
 }
 
-void
-fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                                fs_reg dst, fs_reg offset, fs_reg src0,
-                                fs_reg src1)
-{
-   int reg_width = dispatch_width / 8;
-   int length = 0;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
-
-   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   /* Initialize the sample mask in the message header. */
-   bld.exec_all().MOV(sources[0], fs_reg(0u));
-
-   if (stage == MESA_SHADER_FRAGMENT) {
-      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         bld.exec_all()
-            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
-      } else {
-         bld.exec_all()
-            .MOV(component(sources[0], 7),
-                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
-      }
-   } else {
-      /* The execution mask is part of the side-band information sent together with
-       * the message payload to the data port. It's implicitly ANDed with the sample
-       * mask sent in the header to compute the actual set of channels that execute
-       * the atomic operation.
-       */
-      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      bld.exec_all()
-         .MOV(component(sources[0], 7), fs_reg(0xffffu));
-   }
-   length++;
-
-   /* Set the atomic operation offset. */
-   sources[1] = vgrf(glsl_type::uint_type);
-   bld.MOV(sources[1], offset);
-   length++;
-
-   /* Set the atomic operation arguments. */
-   if (src0.file != BAD_FILE) {
-      sources[length] = vgrf(glsl_type::uint_type);
-      bld.MOV(sources[length], src0);
-      length++;
-   }
-
-   if (src1.file != BAD_FILE) {
-      sources[length] = vgrf(glsl_type::uint_type);
-      bld.MOV(sources[length], src1);
-      length++;
-   }
-
-   int mlen = 1 + (length - 1) * reg_width;
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD);
-   bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
-
-   /* Emit the instruction. */
-   fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
-                            fs_reg(surf_index), fs_reg(atomic_op));
-   inst->mlen = mlen;
-}
-
-void
-fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
-                                      fs_reg offset)
-{
-   int reg_width = dispatch_width / 8;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
-
-   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   /* Initialize the sample mask in the message header. */
-   bld.exec_all()
-      .MOV(sources[0], fs_reg(0u));
-
-   if (stage == MESA_SHADER_FRAGMENT) {
-      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         bld.exec_all()
-            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
-      } else {
-         bld.exec_all()
-            .MOV(component(sources[0], 7),
-                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
-      }
-   } else {
-      /* The execution mask is part of the side-band information sent together with
-       * the message payload to the data port. It's implicitly ANDed with the sample
-       * mask sent in the header to compute the actual set of channels that execute
-       * the atomic operation.
-       */
-      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      bld.exec_all()
-         .MOV(component(sources[0], 7), fs_reg(0xffffu));
-   }
-
-   /* Set the surface read offset. */
-   sources[1] = vgrf(glsl_type::uint_type);
-   bld.MOV(sources[1], offset);
-
-   int mlen = 1 + reg_width;
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD);
-   fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
-
-   /* Emit the instruction. */
-   inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
-                   fs_reg(surf_index), fs_reg(1));
-   inst->mlen = mlen;
-}
-
 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
 void
 fs_visitor::emit_dummy_fs()

From 3e5a90792d14aeb599dd236f830e6e344b35c905 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 5 May 2015 22:12:03 +0300
Subject: [PATCH 0757/1208] i965/fs: Don't overwrite fs_visitor::uniforms and
 ::param_size during the SIMD16 run.

Image variables need to allocate additional uniform slots over
nir_shader::num_uniforms.  nir_setup_uniforms() overwrites the values
imported from the SIMD8 visitor and then exits early before entering
the nir_shader::uniforms loop, so image uniforms are never re-created.
Instead leave the imported values alone, they *must* be the same for
the uniform layout of both runs to be compatible.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3e8860bafc6..03a1ef5fc8d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -182,9 +182,11 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 void
 fs_visitor::nir_setup_uniforms(nir_shader *shader)
 {
-   uniforms = shader->num_uniforms;
    num_direct_uniforms = shader->num_direct_uniforms;
 
+   if (dispatch_width != 8)
+      return;
+
    /* We split the uniform register file in half.  The first half is
     * entirely direct uniforms.  The second half is indirect.
     */
@@ -192,8 +194,7 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
    if (shader->num_uniforms > num_direct_uniforms)
       param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms;
 
-   if (dispatch_width != 8)
-      return;
+   uniforms = shader->num_uniforms;
 
    if (shader_prog) {
       foreach_list_typed(nir_variable, var, node, &shader->uniforms) {

From b5f1a48e234d47b24df38cb562cffb8941d43795 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sun, 28 Jun 2015 21:15:28 +0300
Subject: [PATCH 0758/1208] i965/fs: Execute nir_setup_uniforms, _inputs and
 _outputs unconditionally.

Images take up zero uniform slots in the nir_shader::num_uniforms
calculation, but nir_setup_uniforms needs to be executed even if the
program has no non-image uniforms so the driver-specific image
parameters are uploaded.  nir_setup_uniforms is a no-op if there are
really no uniforms, so checking the num_uniform count is useless in
any case.

The nir_setup_inputs and _outputs changes shouldn't lead to any
functional change, they are just meant to preserve the symmetry
between them and nir_setup_uniforms.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 03a1ef5fc8d..6d22faa54c3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -39,21 +39,9 @@ fs_visitor::emit_nir_code()
    /* emit the arrays used for inputs and outputs - load/store intrinsics will
     * be converted to reads/writes of these arrays
     */
-
-   if (nir->num_inputs > 0) {
-      nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
-      nir_setup_inputs(nir);
-   }
-
-   if (nir->num_outputs > 0) {
-      nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
-      nir_setup_outputs(nir);
-   }
-
-   if (nir->num_uniforms > 0) {
-      nir_setup_uniforms(nir);
-   }
-
+   nir_setup_inputs(nir);
+   nir_setup_outputs(nir);
+   nir_setup_uniforms(nir);
    nir_emit_system_values(nir);
 
    /* get the main function and emit it */
@@ -67,6 +55,8 @@ fs_visitor::emit_nir_code()
 void
 fs_visitor::nir_setup_inputs(nir_shader *shader)
 {
+   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_inputs);
+
    foreach_list_typed(nir_variable, var, node, &shader->inputs) {
       enum brw_reg_type type = brw_type_for_base_type(var->type);
       fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
@@ -129,6 +119,8 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 {
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
+   nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_outputs);
+
    foreach_list_typed(nir_variable, var, node, &shader->outputs) {
       fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 

From 7cb60d770fc24bf00b6f7e5898cca1426e55c026 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 16:25:55 +0300
Subject: [PATCH 0759/1208] i965/fs: Translate memory barrier NIR intrinsics.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 6d22faa54c3..722e4e75a82 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1287,6 +1287,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_memory_barrier: {
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
+      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+         ->regs_written = 2;
+      break;
+   }
+
    case nir_intrinsic_load_front_face:
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
               *emit_frontfacing_interpolation());

From 6f7dea0b3212aa4ce49fcf9e94bf7aab130eeab2 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 17:54:46 +0300
Subject: [PATCH 0760/1208] i965/fs: Define a new fs_builder constructor taking
 an instruction as argument.

We have a number of optimization passes that repeat the same pattern
before inserting new instructions into the program based on some
previous instruction: They point the default builder at the original
instruction, then call exec_all() and group() to select the same
execution controls the original instruction had, and then maybe call
annotate() to clone the debug annotation from the original
instruction.

In fact an optimization pass missing any of these steps is likely to
be broken if the intention was to emit new code based on a preexisting
instruction, so let's make it easy for passes to do the right thing by
having an fs_builder constructor that automates the task of setting up
a builder to emit a given instruction provided as argument.

The following patches fix all cases I've found in which we weren't
explicitly initializing the execution controls of the emitted
instructions, and clean-up optimization passes which were already
doing the right thing to use the new constructor.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index e7d5f8a381d..12653d055d0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -63,6 +63,22 @@ namespace brw {
       {
       }
 
+      /**
+       * Construct an fs_builder that inserts instructions into \p shader
+       * before instruction \p inst in basic block \p block.  The default
+       * execution controls and debug annotation are initialized from the
+       * instruction passed as argument.
+       */
+      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
+         shader(shader), block(block), cursor(inst),
+         _dispatch_width(inst->exec_size),
+         _group(inst->force_sechalf ? 8 : 0),
+         force_writemask_all(inst->force_writemask_all)
+      {
+         annotation.str = inst->annotation;
+         annotation.ir = inst->ir;
+      }
+
       /**
        * Construct an fs_builder that inserts instructions before \p cursor in
        * basic block \p block, inheriting other code generation parameters

From 53077aee6670022e634a4775d8abbb59c458b7d7 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 19:09:45 +0300
Subject: [PATCH 0761/1208] i965/fs: Set the execution size of the MOVs
 correctly in opt_combine_constants().

The execution size was being left equal to the default of 8/16, which
AFAICT would have overwritten components other than the one we wanted
to initialize and could potentially have corrupted other registers.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index 0af5a915c9f..c182232285e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -277,7 +277,7 @@ fs_visitor::opt_combine_constants()
        */
       exec_node *n = (imm->inst ? imm->inst :
                       imm->block->last_non_control_flow_inst()->next);
-      const fs_builder ibld = bld.at(imm->block, n).exec_all();
+      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
       ibld.MOV(reg, fs_reg(imm->val));
       imm->reg = reg.reg;

From ce90227c71c8cbe6ca4317f1873ff12c70081c4c Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 17:55:49 +0300
Subject: [PATCH 0762/1208] i965/fs: Set execution controls correctly for
 lowered pull constant loads.

demote_pull_constants() was ignoring the execution size and channel
selects of the instruction that wanted the constant, which doesn't
matter for uniform pull constant loads because all channels get the
same scalar value, but it might for varying pull constant loads.  Fix
it by using the new fs_builder() constructor that takes care of
setting execution controls compatible with the instruction passed as
argument.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a9e8dc47cfc..50ea1b271d9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1924,8 +1924,7 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
-                                    .at(block, inst);
+         const fs_builder ibld(this, block, inst);
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
@@ -1940,8 +1939,9 @@ fs_visitor::demote_pull_constants()
             inst->src[i].reladdr = NULL;
             inst->src[i].stride = 1;
          } else {
+            const fs_builder ubld = ibld.exec_all().group(8, 0);
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                       dst, surf_index, offset);
             inst->src[i].set_smear(pull_index & 3);
          }

From ff463af436bcf07430807512c9f0bf0f627288ce Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:38:59 +0300
Subject: [PATCH 0763/1208] i965/fs: Set execution controls correctly in
 lower_integer_multiplication().

lower_integer_multiplication() was ignoring the execution controls of
the original MUL instruction.  Fix it by using the new fs_builder
constructor.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 50ea1b271d9..02a11d0d1ae 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3139,7 +3139,7 @@ fs_visitor::lower_integer_multiplication()
            inst->dst.type != BRW_REGISTER_TYPE_UD))
          continue;
 
-      const fs_builder ibld = bld.at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of

From e1f4724097d1074ec9afdc9ce9ad024add125923 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 19:27:30 +0300
Subject: [PATCH 0764/1208] i965/fs: Set execution controls explicitly in
 opt_peephole_sel().

Emit the SELs and MOVs with the same execution controls as the
original MOVs, and the CMP with the same execution controls as the IF.
Also explicitly check that the execution controls of any pair of MOVs
being folded into a SEL are compatible (which is almost always going
to be the case), since otherwise it would seem wrong to initialize the
builder object below from the then_mov instruction only.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
index 8660ec08b8f..d190d8eb6b4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
@@ -174,6 +174,9 @@ fs_visitor::opt_peephole_sel()
 
          /* Check that the MOVs are the right form. */
          if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
+             then_mov[i]->exec_size != else_mov[i]->exec_size ||
+             then_mov[i]->force_sechalf != else_mov[i]->force_sechalf ||
+             then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
              then_mov[i]->is_partial_write() ||
              else_mov[i]->is_partial_write() ||
              then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
@@ -192,14 +195,17 @@ fs_visitor::opt_peephole_sel()
       if (movs == 0)
          continue;
 
-      const fs_builder ibld = bld.at(block, if_inst);
-
       /* Emit a CMP if our IF used the embedded comparison */
-      if (devinfo->gen == 6 && if_inst->conditional_mod)
+      if (devinfo->gen == 6 && if_inst->conditional_mod) {
+         const fs_builder ibld(this, block, if_inst);
          ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
                   if_inst->conditional_mod);
+      }
 
       for (int i = 0; i < movs; i++) {
+         const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
+                                 .at(block, if_inst);
+
          if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
             ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
          } else {

From 09039f4bc120481219d01ed17e1552ca8ad66455 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 19:20:50 +0300
Subject: [PATCH 0765/1208] i965/fs: Initialize a builder explicitly in
 opt_peephole_predicated_break().

This wasn't taking into account the execution controls of the original
instruction, but it was most likely not a bug because control flow
instructions are typically full width.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../drivers/dri/i965/brw_fs_peephole_predicated_break.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
index d92d4bbd81d..b75f40ba5a1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
@@ -24,6 +24,8 @@
 #include "brw_fs.h"
 #include "brw_cfg.h"
 
+using namespace brw;
+
 /** @file brw_fs_peephole_predicated_break.cpp
  *
  * Loops are often structured as
@@ -85,9 +87,9 @@ fs_visitor::opt_peephole_predicated_break()
        * instruction to set the flag register.
        */
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         bld.at(if_block, if_inst)
-            .CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
-                 if_inst->conditional_mod);
+         const fs_builder ibld(this, if_block, if_inst);
+         ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                  if_inst->conditional_mod);
          jump_inst->predicate = BRW_PREDICATE_NORMAL;
       } else {
          jump_inst->predicate = if_inst->predicate;

From bfad71606a987f14f20d2c3607846648f8537f2b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:15:44 +0300
Subject: [PATCH 0766/1208] i965/fs: Set up the builder execution size
 explicitly in opt_sampler_eot().

opt_sampler_eot() was relying on the default builder to have the same
width as the sampler and FB write opcodes it was eliminating, the
channel selects didn't matter because the builder was only being used
to allocate registers, no new instructions were being emitted with it.
A future commit will change the width of the default builder what will
break this assumption, so initialize it explicitly here.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 02a11d0d1ae..71d372ce2d5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2258,7 +2258,8 @@ fs_visitor::opt_sampler_eot()
       return false;
 
    /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
    assert(fb_write->eot);
    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
 
@@ -2289,9 +2290,11 @@ fs_visitor::opt_sampler_eot()
    assert(!tex_inst->eot); /* We can't get here twice */
    assert((tex_inst->offset & (0xff << 24)) == 0);
 
+   const fs_builder ibld(this, block, tex_inst);
+
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2303,8 +2306,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 

From a0b192d3d9fa64f6f8bff5f1e456e40e72f4875e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 20:14:41 +0300
Subject: [PATCH 0767/1208] i965/fs: Don't rely on the default builder to
 create a null register in emit_spill.

It's not guaranteed to have the same width as the instruction
generating the spilled variable.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index f25f2ecce75..6a7ed64415d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -738,7 +738,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
 
    for (int i = 0; i < count / reg_size; i++) {
       fs_inst *spill_inst =
-         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
+         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, ibld.null_reg_f(), src);
       src.reg_offset += reg_size;
       spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
       spill_inst->mlen = 1 + reg_size; /* header, value */

From 930ebb258524762c765fa864ef7063bd8bb754a1 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:34:43 +0300
Subject: [PATCH 0768/1208] i965/fs: Switch lower_load_payload() to the
 fs_builder constructor from instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 71d372ce2d5..354c45f6ec0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3040,7 +3040,8 @@ fs_visitor::lower_load_payload()
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
 
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
@@ -3051,10 +3052,6 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
-
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't

From 992cda2c8a452ec86386a0f98eaf522afe206695 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:41:18 +0300
Subject: [PATCH 0769/1208] i965/fs: Switch lower_logical_sends() to the
 fs_builder constructor from instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 354c45f6ec0..b4468e8307e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3957,9 +3957,7 @@ fs_visitor::lower_logical_sends()
    bool progress = false;
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       switch (inst->opcode) {
       case FS_OPCODE_FB_WRITE_LOGICAL:

From 5e645e68d6672cac2872fa509fb22bc2581f4b67 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 19:18:51 +0300
Subject: [PATCH 0770/1208] i965/fs: Switch opt_cse() to the fs_builder
 constructor from instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index e33fe6a9802..a123ff2d89b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -180,8 +180,6 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
    int dst_width = inst->exec_size / 8;
-   const fs_builder ubld = bld.exec_all(inst->force_writemask_all)
-                              .group(inst->exec_size, inst->force_sechalf);
    fs_inst *copy;
 
    if (written > dst_width) {
@@ -204,11 +202,11 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
       }
       for (int i = header_size; i < sources; i++) {
          payload[i] = src;
-         src = offset(src, ubld, 1);
+         src = offset(src, bld, 1);
       }
-      copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+      copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
    } else {
-      copy = ubld.MOV(inst->dst, src);
+      copy = bld.MOV(inst->dst, src);
       copy->src[0].negate = negate;
    }
    assert(copy->regs_written == written);
@@ -258,13 +256,14 @@ fs_visitor::opt_cse_local(bblock_t *block)
              */
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
+               const fs_builder ibld = fs_builder(this, block, entry->generator)
+                                       .at(block, entry->generator->next);
                int written = entry->generator->regs_written;
 
                entry->tmp = fs_reg(GRF, alloc.allocate(written),
                                    entry->generator->dst.type);
 
-               create_copy_instr(bld.at(block, entry->generator->next),
-                                 entry->generator, entry->tmp, false);
+               create_copy_instr(ibld, entry->generator, entry->tmp, false);
 
                entry->generator->dst = entry->tmp;
             }
@@ -273,8 +272,9 @@ fs_visitor::opt_cse_local(bblock_t *block)
             if (!inst->dst.is_null()) {
                assert(inst->regs_written == entry->generator->regs_written);
                assert(inst->dst.type == entry->tmp.type);
+               const fs_builder ibld(this, block, inst);
 
-               create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate);
+               create_copy_instr(ibld, inst, entry->tmp, negate);
             }
 
             /* Set our iterator so that next time through the loop inst->next

From e42d2948d3c58b86d3770d296b96fafcd1218858 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 29 Jul 2015 15:37:52 +0300
Subject: [PATCH 0771/1208] Revert "pipe-loader: simplify
 pipe_loader_drm_probe"

This reverts commit a27ec5dc460b91dc44675f48cddbbb2631ee824f.  It
breaks the intended behaviour of pipe_loader_probe() with ndev==0 as
relied upon by clover to query the number of devices available to the
pipe loader in the system.

Acked-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index cc6d74a964b..1799df7e4c5 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -101,13 +101,12 @@ open_drm_render_node_minor(int minor)
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
 {
-   struct pipe_loader_device *dev;
    int i, j, fd;
 
    for (i = DRM_RENDER_NODE_MIN_MINOR, j = 0;
-        i <= DRM_RENDER_NODE_MAX_MINOR && j < ndev; i++) {
-
+        i <= DRM_RENDER_NODE_MAX_MINOR; i++) {
       fd = open_drm_render_node_minor(i);
+      struct pipe_loader_device *dev;
       if (fd < 0)
          continue;
 
@@ -116,7 +115,13 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
          continue;
       }
 
-      devs[j++] = dev;
+      if (j < ndev) {
+         devs[j] = dev;
+      } else {
+         close(fd);
+         dev->ops->release(&dev);
+      }
+      j++;
    }
 
    return j;

From ad75620863392b2164a415186087beb831ccfa4c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 29 Jul 2015 09:37:14 -0400
Subject: [PATCH 0772/1208] nvc0/ir: output base for reading is based on laneid

PFETCH retrieves the address for incoming vertices, not output vertices
in TCS. For output vertices, we must use the laneid as a base.

Fixes barrier piglit test, which was failing for entirely non-barrier
reasons, but rather that it was (a) trying to draw multiple patches and
(b) the incoming patch size was not the same as the outgoing patch size.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 88078b157fa..3c558e941bd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1296,6 +1296,7 @@ private:
 
    Value *shiftAddress(Value *);
    Value *getVertexBase(int s);
+   Value *getOutputBase(int s);
    DataArray *getArrayForFile(unsigned file, int idx);
    Value *fetchSrc(int s, int c);
    Value *acquireDst(int d, int c);
@@ -1525,6 +1526,28 @@ Converter::getVertexBase(int s)
    return vtxBase[s];
 }
 
+Value *
+Converter::getOutputBase(int s)
+{
+   assert(s < 5);
+   if (!(vtxBaseValid & (1 << s))) {
+      Value *offset = loadImm(NULL, tgsi.getSrc(s).getIndex(1));
+      if (tgsi.getSrc(s).isIndirect(1))
+         offset = mkOp2v(OP_ADD, TYPE_U32, getSSA(),
+                         fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL),
+                         offset);
+      vtxBaseValid |= 1 << s;
+      vtxBase[s] = mkOp2v(
+         OP_ADD, TYPE_U32, getSSA(),
+         mkOp2v(
+            OP_SUB, TYPE_U32, getSSA(),
+            mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)),
+            mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0))),
+         offset);
+   }
+   return vtxBase[s];
+}
+
 Value *
 Converter::fetchSrc(int s, int c)
 {
@@ -1539,6 +1562,8 @@ Converter::fetchSrc(int s, int c)
    if (src.is2D()) {
       switch (src.getFile()) {
       case TGSI_FILE_OUTPUT:
+         dimRel = getOutputBase(s);
+         break;
       case TGSI_FILE_INPUT:
          dimRel = getVertexBase(s);
          break;

From 9da9adcfd7df45a0a337e0fbf482f60ff5566499 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 29 Jul 2015 11:01:08 -0400
Subject: [PATCH 0773/1208] nvc0/ir: cache vertex out base so that we don't
 recompute again

The global CSE pass stinks and is unable to pull this out. Easy enough
to handle it here and avoid generating unnecessary special register
loads (which can allegedly be quite slow).

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 3c558e941bd..4847a0f3355 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1394,6 +1394,8 @@ private:
    Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP)
    uint8_t vtxBaseValid;
 
+   Value *outBase; // base address of vertex out patch (for TCP)
+
    Stack condBBs;  // fork BB, then else clause BB
    Stack joinBBs;  // fork BB, for inserting join ops on ENDIF
    Stack loopBBs;  // loop headers
@@ -1537,13 +1539,7 @@ Converter::getOutputBase(int s)
                          fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL),
                          offset);
       vtxBaseValid |= 1 << s;
-      vtxBase[s] = mkOp2v(
-         OP_ADD, TYPE_U32, getSSA(),
-         mkOp2v(
-            OP_SUB, TYPE_U32, getSSA(),
-            mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)),
-            mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0))),
-         offset);
+      vtxBase[s] = mkOp2v(OP_ADD, TYPE_U32, getSSA(), outBase, offset);
    }
    return vtxBase[s];
 }
@@ -3369,10 +3365,21 @@ Converter::run()
          clipVtx[c] = getScratch();
    }
 
-   if (prog->getType() == Program::TYPE_FRAGMENT) {
+   switch (prog->getType()) {
+   case Program::TYPE_TESSELLATION_CONTROL:
+      outBase = mkOp2v(
+         OP_SUB, TYPE_U32, getSSA(),
+         mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)),
+         mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0)));
+      break;
+   case Program::TYPE_FRAGMENT: {
       Symbol *sv = mkSysVal(SV_POSITION, 3);
       fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv);
       mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]);
+      break;
+   }
+   default:
+      break;
    }
 
    if (info->io.viewportId >= 0)

From 8b838fa9f01f7ee13fd9fc3e6545a677397f8023 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 10 Jul 2015 19:49:49 -0700
Subject: [PATCH 0774/1208] i965/cfg: Assert that cur_do/while/if pointers are
 non-NULL.

More.. like in commit 4d93a07c.
---
 src/mesa/drivers/dri/i965/brw_cfg.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index f1f230e3751..91d53eff5a7 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -208,6 +208,7 @@ cfg_t::cfg_t(exec_list *instructions)
          cur_else = cur;
 
 	 next = new_block();
+         assert(cur_if != NULL);
 	 cur_if->add_successor(mem_ctx, next);
 
 	 set_next_block(&cur, next, ip);
@@ -274,6 +275,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_do != NULL);
 	 cur->add_successor(mem_ctx, cur_do);
 
 	 next = new_block();
@@ -287,6 +289,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_while != NULL);
 	 cur->add_successor(mem_ctx, cur_while);
 
 	 next = new_block();

From eaba922582cfdccc7b198f9b23d8bd3c26197d03 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:28:39 +0300
Subject: [PATCH 0775/1208] i965/fs: Initialize a builder explicitly in the
 gen4 send dependency work-arounds.

Instead of relying on the default one.  This shouldn't lead to any
functional changes because DEP_RESOLVE_MOV overrides the execution
size of the instruction anyway and other execution controls are
irrelevant.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b4468e8307e..57e4dd783ea 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2827,7 +2827,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2843,7 +2844,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -2890,7 +2891,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2905,7 +2907,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 

From 4529916dfd227af6c4e151f45261db22157fe45f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:42:31 +0300
Subject: [PATCH 0776/1208] i965/fs: Don't set exec_all on instructions wider
 than the original in lower_simd_width.

This could have led to somewhat increased bandwidth usage for lowered
texturing instructions on Gen4 (which is the only case in which
lower_width may be greater than inst->exec_size).  After the previous
patches the invariant mentioned in the comment should no longer be
assumed by any of the other optimization and lowering passes, so the
exec_all() call shouldn't be necessary anymore.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 57e4dd783ea..4947f24dc44 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4163,10 +4163,15 @@ fs_visitor::lower_simd_width()
       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
 
       if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction. */
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original, set the
+          * execution size of the builder to the highest of both for now so
+          * we're sure that both cases can be handled.
+          */
          const fs_builder ibld = bld.at(block, inst)
                                     .exec_all(inst->force_writemask_all)
-                                    .group(inst->exec_size, inst->force_sechalf);
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
 
          /* Split the copies in chunks of the execution width of either the
           * original or the lowered instruction, whichever is lower.
@@ -4189,14 +4194,11 @@ fs_visitor::lower_simd_width()
             split_inst.exec_size = lower_width;
             split_inst.eot = inst->eot && i == n - 1;
 
-            /* Set exec_all if the lowered width is higher than the original
-             * to avoid breaking the compiler invariant that no control
-             * flow-masked instruction is wider than the shader's
-             * dispatch_width.  Then transform the sources and destination and
-             * emit the lowered instruction.
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
              */
-            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
-                                        .group(lower_width, i);
+            const fs_builder lbld = ibld.group(lower_width, i);
 
             for (unsigned j = 0; j < inst->sources; j++) {
                if (inst->src[j].file != BAD_FILE &&

From 02425d3ec2af6945a03583cadcaa5f3f330bbc0e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 18:51:01 +0300
Subject: [PATCH 0777/1208] i965/fs: Make the default builder 64-wide before
 entering the optimization loop.

Not a typo.  Replace the default builder with one of bogus width to
catch cases in which optimization passes assume that the default
dispatch width is good enough.  The execution controls of instructions
emitted during optimization should in general match the original code
that is being manipulated.  Many of the problems fixed in this series
were caught by the assertions introduced in this patch.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp       | 6 ++++--
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 4947f24dc44..565edeb401e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4671,9 +4671,11 @@ fs_visitor::optimize()
     * Ideally optimization passes wouldn't be part of the visitor so they
     * wouldn't have access to bld at all, but they do, so just in case some
     * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
     */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
 
    split_virtual_grfs();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 12653d055d0..34545eaa0fb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -176,6 +176,8 @@ namespace brw {
       dst_reg
       vgrf(enum brw_reg_type type, unsigned n = 1) const
       {
+         assert(dispatch_width() <= 32);
+
          if (n > 0)
             return dst_reg(GRF, shader->alloc.allocate(
                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
@@ -342,6 +344,7 @@ namespace brw {
       instruction *
       emit(instruction *inst) const
       {
+         assert(inst->exec_size <= 32);
          assert(inst->exec_size == dispatch_width() ||
                 force_writemask_all);
          assert(_group == 0 || _group == 8);

From 5c7fd670459ebff452adeec335c77854af903842 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 15 Jul 2015 20:54:46 -0700
Subject: [PATCH 0778/1208] glsl: Remove MSVC implementations of copysign and
 isnormal.

Non-Gallium parts of Mesa require MSVC 2013 which provides these.
---
 src/glsl/ir_constant_expression.cpp      | 14 +-------------
 src/glsl/nir/nir_constant_expressions.py | 14 +-------------
 2 files changed, 2 insertions(+), 26 deletions(-)

diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 171b8e95444..7a38fa42193 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -40,12 +40,7 @@
 #include "glsl_types.h"
 #include "program/hash_table.h"
 
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-static int isnormal(double x)
-{
-   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
-}
-#elif defined(__SUNPRO_CC) && !defined(isnormal)
+#if defined(__SUNPRO_CC) && !defined(isnormal)
 #include <ieeefp.h>
 static int isnormal(double x)
 {
@@ -53,13 +48,6 @@ static int isnormal(double x)
 }
 #endif
 
-#if defined(_MSC_VER)
-static double copysign(double x, double y)
-{
-   return _copysign(x, y);
-}
-#endif
-
 static float
 dot_f(ir_constant *op0, ir_constant *op1)
 {
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
index bf82fe533d6..8fd9b1039a7 100644
--- a/src/glsl/nir/nir_constant_expressions.py
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -31,12 +31,7 @@ template = """\
 #include "util/rounding.h" /* for _mesa_roundeven */
 #include "nir_constant_expressions.h"
 
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-static int isnormal(double x)
-{
-   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
-}
-#elif defined(__SUNPRO_CC)
+#if defined(__SUNPRO_CC)
 #include <ieeefp.h>
 static int isnormal(double x)
 {
@@ -44,13 +39,6 @@ static int isnormal(double x)
 }
 #endif
 
-#if defined(_MSC_VER)
-static double copysign(double x, double y)
-{
-   return _copysign(x, y);
-}
-#endif
-
 /**
  * Evaluate one component of packSnorm4x8.
  */

From ecc559218d0a544f8a5f878c500f125c2d588d82 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 15 Jul 2015 21:28:56 -0700
Subject: [PATCH 0779/1208] c99_math: Implement exp2f for MSVC.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 include/c99_math.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/c99_math.h b/include/c99_math.h
index 7ed7cc22176..0ca5a73b58a 100644
--- a/include/c99_math.h
+++ b/include/c99_math.h
@@ -140,6 +140,12 @@ llrintf(float f)
    return rounded;
 }
 
+static inline float
+exp2f(float f)
+{
+   return powf(2.0f, f);
+}
+
 #endif /* C99 */
 
 

From f8a647883a14694f1b758c12187b3f35b9d039a7 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sat, 11 Jul 2015 22:46:19 -0700
Subject: [PATCH 0780/1208] mesa: Use floats for viewport bounds.

ARB_viewport_array specifies that DEPTH_RANGE consists of double-
precision parameters (corresponding commit d4dc35987), and a preparatory
commit (6340e609a) added _mesa_get_viewport_xform() which returned
double-precision scale[3] and translate[3] vectors, even though X, Y,
Width, and Height were still floats.

All users of _mesa_get_viewport_xform() immediately convert the double
scale and translation vectors into floats (which were floats originally,
but were converted to doubles in _mesa_get_viewport_xform(), sigh).

i965 at least cannot consume doubles (see SF_CLIP_VIEWPORT). If we want
to pass doubles to hardware, we should have a different function that
does that.

Acked-by: Mathias Froehlich <Mathias.Froehlich@web.de>
---
 src/mesa/drivers/dri/i915/i915_state.c          |  2 +-
 src/mesa/drivers/dri/i965/brw_sf_state.c        |  2 +-
 src/mesa/drivers/dri/i965/gen6_viewport_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_viewport_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_viewport_state.c |  2 +-
 src/mesa/drivers/dri/r200/r200_state.c          |  2 +-
 src/mesa/drivers/dri/radeon/radeon_state.c      |  2 +-
 src/mesa/main/viewport.c                        | 14 +++++++-------
 src/mesa/main/viewport.h                        |  2 +-
 src/mesa/math/m_matrix.c                        |  4 ++--
 src/mesa/math/m_matrix.h                        |  4 ++--
 src/mesa/state_tracker/st_atom_viewport.c       |  2 +-
 src/mesa/tnl/t_context.c                        |  2 +-
 src/mesa/tnl/t_rasterpos.c                      |  2 +-
 14 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 5f10b840b1a..4c83073e692 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -402,7 +402,7 @@ void
 intelCalcViewport(struct gl_context * ctx)
 {
    struct intel_context *intel = intel_context(ctx);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
 
    _mesa_get_viewport_xform(ctx, 0, scale, translate);
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 5d9892214a9..3be6e4ab8e2 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -45,7 +45,7 @@ static void upload_sf_vp(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
    sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 7c8d8849f4e..11b9a360ced 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -101,7 +101,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
 
       /* _NEW_VIEWPORT */
       _mesa_get_viewport_xform(ctx, i, scale, translate);
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index b655205ec35..c75dc9964bf 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -53,7 +53,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       /* According to the "Vertex X,Y Clamping and Quantization" section of
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index 2d8eeb1f10f..2692ad55999 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -53,7 +53,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       /* _NEW_VIEWPORT: Viewport Matrix Elements */
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 2023ba09449..cca176d7f9b 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -1546,7 +1546,7 @@ void r200UpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    GLfloat y_scale, y_bias;
 
    if (render_to_fbo) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index 156afb6f6ac..74c1fc6c902 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -1354,7 +1354,7 @@ void radeonUpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0.0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    GLfloat y_scale, y_bias;
 
    if (render_to_fbo) {
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index 9917f2de299..7d8914291c3 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -443,12 +443,12 @@ _mesa_ClipControl(GLenum origin, GLenum depth)
  */
 void
 _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
-                         double scale[3], double translate[3])
+                         float scale[3], float translate[3])
 {
-   double x = ctx->ViewportArray[i].X;
-   double y = ctx->ViewportArray[i].Y;
-   double half_width = 0.5*ctx->ViewportArray[i].Width;
-   double half_height = 0.5*ctx->ViewportArray[i].Height;
+   float x = ctx->ViewportArray[i].X;
+   float y = ctx->ViewportArray[i].Y;
+   float half_width = 0.5f * ctx->ViewportArray[i].Width;
+   float half_height = 0.5f * ctx->ViewportArray[i].Height;
    double n = ctx->ViewportArray[i].Near;
    double f = ctx->ViewportArray[i].Far;
 
@@ -462,8 +462,8 @@ _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
       translate[1] = half_height + y;
    }
    if (ctx->Transform.ClipDepthMode == GL_NEGATIVE_ONE_TO_ONE) {
-      scale[2] = 0.5*(f - n);
-      translate[2] = 0.5*(n + f);
+      scale[2] = 0.5 * (f - n);
+      translate[2] = 0.5 * (n + f);
    } else {
       scale[2] = f - n;
       translate[2] = n;
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h
index 899dc2d0bcc..b0675db1096 100644
--- a/src/mesa/main/viewport.h
+++ b/src/mesa/main/viewport.h
@@ -73,6 +73,6 @@ _mesa_ClipControl(GLenum origin, GLenum depth);
 
 extern void
 _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
-                         double scale[3], double translate[3]);
+                         float scale[3], float translate[3]);
 
 #endif
diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c
index ecf564c0089..6a42c6c0ed2 100644
--- a/src/mesa/math/m_matrix.c
+++ b/src/mesa/math/m_matrix.c
@@ -1111,8 +1111,8 @@ _math_matrix_translate( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
  * Transforms Normalized Device Coords to window/Z values.
  */
 void
-_math_matrix_viewport(GLmatrix *m, const double scale[3],
-                      const double translate[3], double depthMax)
+_math_matrix_viewport(GLmatrix *m, const float scale[3],
+                      const float translate[3], double depthMax)
 {
    m->m[MAT_SX] = scale[0];
    m->m[MAT_TX] = translate[0];
diff --git a/src/mesa/math/m_matrix.h b/src/mesa/math/m_matrix.h
index 778d716dce7..c34d9e3022f 100644
--- a/src/mesa/math/m_matrix.h
+++ b/src/mesa/math/m_matrix.h
@@ -122,8 +122,8 @@ _math_matrix_frustum( GLmatrix *mat,
 		      GLfloat nearval, GLfloat farval );
 
 extern void
-_math_matrix_viewport( GLmatrix *m, const double scale[3],
-                       const double translate[3], double depthMax );
+_math_matrix_viewport( GLmatrix *m, const float scale[3],
+                       const float translate[3], double depthMax );
 
 extern void
 _math_matrix_set_identity( GLmatrix *dest );
diff --git a/src/mesa/state_tracker/st_atom_viewport.c b/src/mesa/state_tracker/st_atom_viewport.c
index 2f62590c4f1..9a692cecade 100644
--- a/src/mesa/state_tracker/st_atom_viewport.c
+++ b/src/mesa/state_tracker/st_atom_viewport.c
@@ -64,7 +64,7 @@ update_viewport( struct st_context *st )
     */
    for (i = 0; i < ctx->Const.MaxViewports; i++)
    {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       st->state.viewport[i].scale[0] = scale[0];
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index bc77ba8bf95..b5c0b3e1f5b 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -190,7 +190,7 @@ _tnl_InvalidateState( struct gl_context *ctx, GLuint new_state )
    }
 
    if (new_state & (_NEW_VIEWPORT | _NEW_BUFFERS)) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, 0, scale, translate);
       _math_matrix_viewport(&tnl->_WindowMap, scale, translate,
                             ctx->DrawBuffer->_DepthMaxF);
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
index d4b45bac9ac..7ef50ea7cd7 100644
--- a/src/mesa/tnl/t_rasterpos.c
+++ b/src/mesa/tnl/t_rasterpos.c
@@ -378,7 +378,7 @@ _tnl_RasterPos(struct gl_context *ctx, const GLfloat vObj[4])
       GLfloat eye[4], clip[4], ndc[3], d;
       GLfloat *norm, eyenorm[3];
       GLfloat *objnorm = ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
-      double scale[3], translate[3];
+      float scale[3], translate[3];
 
       /* apply modelview matrix:  eye = MV * obj */
       TRANSFORM_POINT( eye, ctx->ModelviewMatrixStack.Top->m, vObj );

From b73782bf184b7053026e8dda54800d48e64e20da Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 13 Jul 2015 15:19:33 -0700
Subject: [PATCH 0781/1208] program: Use exp2(x) instead of pow(2.0, x).

---
 src/mesa/program/prog_execute.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c
index 46260b54882..77274e2cccc 100644
--- a/src/mesa/program/prog_execute.c
+++ b/src/mesa/program/prog_execute.c
@@ -723,7 +723,7 @@ _mesa_execute_program(struct gl_context * ctx,
                 * result.z = result.x * APPX(result.y)
                 * We do what the ARB extension says.
                 */
-               q[2] = (GLfloat) pow(2.0, t[0]);
+               q[2] = exp2f(t[0]);
             }
             q[1] = t[0] - floor_t0;
             q[3] = 1.0F;
@@ -734,7 +734,7 @@ _mesa_execute_program(struct gl_context * ctx,
          {
             GLfloat a[4], result[4], val;
             fetch_vector1(&inst->SrcReg[0], machine, a);
-            val = (GLfloat) pow(2.0, a[0]);
+            val = exp2f(a[0]);
             /*
             if (IS_INF_OR_NAN(val))
                val = 1.0e10;

From c67ce2bd3b27a26d7f5665f296d307c0de39b720 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 13 Jul 2015 15:19:54 -0700
Subject: [PATCH 0782/1208] gallium/auxiliary: Use exp2(x) instead of pow(2.0,
 x).

---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 6 +++---
 src/gallium/auxiliary/util/u_math.c          | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index 973e542c536..1c12a668d81 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -115,8 +115,8 @@ static inline unsigned float3_to_rgb9e5(const float rgb[3])
    exp_shared = MAX2(-RGB9E5_EXP_BIAS-1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
    assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
    assert(exp_shared >= 0);
-   /* This pow function could be replaced by a table. */
-   denom = pow(2, exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
+   /* This exp2 function could be replaced by a table. */
+   denom = exp2(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
 
    maxm = (int) floor(maxrgb / denom + 0.5);
    if (maxm == MAX_RGB9E5_MANTISSA+1) {
@@ -154,7 +154,7 @@ static inline void rgb9e5_to_float3(unsigned rgb, float retval[3])
 
    v.raw = rgb;
    exponent = v.field.biasedexponent - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS;
-   scale = (float) pow(2, exponent);
+   scale = exp2f(exponent);
 
    retval[0] = v.field.r * scale;
    retval[1] = v.field.g * scale;
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index ae9e9513b04..c58af911be7 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -48,7 +48,7 @@ init_pow2_table(void)
 {
    int i;
    for (i = 0; i < POW2_TABLE_SIZE; i++)
-      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+      pow2_table[i] = exp2f((i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
 }
 
 

From c1da15709a0c0c2775bd9e534f67c60f7dc95ce8 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 00:13:45 -0700
Subject: [PATCH 0783/1208] i965: Use float calculations when double is
 unnecessary.

Literals without an f/F suffix are of type double, and implicit
conversion rules specify that the float in (float op double) be
converted to a double before the operation is performed. I believe float
execution was intended (in nearly all cases) or is sufficient (in the
case of gen7_urb.c).

Removes a lot of float <-> double conversion instructions and replaces
many double instructions with float instructions which are cheaper.

   text     data      bss      dec      hex  filename
4928659   195160    26192  5150011   4e953b  i965_dri.so before
4928315   195152    26192  5149659   4e93db  i965_dri.so after

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp  | 22 +++++++++----------
 src/mesa/drivers/dri/i965/brw_fs.cpp          |  4 ++--
 .../drivers/dri/i965/brw_meta_fast_clear.c    |  4 ++--
 .../drivers/dri/i965/brw_meta_stencil_blit.c  |  4 ++--
 src/mesa/drivers/dri/i965/brw_misc_state.c    |  4 ++--
 src/mesa/drivers/dri/i965/brw_sampler_state.c |  4 ++--
 src/mesa/drivers/dri/i965/brw_sf_state.c      |  9 ++++----
 src/mesa/drivers/dri/i965/brw_state_cache.c   |  2 +-
 src/mesa/drivers/dri/i965/brw_util.h          |  4 ++--
 .../drivers/dri/i965/gen6_multisample_state.c |  4 ++--
 src/mesa/drivers/dri/i965/gen6_sf_state.c     |  2 +-
 src/mesa/drivers/dri/i965/gen7_sf_state.c     |  2 +-
 src/mesa/drivers/dri/i965/gen7_urb.c          |  2 +-
 src/mesa/drivers/dri/i965/gen8_sf_state.c     |  2 +-
 14 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 1561b593969..205c905b447 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -1285,8 +1285,8 @@ brw_blorp_blit_program::translate_dst_to_src()
       /* Round the float coordinates down to nearest integer */
       emit_rndd(Xp_f, X_f);
       emit_rndd(Yp_f, Y_f);
-      emit_mul(X_f, Xp_f, brw_imm_f(1 / key->x_scale));
-      emit_mul(Y_f, Yp_f, brw_imm_f(1 / key->y_scale));
+      emit_mul(X_f, Xp_f, brw_imm_f(1.0f / key->x_scale));
+      emit_mul(Y_f, Yp_f, brw_imm_f(1.0f / key->y_scale));
       SWAP_XY_AND_XPYP();
    } else if (!key->bilinear_filter) {
       /* Round the float coordinates down to nearest integer by moving to
@@ -1442,7 +1442,7 @@ brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
       for (int j = 0; j < 4; ++j) {
          emit_mul(offset(texture_data[0], 2*j),
                  offset(vec8(texture_data[0]), 2*j),
-                 brw_imm_f(1.0/num_samples));
+                 brw_imm_f(1.0f / num_samples));
       }
    }
 
@@ -1475,9 +1475,9 @@ brw_blorp_blit_program::manual_blend_bilinear(unsigned num_samples)
 
       /* Compute pixel coordinates */
       emit_add(vec16(x_sample_coords), Xp_f,
-              brw_imm_f((float)(i & 0x1) * (1.0 / key->x_scale)));
+              brw_imm_f((float)(i & 0x1) * (1.0f / key->x_scale)));
       emit_add(vec16(y_sample_coords), Yp_f,
-              brw_imm_f((float)((i >> 1) & 0x1) * (1.0 / key->y_scale)));
+              brw_imm_f((float)((i >> 1) & 0x1) * (1.0f / key->y_scale)));
       emit_mov(vec16(X), x_sample_coords);
       emit_mov(vec16(Y), y_sample_coords);
 
@@ -1789,7 +1789,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
        * so 0.5 provides the necessary correction.
        */
       multiplier = scale;
-      offset = src0 + (-dst0 + 0.5) * scale;
+      offset = src0 + (-dst0 + 0.5f) * scale;
    } else {
       /* When mirroring X we need:
        *   src_x - src_x0 = dst_x1 - dst_x - 0.5
@@ -1797,7 +1797,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
        *   src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
        */
       multiplier = -scale;
-      offset = src0 + (dst1 - 0.5) * scale;
+      offset = src0 + (dst1 - 0.5f) * scale;
    }
 }
 
@@ -1952,8 +1952,8 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
    /* Scaling factors used for bilinear filtering in multisample scaled
     * blits.
     */
-   wm_prog_key.x_scale = 2.0;
-   wm_prog_key.y_scale = src_mt->num_samples / 2.0;
+   wm_prog_key.x_scale = 2.0f;
+   wm_prog_key.y_scale = src_mt->num_samples / 2.0f;
 
    if (filter == GL_LINEAR && src.num_samples <= 1 && dst.num_samples <= 1)
       wm_prog_key.bilinear_filter = true;
@@ -2000,9 +2000,9 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
    x1 = wm_push_consts.dst_x1 = roundf(dst_x1);
    y1 = wm_push_consts.dst_y1 = roundf(dst_y1);
    wm_push_consts.rect_grid_x1 = (minify(src_mt->logical_width0, src_level) *
-                                  wm_prog_key.x_scale - 1.0);
+                                  wm_prog_key.x_scale - 1.0f);
    wm_push_consts.rect_grid_y1 = (minify(src_mt->logical_height0, src_level) *
-                                  wm_prog_key.y_scale - 1.0);
+                                  wm_prog_key.y_scale - 1.0f);
 
    wm_push_consts.x_transform.setup(src_x0, src_x1, dst_x0, dst_x1, mirror_x);
    wm_push_consts.y_transform.setup(src_y0, src_y1, dst_y0, dst_y1, mirror_y);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 565edeb401e..15fe3648af8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -975,11 +975,11 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
       bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
 
       if (flip) {
 	 pixel_y.negate = true;
-	 offset += key->drawable_height - 1.0;
+	 offset += key->drawable_height - 1.0f;
       }
 
       bld.ADD(wpos, pixel_y, fs_reg(offset));
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index c5e556ee9eb..e7e8df5bded 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -348,7 +348,7 @@ is_color_fast_clear_compatible(struct brw_context *brw,
    }
 
    for (int i = 0; i < 4; i++) {
-      if (color->f[i] != 0.0 && color->f[i] != 1.0 &&
+      if (color->f[i] != 0.0f && color->f[i] != 1.0f &&
           _mesa_format_has_color_component(format, i)) {
          return false;
       }
@@ -366,7 +366,7 @@ compute_fast_clear_color_bits(const union gl_color_union *color)
    uint32_t bits = 0;
    for (int i = 0; i < 4; i++) {
       /* Testing for non-0 works for integer and float colors */
-      if (color->f[i] != 0.0)
+      if (color->f[i] != 0.0f)
          bits |= 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
    }
    return bits;
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index d4abfe63de7..aa6df16eb04 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -239,10 +239,10 @@ setup_coord_coeff(GLuint prog, GLuint multiplier, GLuint offset,
 
    if (mirror) {
       _mesa_Uniform1f(multiplier, -scale);
-      _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5) * scale);
+      _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5f) * scale);
    } else {
       _mesa_Uniform1f(multiplier, scale);
-      _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5) * scale);
+      _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5f) * scale);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 1bbb16cf697..16b0ed28d0d 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -834,13 +834,13 @@ static void upload_line_stipple(struct brw_context *brw)
 
    if (brw->gen >= 7) {
       /* in U1.16 */
-      tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+      tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<16);
       OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor);
    }
    else {
       /* in U1.13 */
-      tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+      tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<13);
       OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 22ccbfe8461..2021bb3b460 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -425,11 +425,11 @@ brw_update_sampler_state(struct brw_context *brw,
 
    /* Enable anisotropic filtering if desired. */
    unsigned max_anisotropy = BRW_ANISORATIO_2;
-   if (sampler->MaxAnisotropy > 1.0) {
+   if (sampler->MaxAnisotropy > 1.0f) {
       min_filter = BRW_MAPFILTER_ANISOTROPIC;
       mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
-      if (sampler->MaxAnisotropy > 2.0) {
+      if (sampler->MaxAnisotropy > 2.0f) {
 	 max_anisotropy =
             MIN2((sampler->MaxAnisotropy - 2) / 2, BRW_ANISORATIO_16);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 3be6e4ab8e2..b126f82ebbf 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -220,7 +220,7 @@ static void upload_sf_unit( struct brw_context *brw )
 
    /* _NEW_LINE */
    sf->sf6.line_width =
-      CLAMP(ctx->Line.Width, 1.0, ctx->Const.MaxLineWidth) * (1<<1);
+      CLAMP(ctx->Line.Width, 1.0f, ctx->Const.MaxLineWidth) * (1<<1);
 
    sf->sf6.line_endcap_aa_region_width = 1;
    if (ctx->Line.SmoothFlag)
@@ -259,9 +259,10 @@ static void upload_sf_unit( struct brw_context *brw )
 
    /* _NEW_POINT */
    sf->sf7.sprite_point = ctx->Point.PointSprite;
-   sf->sf7.point_size = CLAMP(rint(CLAMP(ctx->Point.Size,
-					 ctx->Point.MinSize,
-					 ctx->Point.MaxSize)), 1, 255) * (1<<3);
+   sf->sf7.point_size = CLAMP(rintf(CLAMP(ctx->Point.Size,
+                                          ctx->Point.MinSize,
+                                          ctx->Point.MaxSize)), 1.0f, 255.0f) *
+                        (1<<3);
    /* _NEW_PROGRAM | _NEW_POINT */
    sf->sf7.use_point_size_state = !(ctx->VertexProgram.PointSizeEnabled ||
 				    ctx->Point._Attenuated);
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 157b33d4f4c..693441c6f49 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -323,7 +323,7 @@ brw_upload_cache(struct brw_cache *cache,
 
    item->key = tmp;
 
-   if (cache->n_items > cache->size * 1.5)
+   if (cache->n_items > cache->size * 1.5f)
       rehash(cache);
 
    hash %= cache->size;
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 04e4e944118..68f4318d371 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -53,14 +53,14 @@ brw_get_line_width(struct brw_context *brw)
    float line_width =
       CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
             ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
-            0.0, brw->ctx.Const.MaxLineWidth);
+            0.0f, brw->ctx.Const.MaxLineWidth);
    uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
 
    /* Line width of 0 is not allowed when MSAA enabled */
    if (brw->ctx.Multisample._Enabled) {
       if (line_width_u3_7 == 0)
          line_width_u3_7 = 1;
-   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5) {
+   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
       /* For 1 pixel line thickness or less, the general
        * anti-aliasing algorithm gives up, and a garbage line is
        * generated.  Setting a Line Width of 0.0 specifies the
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 36734f598fe..cf1421e5e9f 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -148,7 +148,7 @@ unsigned
 gen6_determine_sample_mask(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   float coverage = 1.0;
+   float coverage = 1.0f;
    float coverage_invert = false;
    unsigned sample_mask = ~0u;
 
@@ -166,7 +166,7 @@ gen6_determine_sample_mask(struct brw_context *brw)
    }
 
    if (num_samples > 1) {
-      int coverage_int = (int) (num_samples * coverage + 0.5);
+      int coverage_int = (int) (num_samples * coverage + 0.5f);
       uint32_t coverage_bits = (1 << coverage_int) - 1;
       if (coverage_invert)
          coverage_bits ^= (1 << num_samples) - 1;
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index b00517ed81e..4068f2844a2 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -383,7 +383,7 @@ upload_sf_state(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw4 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw4 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /*
     * Window coordinates in an FBO are inverted, which means point
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 4fa46a8eb97..698b3d491bc 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -220,7 +220,7 @@ upload_sf_state(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /* _NEW_LIGHT */
    if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index d371c193577..69162171c4e 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -228,7 +228,7 @@ gen7_upload_urb(struct brw_context *brw)
       remaining_space = total_wants;
    if (remaining_space > 0) {
       unsigned vs_additional = (unsigned)
-         round(vs_wants * (((double) remaining_space) / total_wants));
+         roundf(vs_wants * (((float) remaining_space) / total_wants));
       vs_chunks += vs_additional;
       remaining_space -= vs_additional;
       gs_chunks += remaining_space;
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index c2b585d0001..6b655ee493e 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -169,7 +169,7 @@ upload_sf(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /* _NEW_PROGRAM | _NEW_POINT */
    if (!(ctx->VertexProgram.PointSizeEnabled || ctx->Point._Attenuated))

From 4251ccb47b79c719918e7c372aebb6b2d9719922 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 12:37:00 -0700
Subject: [PATCH 0784/1208] nir: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/nir/nir_opcodes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 56e96d9121e..df5b7e2d517 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -474,10 +474,10 @@ else
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
-dst = ldexp(src0, src1);
+dst = ldexpf(src0, src1);
 /* flush denormals to zero. */
 if (!isnormal(dst))
-   dst = copysign(0.0f, src0);
+   dst = copysignf(0.0f, src0);
 """)
 
 # Combines the first component of each input to make a 2-component vector.

From 29ef7a9f19265308e7852c0f8920e0f520f08df3 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 18:01:42 -0700
Subject: [PATCH 0785/1208] gallium/auxiliary: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 2 +-
 src/gallium/auxiliary/util/u_math.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index 1c12a668d81..9e4b1d607d7 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -75,7 +75,7 @@ typedef union {
 
 static inline float rgb9e5_ClampRange(float x)
 {
-   if (x > 0.0) {
+   if (x > 0.0f) {
       if (x >= MAX_RGB9E5) {
          return MAX_RGB9E5;
       } else {
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index f5c409dbdf2..56bd185f527 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -240,7 +240,7 @@ util_iround(float f)
 static inline boolean
 util_is_approx(float a, float b, float tol)
 {
-   return fabs(b - a) <= tol;
+   return fabsf(b - a) <= tol;
 }
 
 

From b568a5f6a8c6bb07b27e9badce01a8a73ba56c03 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 18:01:54 -0700
Subject: [PATCH 0786/1208] util: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/util/register_allocate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 2ad8c3ce11a..95be20fcc1b 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -648,7 +648,7 @@ ra_get_best_spill_node(struct ra_graph *g)
       float cost = g->nodes[n].spill_cost;
       float benefit;
 
-      if (cost <= 0.0)
+      if (cost <= 0.0f)
 	 continue;
 
       if (g->nodes[n].in_stack)

From 2b47ef715ad33f6c4a4881b10240d792ba9e60b2 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:14:54 -0700
Subject: [PATCH 0787/1208] vbo: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/vbo/vbo_context.c    | 6 +++---
 src/mesa/vbo/vbo_exec_array.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index fd1ffe2f76d..e3eb286e482 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -37,9 +37,9 @@
 
 static GLuint check_size( const GLfloat *attr )
 {
-   if (attr[3] != 1.0) return 4;
-   if (attr[2] != 0.0) return 3;
-   if (attr[1] != 0.0) return 2;
+   if (attr[3] != 1.0F) return 4;
+   if (attr[2] != 0.0F) return 3;
+   if (attr[1] != 0.0F) return 2;
    return 1;		
 }
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 29889382fb1..ec13449c751 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -255,7 +255,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array,
             GLint k;
             for (k = 0; k < array->Size; k++) {
                if (IS_INF_OR_NAN(f[k]) ||
-                   f[k] >= 1.0e20 || f[k] <= -1.0e10) {
+                   f[k] >= 1.0e20F || f[k] <= -1.0e10F) {
                   printf("Bad array data:\n");
                   printf("  Element[%u].%u = %f\n", j, k, f[k]);
                   printf("  Array %u at %p\n", attrib, (void* ) array);
@@ -263,7 +263,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array,
 			 array->Type, array->Size, array->Stride);
                   printf("  Address/offset %p in Buffer Object %u\n",
 			 array->Ptr, array->BufferObj->Name);
-                  f[k] = 1.0; /* XXX replace the bad value! */
+                  f[k] = 1.0F; /* XXX replace the bad value! */
                }
                /*assert(!IS_INF_OR_NAN(f[k]));*/
             }

From c92b2a1d7b286de8641512970a87c94809fbbc3f Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:15:01 -0700
Subject: [PATCH 0788/1208] tnl: Avoid double promotion.

There are a couple of unrelated changes in t_vb_lighttmp.h that I hope
you'll excuse -- there's a block of code that's duplicated modulo a few
trivial differences that I took the liberty of fixing.
---
 src/mesa/tnl/t_draw.c           |  2 +-
 src/mesa/tnl/t_rasterpos.c      |  6 +++---
 src/mesa/tnl/t_vb_fog.c         |  6 +++---
 src/mesa/tnl/t_vb_light.c       | 22 +++++++++++-----------
 src/mesa/tnl/t_vb_lighttmp.h    | 16 +++++++---------
 src/mesa/tnl/t_vb_normals.c     |  4 ++--
 src/mesa/tnl/t_vertex_generic.c |  2 +-
 7 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index d5dcb5e8a7d..632acb55f8b 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -257,7 +257,7 @@ static GLboolean *_tnl_import_edgeflag( struct gl_context *ctx,
    GLuint i;
 
    for (i = 0; i < count; i++) {
-      *bptr++ = ((GLfloat *)ptr)[0] == 1.0;
+      *bptr++ = ((GLfloat *)ptr)[0] == 1.0F;
       ptr += stride;
    }
 
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
index 7ef50ea7cd7..4bd9ac8539e 100644
--- a/src/mesa/tnl/t_rasterpos.c
+++ b/src/mesa/tnl/t_rasterpos.c
@@ -148,7 +148,7 @@ shade_rastpos(struct gl_context *ctx,
 	 SUB_3V(VP, light->_Position, vertex);
          /* d = length(VP) */
 	 d = (GLfloat) LEN_3FV( VP );
-	 if (d > 1.0e-6) {
+	 if (d > 1.0e-6F) {
             /* normalize VP */
 	    GLfloat invd = 1.0F / d;
 	    SELF_SCALE_SCALAR_3V(VP, invd);
@@ -172,7 +172,7 @@ shade_rastpos(struct gl_context *ctx,
 	 }
       }
 
-      if (attenuation < 1e-3)
+      if (attenuation < 1e-3F)
 	 continue;
 
       n_dot_VP = DOT3( normal, VP );
@@ -219,7 +219,7 @@ shade_rastpos(struct gl_context *ctx,
 	    shine = ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_SHININESS][0];
 	    spec_coef = powf(n_dot_h, shine);
 
-	    if (spec_coef > 1.0e-10) {
+	    if (spec_coef > 1.0e-10F) {
                if (ctx->Light.Model.ColorControl==GL_SEPARATE_SPECULAR_COLOR) {
                   ACC_SCALE_SCALAR_3V( specularContrib, spec_coef,
                                        light->_MatSpecular[0]);
diff --git a/src/mesa/tnl/t_vb_fog.c b/src/mesa/tnl/t_vb_fog.c
index 1ca72f866b7..5489ed6857f 100644
--- a/src/mesa/tnl/t_vb_fog.c
+++ b/src/mesa/tnl/t_vb_fog.c
@@ -45,8 +45,8 @@ struct fog_stage_data {
 #define FOG_STAGE_DATA(stage) ((struct fog_stage_data *)stage->privatePtr)
 
 #define FOG_EXP_TABLE_SIZE 256
-#define FOG_MAX (10.0)
-#define EXP_FOG_MAX .0006595
+#define FOG_MAX (10.0F)
+#define EXP_FOG_MAX .0006595F
 #define FOG_INCR (FOG_MAX/FOG_EXP_TABLE_SIZE)
 static GLfloat exp_table[FOG_EXP_TABLE_SIZE];
 static GLfloat inited = 0;
@@ -54,7 +54,7 @@ static GLfloat inited = 0;
 #if 1
 #define NEG_EXP( result, narg )						\
 do {									\
-   GLfloat f = (GLfloat) (narg * (1.0/FOG_INCR));			\
+   GLfloat f = (GLfloat) (narg * (1.0F / FOG_INCR));			\
    GLint k = (GLint) f;							\
    if (k > FOG_EXP_TABLE_SIZE-2) 					\
       result = (GLfloat) EXP_FOG_MAX;					\
diff --git a/src/mesa/tnl/t_vb_light.c b/src/mesa/tnl/t_vb_light.c
index dbd57fa6bfe..029265a4f83 100644
--- a/src/mesa/tnl/t_vb_light.c
+++ b/src/mesa/tnl/t_vb_light.c
@@ -137,23 +137,23 @@ validate_shine_table( struct gl_context *ctx, GLuint side, GLfloat shininess )
 	    break;
 
       m = s->tab;
-      m[0] = 0.0;
-      if (shininess == 0.0) {
+      m[0] = 0.0F;
+      if (shininess == 0.0F) {
 	 for (j = 1 ; j <= SHINE_TABLE_SIZE ; j++)
-	    m[j] = 1.0;
+	    m[j] = 1.0F;
       }
       else {
 	 for (j = 1 ; j < SHINE_TABLE_SIZE ; j++) {
-            GLdouble t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1);
-            if (x < 0.005) /* underflow check */
-               x = 0.005;
-            t = pow(x, shininess);
-	    if (t > 1e-20)
-	       m[j] = (GLfloat) t;
+            GLfloat t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1);
+            if (x < 0.005F) /* underflow check */
+               x = 0.005F;
+            t = powf(x, shininess);
+	    if (t > 1e-20F)
+	       m[j] = t;
 	    else
-	       m[j] = 0.0;
+	       m[j] = 0.0F;
 	 }
-	 m[SHINE_TABLE_SIZE] = 1.0;
+	 m[SHINE_TABLE_SIZE] = 1.0F;
       }
 
       s->shininess = shininess;
diff --git a/src/mesa/tnl/t_vb_lighttmp.h b/src/mesa/tnl/t_vb_lighttmp.h
index f8786accbbb..3aebcd4b799 100644
--- a/src/mesa/tnl/t_vb_lighttmp.h
+++ b/src/mesa/tnl/t_vb_lighttmp.h
@@ -112,7 +112,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 	 GLint side;
 	 GLfloat contrib[3];
 	 GLfloat attenuation;
-	 GLfloat VP[3];  /* unit vector from vertex to light */
+	 GLfloat VP[3];          /* unit vector from vertex to light */
 	 GLfloat n_dot_VP;       /* n dot VP */
 	 GLfloat *h;
 
@@ -129,7 +129,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 
 	    d = (GLfloat) LEN_3FV( VP );
 
-	    if (d > 1e-6) {
+	    if (d > 1e-6F) {
 	       GLfloat invd = 1.0F / d;
 	       SELF_SCALE_SCALAR_3V(VP, invd);
 	    }
@@ -152,7 +152,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 	    }
 	 }
 
-	 if (attenuation < 1e-3)
+	 if (attenuation < 1e-3F)
 	    continue;		/* this light makes no contribution */
 
 	 /* Compute dot product or normal and vector from V to light pos */
@@ -204,7 +204,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 
 	 if (n_dot_h > 0.0F) {
 	    GLfloat spec_coef = lookup_shininess(ctx, side, n_dot_h);
-	    if (spec_coef > 1.0e-10) {
+	    if (spec_coef > 1.0e-10F) {
 	       spec_coef *= attenuation;
 	       ACC_SCALE_SCALAR_3V( spec[side], spec_coef,
 				    light->_MatSpecular[side]);
@@ -283,12 +283,11 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 
       /* Add contribution from each enabled light source */
       foreach (light, &ctx->Light.EnabledList) {
-
 	 GLfloat n_dot_h;
 	 GLfloat correction;
 	 GLint side;
 	 GLfloat contrib[3];
-	 GLfloat attenuation = 1.0;
+	 GLfloat attenuation;
 	 GLfloat VP[3];          /* unit vector from vertex to light */
 	 GLfloat n_dot_VP;       /* n dot VP */
 	 GLfloat *h;
@@ -302,12 +301,11 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 	 else {
 	    GLfloat d;     /* distance from vertex to light */
 
-
 	    SUB_3V(VP, light->_Position, vertex);
 
 	    d = (GLfloat) LEN_3FV( VP );
 
-	    if ( d > 1e-6) {
+	    if (d > 1e-6F) {
 	       GLfloat invd = 1.0F / d;
 	       SELF_SCALE_SCALAR_3V(VP, invd);
 	    }
@@ -330,7 +328,7 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 	    }
 	 }
 
-	 if (attenuation < 1e-3)
+	 if (attenuation < 1e-3F)
 	    continue;		/* this light makes no contribution */
 
 	 /* Compute dot product or normal and vector from V to light pos */
diff --git a/src/mesa/tnl/t_vb_normals.c b/src/mesa/tnl/t_vb_normals.c
index 9aee1a2fb0b..6fc89c23b33 100644
--- a/src/mesa/tnl/t_vb_normals.c
+++ b/src/mesa/tnl/t_vb_normals.c
@@ -114,7 +114,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage)
 	 store->NormalTransform = _mesa_normal_tab[transform | NORM_NORMALIZE];
       }
       else if (ctx->Transform.RescaleNormals &&
-               ctx->_ModelViewInvScale != 1.0) {
+               ctx->_ModelViewInvScale != 1.0F) {
 	 store->NormalTransform = _mesa_normal_tab[transform | NORM_RESCALE];
       }
       else {
@@ -131,7 +131,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage)
 	 store->NormalTransform = _mesa_normal_tab[NORM_NORMALIZE];
       }
       else if (!ctx->Transform.RescaleNormals &&
-	       ctx->_ModelViewInvScale != 1.0) {
+	       ctx->_ModelViewInvScale != 1.0F) {
 	 store->NormalTransform = _mesa_normal_tab[NORM_RESCALE];
       }
       else {
diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index 2a25a96928f..6c40c868363 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -1026,7 +1026,7 @@ void _tnl_generic_interp( struct gl_context *ctx,
 
    if (tnl->NeedNdcCoords) {
       const GLfloat *dstclip = VB->ClipPtr->data[edst];
-      if (dstclip[3] != 0.0) {
+      if (dstclip[3] != 0.0f) {
 	 const GLfloat w = 1.0f / dstclip[3];
 	 GLfloat pos[4];
 

From 04aa8b58a09e3b415916fa569111c1f76d07a8d5 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:15:10 -0700
Subject: [PATCH 0789/1208] swrast: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/swrast/s_aaline.c        | 28 ++++++++++++++--------------
 src/mesa/swrast/s_aalinetemp.h    |  4 ++--
 src/mesa/swrast/s_atifragshader.c |  4 ++--
 src/mesa/swrast/s_copypix.c       |  6 +++---
 src/mesa/swrast/s_drawpix.c       | 12 ++++++------
 src/mesa/swrast/s_fragprog.c      |  4 ++--
 src/mesa/swrast/s_lines.c         |  4 ++--
 src/mesa/swrast/s_points.c        | 10 +++++-----
 src/mesa/swrast/s_span.c          | 10 +++++-----
 src/mesa/swrast/s_texcombine.c    |  6 +++---
 src/mesa/swrast/s_texfilter.c     |  8 ++++----
 src/mesa/swrast/s_tritemp.h       |  2 +-
 src/mesa/swrast/s_zoom.c          |  2 +-
 13 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/src/mesa/swrast/s_aaline.c b/src/mesa/swrast/s_aaline.c
index f3258e813a6..de5b42b9f6b 100644
--- a/src/mesa/swrast/s_aaline.c
+++ b/src/mesa/swrast/s_aaline.c
@@ -116,11 +116,11 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1,
    const GLfloat b = pz * py;
    const GLfloat c = px * px + py * py;
    const GLfloat d = -(a * x0 + b * y0 + c * z0);
-   if (a == 0.0 && b == 0.0 && c == 0.0 && d == 0.0) {
-      plane[0] = 0.0;
-      plane[1] = 0.0;
-      plane[2] = 1.0;
-      plane[3] = 0.0;
+   if (a == 0.0F && b == 0.0F && c == 0.0F && d == 0.0F) {
+      plane[0] = 0.0F;
+      plane[1] = 0.0F;
+      plane[2] = 1.0F;
+      plane[3] = 0.0F;
    }
    else {
       plane[0] = a;
@@ -135,9 +135,9 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1,
 static inline void
 constant_plane(GLfloat value, GLfloat plane[4])
 {
-   plane[0] = 0.0;
-   plane[1] = 0.0;
-   plane[2] = -1.0;
+   plane[0] = 0.0F;
+   plane[1] = 0.0F;
+   plane[2] = -1.0F;
    plane[3] = value;
 }
 
@@ -160,8 +160,8 @@ static inline GLfloat
 solve_plane_recip(GLfloat x, GLfloat y, const GLfloat plane[4])
 {
    const GLfloat denom = plane[3] + plane[0] * x + plane[1] * y;
-   if (denom == 0.0)
-      return 0.0;
+   if (denom == 0.0F)
+      return 0.0F;
    else
       return -plane[2] / denom;
 }
@@ -374,7 +374,7 @@ segment(struct gl_context *ctx,
       if (x0 < x1) {
          xLeft = x0 - line->halfWidth;
          xRight = x1 + line->halfWidth;
-         if (line->dy >= 0.0) {
+         if (line->dy >= 0.0F) {
             yBot = y0 - 3.0F * line->halfWidth;
             yTop = y0 + line->halfWidth;
          }
@@ -386,7 +386,7 @@ segment(struct gl_context *ctx,
       else {
          xLeft = x1 - line->halfWidth;
          xRight = x0 + line->halfWidth;
-         if (line->dy <= 0.0) {
+         if (line->dy <= 0.0F) {
             yBot = y1 - 3.0F * line->halfWidth;
             yTop = y1 + line->halfWidth;
          }
@@ -420,7 +420,7 @@ segment(struct gl_context *ctx,
       if (y0 < y1) {
          yBot = y0 - line->halfWidth;
          yTop = y1 + line->halfWidth;
-         if (line->dx >= 0.0) {
+         if (line->dx >= 0.0F) {
             xLeft = x0 - 3.0F * line->halfWidth;
             xRight = x0 + line->halfWidth;
          }
@@ -432,7 +432,7 @@ segment(struct gl_context *ctx,
       else {
          yBot = y1 - line->halfWidth;
          yTop = y0 + line->halfWidth;
-         if (line->dx <= 0.0) {
+         if (line->dx <= 0.0F) {
             xLeft = x1 - 3.0F * line->halfWidth;
             xRight = x1 + line->halfWidth;
          }
diff --git a/src/mesa/swrast/s_aalinetemp.h b/src/mesa/swrast/s_aalinetemp.h
index f1d078fd89b..bebb131a5d1 100644
--- a/src/mesa/swrast/s_aalinetemp.h
+++ b/src/mesa/swrast/s_aalinetemp.h
@@ -44,7 +44,7 @@ NAME(plot)(struct gl_context *ctx, struct LineInfo *line, int ix, int iy)
 
    (void) swrast;
 
-   if (coverage == 0.0)
+   if (coverage == 0.0F)
       return;
 
    line->span.end++;
@@ -123,7 +123,7 @@ NAME(line)(struct gl_context *ctx, const SWvertex *v0, const SWvertex *v1)
                                  ctx->Const.MinLineWidthAA,
                                  ctx->Const.MaxLineWidthAA);
 
-   if (line.len == 0.0 || IS_INF_OR_NAN(line.len))
+   if (line.len == 0.0F || IS_INF_OR_NAN(line.len))
       return;
 
    INIT_SPAN(line.span, GL_LINE);
diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c
index 9e029db25ce..2974deed41b 100644
--- a/src/mesa/swrast/s_atifragshader.c
+++ b/src/mesa/swrast/s_atifragshader.c
@@ -436,13 +436,13 @@ execute_shader(struct gl_context *ctx, const struct ati_fragment_shader *shader,
 		     for (i = 0; i < 3; i++) {
 			dst[optype][i] =
 			   (src[optype][2][i] >
-			    0.5) ? src[optype][0][i] : src[optype][1][i];
+			    0.5F) ? src[optype][0][i] : src[optype][1][i];
 		     }
 		  }
 		  else {
 		     dst[optype][3] =
 			(src[optype][2][3] >
-			 0.5) ? src[optype][0][3] : src[optype][1][3];
+			 0.5F) ? src[optype][0][3] : src[optype][1][3];
 		  }
 		  break;
 
diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c
index 8fde0c29540..0dbccc0f61d 100644
--- a/src/mesa/swrast/s_copypix.c
+++ b/src/mesa/swrast/s_copypix.c
@@ -52,7 +52,7 @@ regions_overlap(GLint srcx, GLint srcy,
                 GLint width, GLint height,
                 GLfloat zoomX, GLfloat zoomY)
 {
-   if (zoomX == 1.0 && zoomY == 1.0) {
+   if (zoomX == 1.0F && zoomY == 1.0F) {
       return _mesa_regions_overlap(srcx, srcy, srcx + width, srcy + height,
                                    dstx, dsty, dstx + width, dsty + height);
    }
@@ -201,8 +201,8 @@ scale_and_bias_z(struct gl_context *ctx, GLuint width,
    GLuint i;
 
    if (depthMax <= 0xffffff &&
-       ctx->Pixel.DepthScale == 1.0 &&
-       ctx->Pixel.DepthBias == 0.0) {
+       ctx->Pixel.DepthScale == 1.0F &&
+       ctx->Pixel.DepthBias == 0.0F) {
       /* no scale or bias and no clamping and no worry of overflow */
       const GLfloat depthMaxF = ctx->DrawBuffer->_DepthMaxF;
       for (i = 0; i < width; i++) {
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index fb677ee1b16..dc6827ede9f 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -264,7 +264,7 @@ draw_stencil_pixels( struct gl_context *ctx, GLint x, GLint y,
                      const struct gl_pixelstore_attrib *unpack,
                      const GLvoid *pixels )
 {
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    const GLenum destType = GL_UNSIGNED_BYTE;
    GLint row;
    GLubyte *values;
@@ -309,8 +309,8 @@ draw_depth_pixels( struct gl_context *ctx, GLint x, GLint y,
                    const GLvoid *pixels )
 {
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+      = ctx->Pixel.DepthScale != 1.0f || ctx->Pixel.DepthBias != 0.0f;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0f || ctx->Pixel.ZoomY != 1.0f;
    SWspan span;
 
    INIT_SPAN(span, GL_BITMAP);
@@ -415,7 +415,7 @@ draw_rgba_pixels( struct gl_context *ctx, GLint x, GLint y,
                   const GLvoid *pixels )
 {
    const GLint imgX = x, imgY = y;
-   const GLboolean zoom = ctx->Pixel.ZoomX!=1.0 || ctx->Pixel.ZoomY!=1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    GLbitfield transferOps = ctx->_ImageTransferState;
    SWspan span;
 
@@ -601,10 +601,10 @@ draw_depth_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
 {
    const GLint imgX = x, imgY = y;
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
+      = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F;
    const GLuint stencilMask = ctx->Stencil.WriteMask[0];
    const GLenum stencilType = GL_UNSIGNED_BYTE;
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    struct gl_renderbuffer *depthRb, *stencilRb;
    struct gl_pixelstore_attrib clippedUnpack = *unpack;
 
diff --git a/src/mesa/swrast/s_fragprog.c b/src/mesa/swrast/s_fragprog.c
index 175915a5a0b..4fbf66b9db7 100644
--- a/src/mesa/swrast/s_fragprog.c
+++ b/src/mesa/swrast/s_fragprog.c
@@ -243,9 +243,9 @@ run_program(struct gl_context *ctx, SWspan *span, GLuint start, GLuint end)
             /* Store result depth/z */
             if (outputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
                const GLfloat depth = machine->Outputs[FRAG_RESULT_DEPTH][2];
-               if (depth <= 0.0)
+               if (depth <= 0.0F)
                   span->array->z[i] = 0;
-               else if (depth >= 1.0)
+               else if (depth >= 1.0F)
                   span->array->z[i] = ctx->DrawBuffer->_DepthMax;
                else
                   span->array->z[i] =
diff --git a/src/mesa/swrast/s_lines.c b/src/mesa/swrast/s_lines.c
index 58bd2fc720a..ab8da7db289 100644
--- a/src/mesa/swrast/s_lines.c
+++ b/src/mesa/swrast/s_lines.c
@@ -241,7 +241,7 @@ _swrast_choose_line( struct gl_context *ctx )
          USE(general_line);
       }
       else if (ctx->Depth.Test
-               || ctx->Line.Width != 1.0
+               || ctx->Line.Width != 1.0F
                || ctx->Line.StippleFlag) {
          /* no texture, but Z, fog, width>1, stipple, etc. */
 #if CHAN_BITS == 32
@@ -252,7 +252,7 @@ _swrast_choose_line( struct gl_context *ctx )
       }
       else {
          assert(!ctx->Depth.Test);
-         assert(ctx->Line.Width == 1.0);
+         assert(ctx->Line.Width == 1.0F);
          /* simple lines */
          USE(simple_no_z_rgba_line);
       }
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index 2212c95fa9a..d9aae73302c 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -208,9 +208,9 @@ sprite_point(struct gl_context *ctx, const SWvertex *vert)
       else {
          /* even size */
          /* 0.501 factor allows conformance to pass */
-         xmin = (GLint) (x + 0.501) - iRadius;
+         xmin = (GLint) (x + 0.501F) - iRadius;
          xmax = xmin + iSize - 1;
-         ymin = (GLint) (y + 0.501) - iRadius;
+         ymin = (GLint) (y + 0.501F) - iRadius;
          ymax = ymin + iSize - 1;
       }
 
@@ -423,9 +423,9 @@ large_point(struct gl_context *ctx, const SWvertex *vert)
       else {
          /* even size */
          /* 0.501 factor allows conformance to pass */
-         xmin = (GLint) (x + 0.501) - iRadius;
+         xmin = (GLint) (x + 0.501F) - iRadius;
          xmax = xmin + iSize - 1;
-         ymin = (GLint) (y + 0.501) - iRadius;
+         ymin = (GLint) (y + 0.501F) - iRadius;
          ymax = ymin + iSize - 1;
       }
 
@@ -552,7 +552,7 @@ _swrast_choose_point(struct gl_context *ctx)
       else if (ctx->Point.SmoothFlag) {
          swrast->Point = smooth_point;
       }
-      else if (size > 1.0 ||
+      else if (size > 1.0F ||
                ctx->Point._Attenuated ||
                ctx->VertexProgram.PointSizeEnabled) {
          swrast->Point = large_point;
diff --git a/src/mesa/swrast/s_span.c b/src/mesa/swrast/s_span.c
index 3db10e163d7..cd939ba9510 100644
--- a/src/mesa/swrast/s_span.c
+++ b/src/mesa/swrast/s_span.c
@@ -506,7 +506,7 @@ interpolate_texcoords(struct gl_context *ctx, SWspan *span)
             /* LOD is calculated directly in the ansiotropic filter, we can
              * skip the normal lambda function as the result is ignored.
              */
-            if (samp->MaxAnisotropy > 1.0 &&
+            if (samp->MaxAnisotropy > 1.0F &&
                 samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
                needLambda = GL_FALSE;
             }
@@ -886,16 +886,16 @@ apply_aa_coverage(SWspan *span)
       GLubyte (*rgba)[4] = span->array->rgba8;
       for (i = 0; i < span->end; i++) {
          const GLfloat a = rgba[i][ACOMP] * coverage[i];
-         rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0, 255.0);
-         assert(coverage[i] >= 0.0);
-         assert(coverage[i] <= 1.0);
+         rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0F, 255.0F);
+         assert(coverage[i] >= 0.0F);
+         assert(coverage[i] <= 1.0F);
       }
    }
    else if (span->array->ChanType == GL_UNSIGNED_SHORT) {
       GLushort (*rgba)[4] = span->array->rgba16;
       for (i = 0; i < span->end; i++) {
          const GLfloat a = rgba[i][ACOMP] * coverage[i];
-         rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0, 65535.0);
+         rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0F, 65535.0F);
       }
    }
    else {
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 453bd36367b..da4a013634c 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -670,8 +670,8 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span )
                }
             }
 
-            if (samp->MinLod != -1000.0 ||
-                samp->MaxLod != 1000.0) {
+            if (samp->MinLod != -1000.0F ||
+                samp->MaxLod != 1000.0F) {
                /* apply LOD clamping to lambda */
                const GLfloat min = samp->MinLod;
                const GLfloat max = samp->MaxLod;
@@ -682,7 +682,7 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span )
                }
             }
          }
-         else if (samp->MaxAnisotropy > 1.0 &&
+         else if (samp->MaxAnisotropy > 1.0F &&
                   samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
             /* sample_lambda_2d_aniso is beeing used as texture_sample_func,
              * it requires the current SWspan *span as an additional parameter.
diff --git a/src/mesa/swrast/s_texfilter.c b/src/mesa/swrast/s_texfilter.c
index cd6395fbcde..314170fc751 100644
--- a/src/mesa/swrast/s_texfilter.c
+++ b/src/mesa/swrast/s_texfilter.c
@@ -1902,7 +1902,7 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
    const GLboolean adjustLOD =
       (texUnit->LodBias + samp->LodBias != 0.0F)
-      || (samp->MinLod != -1000.0 || samp->MaxLod != 1000.0);
+      || (samp->MinLod != -1000.0F || samp->MaxLod != 1000.0F);
 
    GLuint i;
    
@@ -1973,8 +1973,8 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
                      ctx->Const.MaxTextureLodBias);
             lod += bias;
 
-            if (samp->MinLod != -1000.0 ||
-                samp->MaxLod != 1000.0) {
+            if (samp->MinLod != -1000.0F ||
+                samp->MaxLod != 1000.0F) {
                /* apply LOD clamping to lambda */
                lod = CLAMP(lod, samp->MinLod, samp->MaxLod);
             }
@@ -3740,7 +3740,7 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
          }
          else if (needLambda) {
             /* Anisotropic filtering extension. Activated only if mipmaps are used */
-            if (sampler->MaxAnisotropy > 1.0 &&
+            if (sampler->MaxAnisotropy > 1.0F &&
                 sampler->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
                return sample_lambda_2d_aniso;
             }
diff --git a/src/mesa/swrast/s_tritemp.h b/src/mesa/swrast/s_tritemp.h
index fddbbfd99d6..1d71839713c 100644
--- a/src/mesa/swrast/s_tritemp.h
+++ b/src/mesa/swrast/s_tritemp.h
@@ -242,7 +242,7 @@ static void NAME(struct gl_context *ctx, const SWvertex *v0,
       if (IS_INF_OR_NAN(area) || area == 0.0F)
          return;
 
-      if (area * bf * swrast->_BackfaceCullSign < 0.0)
+      if (area * bf * swrast->_BackfaceCullSign < 0.0F)
          return;
 
       oneOverArea = 1.0F / area;
diff --git a/src/mesa/swrast/s_zoom.c b/src/mesa/swrast/s_zoom.c
index 9879e2a5f10..34b8eb19657 100644
--- a/src/mesa/swrast/s_zoom.c
+++ b/src/mesa/swrast/s_zoom.c
@@ -114,7 +114,7 @@ unzoom_x(GLfloat zoomX, GLint imageX, GLint zx)
    (zx - imageX) / zoomX = x - imageX;
    */
    GLint x;
-   if (zoomX < 0.0)
+   if (zoomX < 0.0F)
       zx++;
    x = imageX + (GLint) ((zx - imageX) / zoomX);
    return x;

From 076f73edb34f2a83092c6c8ad04b53def2792bb8 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:15:19 -0700
Subject: [PATCH 0790/1208] program: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/program/prog_execute.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c
index 77274e2cccc..2c52d0db508 100644
--- a/src/mesa/program/prog_execute.c
+++ b/src/mesa/program/prog_execute.c
@@ -623,7 +623,7 @@ _mesa_execute_program(struct gl_context * ctx,
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) cos(a[0]);
+               = cosf(a[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -776,7 +776,7 @@ _mesa_execute_program(struct gl_context * ctx,
             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
                GLfloat a[4];
                fetch_vector1(&inst->SrcReg[0], machine, a);
-               cond = (a[0] != 0.0);
+               cond = (a[0] != 0.0F);
             }
             else {
                cond = eval_condition(machine, inst);
@@ -834,7 +834,7 @@ _mesa_execute_program(struct gl_context * ctx,
                val = -FLT_MAX;
             }
             else {
-               val = (float)(log(a[0]) * 1.442695F);
+               val = logf(a[0]) * 1.442695F;
             }
             result[0] = result[1] = result[2] = result[3] = val;
             store_vector4(inst, machine, result);
@@ -853,10 +853,10 @@ _mesa_execute_program(struct gl_context * ctx,
             result[1] = a[0];
             /* XXX we could probably just use pow() here */
             if (a[0] > 0.0F) {
-               if (a[1] == 0.0 && a[3] == 0.0)
+               if (a[1] == 0.0F && a[3] == 0.0F)
                   result[2] = 1.0F;
                else
-                  result[2] = (GLfloat) pow(a[1], a[3]);
+                  result[2] = powf(a[1], a[3]);
             }
             else {
                result[2] = 0.0F;
@@ -886,12 +886,12 @@ _mesa_execute_program(struct gl_context * ctx,
                   int exponent;
                   GLfloat mantissa = frexpf(t[0], &exponent);
                   q[0] = (GLfloat) (exponent - 1);
-                  q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
+                  q[1] = 2.0F * mantissa; /* map [.5, 1) -> [1, 2) */
 
 		  /* The fast LOG2 macro doesn't meet the precision
 		   * requirements.
 		   */
-                  q[2] = (float)(log(t[0]) * 1.442695F);
+                  q[2] = logf(t[0]) * 1.442695F;
                }
             }
             else {
@@ -1051,7 +1051,7 @@ _mesa_execute_program(struct gl_context * ctx,
             fetch_vector1(&inst->SrcReg[0], machine, a);
             fetch_vector1(&inst->SrcReg[1], machine, b);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) pow(a[0], b[0]);
+               = powf(a[0], b[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -1095,10 +1095,10 @@ _mesa_execute_program(struct gl_context * ctx,
          {
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
-            result[0] = (GLfloat) cos(a[0]);
-            result[1] = (GLfloat) sin(a[0]);
-            result[2] = 0.0;    /* undefined! */
-            result[3] = 0.0;    /* undefined! */
+            result[0] = cosf(a[0]);
+            result[1] = sinf(a[0]);
+            result[2] = 0.0F;    /* undefined! */
+            result[3] = 0.0F;    /* undefined! */
             store_vector4(inst, machine, result);
          }
          break;
@@ -1161,7 +1161,7 @@ _mesa_execute_program(struct gl_context * ctx,
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) sin(a[0]);
+               = sinf(a[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -1360,7 +1360,7 @@ _mesa_execute_program(struct gl_context * ctx,
              * zero, we'd probably be fine except for an assert in
              * IROUND_POS() which gets triggered by the inf values created.
              */
-            if (texcoord[3] != 0.0) {
+            if (texcoord[3] != 0.0F) {
                texcoord[0] /= texcoord[3];
                texcoord[1] /= texcoord[3];
                texcoord[2] /= texcoord[3];
@@ -1380,7 +1380,7 @@ _mesa_execute_program(struct gl_context * ctx,
 
             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
-                texcoord[3] != 0.0) {
+                texcoord[3] != 0.0F) {
                texcoord[0] /= texcoord[3];
                texcoord[1] /= texcoord[3];
                texcoord[2] /= texcoord[3];

From 7adc9fa1f1d12683c5855bf5854dec814629093d Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:15:32 -0700
Subject: [PATCH 0791/1208] mesa/math: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/math/m_clip_tmp.h | 20 +++++------
 src/mesa/math/m_matrix.c   | 70 +++++++++++++++++++-------------------
 src/mesa/math/m_norm_tmp.h |  2 +-
 3 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/src/mesa/math/m_clip_tmp.h b/src/mesa/math/m_clip_tmp.h
index e289be7b302..60c00043725 100644
--- a/src/mesa/math/m_clip_tmp.h
+++ b/src/mesa/math/m_clip_tmp.h
@@ -194,13 +194,13 @@ static GLvector4f * TAG(cliptest_points3)( GLvector4f *clip_vec,
    STRIDE_LOOP {
       const GLfloat cx = from[0], cy = from[1], cz = from[2];
       GLubyte mask = 0;
-      if (cx >  1.0)       mask |= CLIP_RIGHT_BIT;
-      else if (cx < -1.0)  mask |= CLIP_LEFT_BIT;
-      if (cy >  1.0)       mask |= CLIP_TOP_BIT;
-      else if (cy < -1.0)  mask |= CLIP_BOTTOM_BIT;
+      if (cx >  1.0F)       mask |= CLIP_RIGHT_BIT;
+      else if (cx < -1.0F)  mask |= CLIP_LEFT_BIT;
+      if (cy >  1.0F)       mask |= CLIP_TOP_BIT;
+      else if (cy < -1.0F)  mask |= CLIP_BOTTOM_BIT;
       if (viewport_z_clip) {
-	 if (cz >  1.0)       mask |= CLIP_FAR_BIT;
-	 else if (cz < -1.0)  mask |= CLIP_NEAR_BIT;
+	 if (cz >  1.0F)       mask |= CLIP_FAR_BIT;
+	 else if (cz < -1.0F)  mask |= CLIP_NEAR_BIT;
       }
       clipMask[i] = mask;
       tmpOrMask |= mask;
@@ -230,10 +230,10 @@ static GLvector4f * TAG(cliptest_points2)( GLvector4f *clip_vec,
    STRIDE_LOOP {
       const GLfloat cx = from[0], cy = from[1];
       GLubyte mask = 0;
-      if (cx >  1.0)       mask |= CLIP_RIGHT_BIT;
-      else if (cx < -1.0)  mask |= CLIP_LEFT_BIT;
-      if (cy >  1.0)       mask |= CLIP_TOP_BIT;
-      else if (cy < -1.0)  mask |= CLIP_BOTTOM_BIT;
+      if (cx >  1.0F)       mask |= CLIP_RIGHT_BIT;
+      else if (cx < -1.0F)  mask |= CLIP_LEFT_BIT;
+      if (cy >  1.0F)       mask |= CLIP_TOP_BIT;
+      else if (cy < -1.0F)  mask |= CLIP_BOTTOM_BIT;
       clipMask[i] = mask;
       tmpOrMask |= mask;
       tmpAndMask &= mask;
diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c
index 6a42c6c0ed2..6522200b345 100644
--- a/src/mesa/math/m_matrix.c
+++ b/src/mesa/math/m_matrix.c
@@ -380,7 +380,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    if (fabsf(r3[0])>fabsf(r2[0])) SWAP_ROWS(r3, r2);
    if (fabsf(r2[0])>fabsf(r1[0])) SWAP_ROWS(r2, r1);
    if (fabsf(r1[0])>fabsf(r0[0])) SWAP_ROWS(r1, r0);
-   if (0.0 == r0[0])  return GL_FALSE;
+   if (0.0F == r0[0])  return GL_FALSE;
 
    /* eliminate first variable     */
    m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0];
@@ -388,31 +388,31 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s;
    s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s;
    s = r0[4];
-   if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   if (s != 0.0F) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
    s = r0[5];
-   if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   if (s != 0.0F) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
    s = r0[6];
-   if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   if (s != 0.0F) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
    s = r0[7];
-   if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }
+   if (s != 0.0F) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }
 
    /* choose pivot - or die */
    if (fabsf(r3[1])>fabsf(r2[1])) SWAP_ROWS(r3, r2);
    if (fabsf(r2[1])>fabsf(r1[1])) SWAP_ROWS(r2, r1);
-   if (0.0 == r1[1])  return GL_FALSE;
+   if (0.0F == r1[1])  return GL_FALSE;
 
    /* eliminate second variable */
    m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1];
    r2[2] -= m2 * r1[2]; r3[2] -= m3 * r1[2];
    r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3];
-   s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
-   s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
-   s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
-   s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }
+   s = r1[4]; if (0.0F != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   s = r1[5]; if (0.0F != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   s = r1[6]; if (0.0F != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   s = r1[7]; if (0.0F != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }
 
    /* choose pivot - or die */
    if (fabsf(r3[2])>fabsf(r2[2])) SWAP_ROWS(r3, r2);
-   if (0.0 == r2[2])  return GL_FALSE;
+   if (0.0F == r2[2])  return GL_FALSE;
 
    /* eliminate third variable */
    m3 = r3[2]/r2[2];
@@ -421,7 +421,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    r3[7] -= m3 * r2[7];
 
    /* last check */
-   if (0.0 == r3[3]) return GL_FALSE;
+   if (0.0F == r3[3]) return GL_FALSE;
 
    s = 1.0F/r3[3];             /* now back substitute row 3 */
    r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s;
@@ -490,26 +490,26 @@ static GLboolean invert_matrix_3d_general( GLmatrix *mat )
     */
    pos = neg = 0.0;
    t =  MAT(in,0,0) * MAT(in,1,1) * MAT(in,2,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t =  MAT(in,1,0) * MAT(in,2,1) * MAT(in,0,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t =  MAT(in,2,0) * MAT(in,0,1) * MAT(in,1,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,2,0) * MAT(in,1,1) * MAT(in,0,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,1,0) * MAT(in,0,1) * MAT(in,2,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,0,0) * MAT(in,2,1) * MAT(in,1,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    det = pos + neg;
 
-   if (fabsf(det) < 1e-25)
+   if (fabsf(det) < 1e-25F)
       return GL_FALSE;
 
    det = 1.0F / det;
@@ -564,7 +564,7 @@ static GLboolean invert_matrix_3d( GLmatrix *mat )
                        MAT(in,0,1) * MAT(in,0,1) +
                        MAT(in,0,2) * MAT(in,0,2));
 
-      if (scale == 0.0)
+      if (scale == 0.0F)
          return GL_FALSE;
 
       scale = 1.0F / scale;
@@ -799,8 +799,8 @@ _math_matrix_rotate( GLmatrix *mat,
    GLfloat m[16];
    GLboolean optimized;
 
-   s = (GLfloat) sin( angle * M_PI / 180.0 );
-   c = (GLfloat) cos( angle * M_PI / 180.0 );
+   s = sinf( angle * M_PI / 180.0 );
+   c = cosf( angle * M_PI / 180.0 );
 
    memcpy(m, Identity, sizeof(GLfloat)*16);
    optimized = GL_FALSE;
@@ -859,7 +859,7 @@ _math_matrix_rotate( GLmatrix *mat,
    if (!optimized) {
       const GLfloat mag = sqrtf(x * x + y * y + z * z);
 
-      if (mag <= 1.0e-4) {
+      if (mag <= 1.0e-4F) {
          /* no rotation, leave mat as-is */
          return;
       }
@@ -1070,7 +1070,7 @@ _math_matrix_scale( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
    m[2] *= x;   m[6] *= y;   m[10] *= z;
    m[3] *= x;   m[7] *= y;   m[11] *= z;
 
-   if (fabsf(x - y) < 1e-8 && fabsf(x - z) < 1e-8)
+   if (fabsf(x - y) < 1e-8F && fabsf(x - z) < 1e-8F)
       mat->flags |= MAT_FLAG_UNIFORM_SCALE;
    else
       mat->flags |= MAT_FLAG_GENERAL_SCALE;
@@ -1206,7 +1206,7 @@ static void analyse_from_scratch( GLmatrix *mat )
    GLuint i;
 
    for (i = 0 ; i < 16 ; i++) {
-      if (m[i] == 0.0) mask |= (1<<i);
+      if (m[i] == 0.0F) mask |= (1<<i);
    }
 
    if (m[0] == 1.0F) mask |= (1<<16);
@@ -1240,12 +1240,12 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_2D;
 
       /* Check for scale */
-      if (SQ(mm-1) > SQ(1e-6) ||
-	  SQ(m4m4-1) > SQ(1e-6))
+      if (SQ(mm-1) > SQ(1e-6F) ||
+	  SQ(m4m4-1) > SQ(1e-6F))
 	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
 
       /* Check for rotation */
-      if (SQ(mm4) > SQ(1e-6))
+      if (SQ(mm4) > SQ(1e-6F))
 	 mat->flags |= MAT_FLAG_GENERAL_3D;
       else
 	 mat->flags |= MAT_FLAG_ROTATION;
@@ -1255,9 +1255,9 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_3D_NO_ROT;
 
       /* Check for scale */
-      if (SQ(m[0]-m[5]) < SQ(1e-6) &&
-	  SQ(m[0]-m[10]) < SQ(1e-6)) {
-	 if (SQ(m[0]-1.0) > SQ(1e-6)) {
+      if (SQ(m[0]-m[5]) < SQ(1e-6F) &&
+	  SQ(m[0]-m[10]) < SQ(1e-6F)) {
+	 if (SQ(m[0]-1.0F) > SQ(1e-6F)) {
 	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
          }
       }
@@ -1275,8 +1275,8 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_3D;
 
       /* Check for scale */
-      if (SQ(c1-c2) < SQ(1e-6) && SQ(c1-c3) < SQ(1e-6)) {
-	 if (SQ(c1-1.0) > SQ(1e-6))
+      if (SQ(c1-c2) < SQ(1e-6F) && SQ(c1-c3) < SQ(1e-6F)) {
+	 if (SQ(c1-1.0F) > SQ(1e-6F))
 	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
 	 /* else no scale at all */
       }
@@ -1285,10 +1285,10 @@ static void analyse_from_scratch( GLmatrix *mat )
       }
 
       /* Check for rotation */
-      if (SQ(d1) < SQ(1e-6)) {
+      if (SQ(d1) < SQ(1e-6F)) {
 	 CROSS3( cp, m, m+4 );
 	 SUB_3V( cp, cp, (m+8) );
-	 if (LEN_SQUARED_3FV(cp) < SQ(1e-6))
+	 if (LEN_SQUARED_3FV(cp) < SQ(1e-6F))
 	    mat->flags |= MAT_FLAG_ROTATION;
 	 else
 	    mat->flags |= MAT_FLAG_GENERAL_3D;
diff --git a/src/mesa/math/m_norm_tmp.h b/src/mesa/math/m_norm_tmp.h
index d3ec1c22ecd..6f1db8d0bd0 100644
--- a/src/mesa/math/m_norm_tmp.h
+++ b/src/mesa/math/m_norm_tmp.h
@@ -80,7 +80,7 @@ TAG(transform_normalize_normals)( const GLmatrix *mat,
       }
    }
    else {
-      if (scale != 1.0) {
+      if (scale != 1.0f) {
 	 m0 *= scale,  m4 *= scale,  m8 *= scale;
 	 m1 *= scale,  m5 *= scale,  m9 *= scale;
 	 m2 *= scale,  m6 *= scale,  m10 *= scale;

From a562313f378a056c8d886e418b518063ab077c39 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 23:15:42 -0700
Subject: [PATCH 0792/1208] mesa: Avoid double promotion.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/ffvertex_prog.c     | 10 +++++-----
 src/mesa/main/fog.c               |  2 +-
 src/mesa/main/get.c               |  2 +-
 src/mesa/main/light.c             | 30 +++++++++++++++---------------
 src/mesa/main/lines.c             |  4 ++--
 src/mesa/main/multisample.c       |  4 ++--
 src/mesa/main/pack.c              | 14 +++++++-------
 src/mesa/main/pixel.c             |  4 ++--
 src/mesa/main/pixeltransfer.c     |  8 ++++----
 src/mesa/main/points.c            |  8 ++++----
 src/mesa/main/readpix.c           |  4 ++--
 src/mesa/main/samplerobj.c        |  2 +-
 src/mesa/main/texparam.c          |  2 +-
 src/mesa/swrast_setup/ss_tritmp.h |  4 ++--
 14 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index 70adaf88551..95b428dca3e 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -189,15 +189,15 @@ static void make_state_key( struct gl_context *ctx, struct state_key *key )
 	 if (light->Enabled) {
 	    key->unit[i].light_enabled = 1;
 
-	    if (light->EyePosition[3] == 0.0)
+	    if (light->EyePosition[3] == 0.0F)
 	       key->unit[i].light_eyepos3_is_zero = 1;
 
-	    if (light->SpotCutoff == 180.0)
+	    if (light->SpotCutoff == 180.0F)
 	       key->unit[i].light_spotcutoff_is_180 = 1;
 
-	    if (light->ConstantAttenuation != 1.0 ||
-		light->LinearAttenuation != 0.0 ||
-		light->QuadraticAttenuation != 0.0)
+	    if (light->ConstantAttenuation != 1.0F ||
+		light->LinearAttenuation != 0.0F ||
+		light->QuadraticAttenuation != 0.0F)
 	       key->unit[i].light_attenuated = 1;
 	 }
       }
diff --git a/src/mesa/main/fog.c b/src/mesa/main/fog.c
index 3bce289e785..45f343d61c8 100644
--- a/src/mesa/main/fog.c
+++ b/src/mesa/main/fog.c
@@ -115,7 +115,7 @@ _mesa_Fogfv( GLenum pname, const GLfloat *params )
 	 ctx->Fog.Mode = m;
 	 break;
       case GL_FOG_DENSITY:
-	 if (*params<0.0) {
+	 if (*params<0.0F) {
 	    _mesa_error( ctx, GL_INVALID_VALUE, "glFog" );
             return;
 	 }
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 5b308e80668..307a5ffbd1c 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -676,7 +676,7 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       break;
 
    case GL_EDGE_FLAG:
-      v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0;
+      v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0F;
       break;
 
    case GL_READ_BUFFER:
diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c
index 89b1c4b142f..14b4b04162b 100644
--- a/src/mesa/main/light.c
+++ b/src/mesa/main/light.c
@@ -143,7 +143,7 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
       COPY_3V(light->SpotDirection, params);
       break;
    case GL_SPOT_EXPONENT:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       assert(params[0] <= ctx->Const.MaxSpotExponent);
       if (light->SpotExponent == params[0])
 	 return;
@@ -151,12 +151,12 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
       light->SpotExponent = params[0];
       break;
    case GL_SPOT_CUTOFF:
-      assert(params[0] == 180.0 || (params[0] >= 0.0 && params[0] <= 90.0));
+      assert(params[0] == 180.0F || (params[0] >= 0.0F && params[0] <= 90.0F));
       if (light->SpotCutoff == params[0])
          return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->SpotCutoff = params[0];
-      light->_CosCutoff = (GLfloat) (cos(light->SpotCutoff * M_PI / 180.0));
+      light->_CosCutoff = (cosf(light->SpotCutoff * M_PI / 180.0));
       if (light->_CosCutoff < 0)
          light->_CosCutoff = 0;
       if (light->SpotCutoff != 180.0F)
@@ -165,21 +165,21 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
          light->_Flags &= ~LIGHT_SPOT;
       break;
    case GL_CONSTANT_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->ConstantAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->ConstantAttenuation = params[0];
       break;
    case GL_LINEAR_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->LinearAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->LinearAttenuation = params[0];
       break;
    case GL_QUADRATIC_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->QuadraticAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
@@ -238,31 +238,31 @@ _mesa_Lightfv( GLenum light, GLenum pname, const GLfloat *params )
       params = temp;
       break;
    case GL_SPOT_EXPONENT:
-      if (params[0] < 0.0 || params[0] > ctx->Const.MaxSpotExponent) {
+      if (params[0] < 0.0F || params[0] > ctx->Const.MaxSpotExponent) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_SPOT_CUTOFF:
-      if ((params[0] < 0.0 || params[0] > 90.0) && params[0] != 180.0) {
+      if ((params[0] < 0.0F || params[0] > 90.0F) && params[0] != 180.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_CONSTANT_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_LINEAR_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_QUADRATIC_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
@@ -463,14 +463,14 @@ _mesa_LightModelfv( GLenum pname, const GLfloat *params )
       case GL_LIGHT_MODEL_LOCAL_VIEWER:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_pname;
-         newbool = (params[0]!=0.0);
+         newbool = (params[0] != 0.0F);
 	 if (ctx->Light.Model.LocalViewer == newbool)
 	    return;
 	 FLUSH_VERTICES(ctx, _NEW_LIGHT);
 	 ctx->Light.Model.LocalViewer = newbool;
          break;
       case GL_LIGHT_MODEL_TWO_SIDE:
-         newbool = (params[0]!=0.0);
+         newbool = (params[0] != 0.0F);
 	 if (ctx->Light.Model.TwoSide == newbool)
 	    return;
 	 FLUSH_VERTICES(ctx, _NEW_LIGHT);
@@ -975,7 +975,7 @@ compute_light_positions( struct gl_context *ctx )
       }
       else {
          /* positional light w/ homogeneous coordinate, divide by W */
-         GLfloat wInv = (GLfloat)1.0 / light->_Position[3];
+         GLfloat wInv = 1.0F / light->_Position[3];
          light->_Position[0] *= wInv;
          light->_Position[1] *= wInv;
          light->_Position[2] *= wInv;
@@ -1024,7 +1024,7 @@ update_modelview_scale( struct gl_context *ctx )
    if (!_math_matrix_is_length_preserving(ctx->ModelviewMatrixStack.Top)) {
       const GLfloat *m = ctx->ModelviewMatrixStack.Top->inv;
       GLfloat f = m[2] * m[2] + m[6] * m[6] + m[10] * m[10];
-      if (f < 1e-12) f = 1.0;
+      if (f < 1e-12f) f = 1.0f;
       if (ctx->_NeedEyeCoords)
 	 ctx->_ModelViewInvScale = 1.0f / sqrtf(f);
       else
diff --git a/src/mesa/main/lines.c b/src/mesa/main/lines.c
index 3c08ed2e713..c020fb3eb9e 100644
--- a/src/mesa/main/lines.c
+++ b/src/mesa/main/lines.c
@@ -45,7 +45,7 @@ _mesa_LineWidth( GLfloat width )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glLineWidth %f\n", width);
 
-   if (width<=0.0) {
+   if (width <= 0.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" );
       return;
    }
@@ -63,7 +63,7 @@ _mesa_LineWidth( GLfloat width )
    if (ctx->API == API_OPENGL_CORE
        && ((ctx->Const.ContextFlags & GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT)
            != 0)
-       && width > 1.0) {
+       && width > 1.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" );
       return;
    }
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 816837b95bd..490bad507c3 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -43,7 +43,7 @@ _mesa_SampleCoverage(GLclampf value, GLboolean invert)
 
    FLUSH_VERTICES(ctx, 0);
 
-   ctx->Multisample.SampleCoverageValue = (GLfloat) CLAMP(value, 0.0, 1.0);
+   ctx->Multisample.SampleCoverageValue = CLAMP(value, 0.0f, 1.0f);
    ctx->Multisample.SampleCoverageInvert = invert;
    ctx->NewState |= _NEW_MULTISAMPLE;
 }
@@ -134,7 +134,7 @@ _mesa_MinSampleShading(GLclampf value)
 
    FLUSH_VERTICES(ctx, 0);
 
-   ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0, 1.0);
+   ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0f, 1.0f);
    ctx->NewState |= _NEW_MULTISAMPLE;
 }
 
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index f72360817e9..fb642b85be8 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -796,7 +796,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
     * back to an int type can introduce errors that will show up as
     * artifacts in things like depth peeling which uses glCopyTexImage.
     */
-   if (ctx->Pixel.DepthScale == 1.0 && ctx->Pixel.DepthBias == 0.0) {
+   if (ctx->Pixel.DepthScale == 1.0F && ctx->Pixel.DepthBias == 0.0F) {
       if (srcType == GL_UNSIGNED_INT && dstType == GL_UNSIGNED_SHORT) {
          const GLuint *src = (const GLuint *) source;
          GLushort *dst = (GLushort *) dest;
@@ -874,8 +874,8 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
       case GL_UNSIGNED_INT_24_8_EXT: /* GL_EXT_packed_depth_stencil */
          if (dstType == GL_UNSIGNED_INT_24_8_EXT &&
              depthMax == 0xffffff &&
-             ctx->Pixel.DepthScale == 1.0 &&
-             ctx->Pixel.DepthBias == 0.0) {
+             ctx->Pixel.DepthScale == 1.0F &&
+             ctx->Pixel.DepthBias == 0.0F) {
             const GLuint *src = (const GLuint *) source;
             GLuint *zValues = (GLuint *) dest;
             GLuint i;
@@ -945,7 +945,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
    {
       const GLfloat scale = ctx->Pixel.DepthScale;
       const GLfloat bias = ctx->Pixel.DepthBias;
-      if (scale != 1.0 || bias != 0.0) {
+      if (scale != 1.0F || bias != 0.0F) {
          GLuint i;
          for (i = 0; i < n; i++) {
             depthValues[i] = depthValues[i] * scale + bias;
@@ -958,7 +958,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
    if (needClamp) {
       GLuint i;
       for (i = 0; i < n; i++) {
-         depthValues[i] = (GLfloat)CLAMP(depthValues[i], 0.0, 1.0);
+         depthValues[i] = CLAMP(depthValues[i], 0.0F, 1.0F);
       }
    }
 
@@ -1025,7 +1025,7 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest,
       return;
    }
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) {
       memcpy(depthCopy, depthSpan, n * sizeof(GLfloat));
       _mesa_scale_and_bias_depth(ctx, n, depthCopy);
       depthSpan = depthCopy;
@@ -1153,7 +1153,7 @@ _mesa_pack_depth_stencil_span(struct gl_context *ctx,GLuint n,
       return;
    }
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) {
       memcpy(depthCopy, depthVals, n * sizeof(GLfloat));
       _mesa_scale_and_bias_depth(ctx, n, depthCopy);
       depthVals = depthCopy;
diff --git a/src/mesa/main/pixel.c b/src/mesa/main/pixel.c
index ecda2694fc8..608a5454702 100644
--- a/src/mesa/main/pixel.c
+++ b/src/mesa/main/pixel.c
@@ -455,12 +455,12 @@ _mesa_GetnPixelMapusvARB( GLenum map, GLsizei bufSize, GLushort *values )
    /* special cases */
    case GL_PIXEL_MAP_I_TO_I:
       for (i = 0; i < mapsize; i++) {
-         values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0, 65535.);
+         values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0F, 65535.0F);
       }
       break;
    case GL_PIXEL_MAP_S_TO_S:
       for (i = 0; i < mapsize; i++) {
-         values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0, 65535.);
+         values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0F, 65535.0F);
       }
       break;
    default:
diff --git a/src/mesa/main/pixeltransfer.c b/src/mesa/main/pixeltransfer.c
index 94464ea6709..51f2ebf768f 100644
--- a/src/mesa/main/pixeltransfer.c
+++ b/src/mesa/main/pixeltransfer.c
@@ -47,25 +47,25 @@ _mesa_scale_and_bias_rgba(GLuint n, GLfloat rgba[][4],
                           GLfloat rBias, GLfloat gBias,
                           GLfloat bBias, GLfloat aBias)
 {
-   if (rScale != 1.0 || rBias != 0.0) {
+   if (rScale != 1.0F || rBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][RCOMP] = rgba[i][RCOMP] * rScale + rBias;
       }
    }
-   if (gScale != 1.0 || gBias != 0.0) {
+   if (gScale != 1.0F || gBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][GCOMP] = rgba[i][GCOMP] * gScale + gBias;
       }
    }
-   if (bScale != 1.0 || bBias != 0.0) {
+   if (bScale != 1.0F || bBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][BCOMP] = rgba[i][BCOMP] * bScale + bBias;
       }
    }
-   if (aScale != 1.0 || aBias != 0.0) {
+   if (aScale != 1.0F || aBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][ACOMP] = rgba[i][ACOMP] * aScale + aBias;
diff --git a/src/mesa/main/points.c b/src/mesa/main/points.c
index 5ad1f38f366..863e3c1af32 100644
--- a/src/mesa/main/points.c
+++ b/src/mesa/main/points.c
@@ -45,7 +45,7 @@ _mesa_PointSize( GLfloat size )
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (size <= 0.0) {
+   if (size <= 0.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glPointSize" );
       return;
    }
@@ -119,9 +119,9 @@ _mesa_PointParameterfv( GLenum pname, const GLfloat *params)
             return;
          FLUSH_VERTICES(ctx, _NEW_POINT);
          COPY_3V(ctx->Point.Params, params);
-         ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0 ||
-                                   ctx->Point.Params[1] != 0.0 ||
-                                   ctx->Point.Params[2] != 0.0);
+         ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0F ||
+                                   ctx->Point.Params[1] != 0.0F ||
+                                   ctx->Point.Params[2] != 0.0F);
          break;
       case GL_POINT_SIZE_MIN_EXT:
          if (params[0] < 0.0F) {
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 9d72c1c31b0..2744232f906 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -268,7 +268,7 @@ read_uint_depth_pixels( struct gl_context *ctx,
    GLubyte *map, *dst;
    int stride, dstStride, j;
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0)
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F)
       return GL_FALSE;
 
    if (packing->SwapBytes)
@@ -821,7 +821,7 @@ read_depth_stencil_pixels(struct gl_context *ctx,
                           const struct gl_pixelstore_attrib *packing )
 {
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
+      = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F;
    const GLboolean stencilTransfer = ctx->Pixel.IndexShift
       || ctx->Pixel.IndexOffset || ctx->Pixel.MapStencilFlag;
    GLubyte *dst;
diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c
index d023fefabff..32180fb1ba2 100644
--- a/src/mesa/main/samplerobj.c
+++ b/src/mesa/main/samplerobj.c
@@ -689,7 +689,7 @@ set_sampler_max_anisotropy(struct gl_context *ctx,
    if (samp->MaxAnisotropy == param)
       return GL_FALSE;
 
-   if (param < 1.0)
+   if (param < 1.0F)
       return INVALID_VALUE;
 
    flush(ctx);
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index efdeed529fa..88c2c142804 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -683,7 +683,7 @@ set_tex_parameterf(struct gl_context *ctx,
 
          if (texObj->Sampler.MaxAnisotropy == params[0])
             return GL_FALSE;
-         if (params[0] < 1.0) {
+         if (params[0] < 1.0F) {
             _mesa_error(ctx, GL_INVALID_VALUE, "glTex%sParameter(param)",
                         suffix);
             return GL_FALSE;
diff --git a/src/mesa/swrast_setup/ss_tritmp.h b/src/mesa/swrast_setup/ss_tritmp.h
index c38c76a4adb..adb77bd3247 100644
--- a/src/mesa/swrast_setup/ss_tritmp.h
+++ b/src/mesa/swrast_setup/ss_tritmp.h
@@ -58,7 +58,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e
 
       if (IND & (SS_TWOSIDE_BIT | SS_UNFILLED_BIT))
       {
-	 facing = (cc < 0.0) ^ ctx->Polygon._FrontBit;
+	 facing = (cc < 0.0F) ^ ctx->Polygon._FrontBit;
 
 	 if (IND & SS_UNFILLED_BIT)
 	    mode = facing ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
@@ -138,7 +138,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e
           * so no MRD value is used here.
           */
 	 offset = ctx->Polygon.OffsetUnits;
-	 if (cc * cc > 1e-16) {
+	 if (cc * cc > 1e-16F) {
 	    const GLfloat ez = z[0] - z[2];
 	    const GLfloat fz = z[1] - z[2];
 	    const GLfloat oneOverArea = 1.0F / cc;

From 23bba717e1178d54927c4968a0466d706a630432 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 15 Jul 2015 21:29:21 -0700
Subject: [PATCH 0793/1208] glsl: Avoid double promotion.

---
 src/glsl/ir_constant_expression.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 7a38fa42193..2853c1643b3 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -1662,10 +1662,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
             if (!isnormal(data.d[c]))
                data.d[c] = copysign(0.0, op[0]->value.d[c]);
          } else {
-            data.f[c] = ldexp(op[0]->value.f[c], op[1]->value.i[c]);
+            data.f[c] = ldexpf(op[0]->value.f[c], op[1]->value.i[c]);
             /* Flush subnormal values to zero. */
             if (!isnormal(data.f[c]))
-               data.f[c] = copysign(0.0f, op[0]->value.f[c]);
+               data.f[c] = copysignf(0.0f, op[0]->value.f[c]);
          }
       }
       break;

From 4fc86f183eee43117925499d8d1315be481ba636 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 17 Jul 2015 18:18:20 +0100
Subject: [PATCH 0794/1208] svga: scons: remove unused HAVE_SYS_TYPES_H define

There isn't a single instance in mesa that
mentions HAVE_SYS_TYPES_H, other than this file.

Cc: Jose Fonseca <jfonseca@vmware.com>
Acked-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/gallium/drivers/svga/SConscript    | 1 -
 src/gallium/winsys/svga/drm/SConscript | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript
index bb4d034f1eb..0ee624616f9 100644
--- a/src/gallium/drivers/svga/SConscript
+++ b/src/gallium/drivers/svga/SConscript
@@ -11,7 +11,6 @@ if env['suncc']:
 if env['gcc'] or env['clang']:
 	env.Append(CPPDEFINES = [
 		'HAVE_STDINT_H', 
-		'HAVE_SYS_TYPES_H',
 	])
 	
 env.Prepend(CPPPATH = [
diff --git a/src/gallium/winsys/svga/drm/SConscript b/src/gallium/winsys/svga/drm/SConscript
index 099acdac8c0..25850531d31 100644
--- a/src/gallium/winsys/svga/drm/SConscript
+++ b/src/gallium/winsys/svga/drm/SConscript
@@ -8,7 +8,6 @@ if env['gcc'] or env['clang'] or env['icc']:
     env.Append(CCFLAGS = ['-fvisibility=hidden'])
     env.Append(CPPDEFINES = [
         'HAVE_STDINT_H', 
-        'HAVE_SYS_TYPES_H',
         '-D_FILE_OFFSET_BITS=64',
     ])
 

From c0731a1b14dc7385f4238b4508b88bfca2ef43cf Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 22 Jul 2015 16:04:28 +0100
Subject: [PATCH 0795/1208] targets/dri: scons: add missing link against libdrm

Otherwise the final dri module will have (additional) unresolved
symbols.

Cc: Brian Paul <brianp@vmware.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviwed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/targets/dri/SConscript | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/targets/dri/SConscript b/src/gallium/targets/dri/SConscript
index 8d29f3beb18..2fb0da09200 100644
--- a/src/gallium/targets/dri/SConscript
+++ b/src/gallium/targets/dri/SConscript
@@ -25,6 +25,8 @@ if env['llvm']:
     env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE')
     env.Prepend(LIBS = [llvmpipe])
 
+env.PkgUseModules('DRM')
+
 env.Append(CPPDEFINES = [
     'GALLIUM_VMWGFX',
     'GALLIUM_SOFTPIPE',

From e933d545997de9e50a8ed5247722c1c786bf4858 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Tue, 28 Jul 2015 19:52:49 -0700
Subject: [PATCH 0796/1208] i965/bxt: Support 3src simd16 instructions

This is easily accomplished by moving simd16 3src to GEN9_FEATURES.

v2: small cleanup to make it more similar to GEN8_FEATURES

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 51a91b6c0d9..6fe6ea2eb56 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -307,6 +307,7 @@ static const struct brw_device_info brw_device_info_chv = {
    .must_use_separate_stencil = true,               \
    .has_llc = true,                                 \
    .has_pln = true,                                 \
+   .supports_simd16_3src = true,                    \
    .max_vs_threads = 280,                           \
    .max_gs_threads = 256,                           \
    .max_wm_threads = 408,                           \
@@ -324,17 +325,14 @@ static const struct brw_device_info brw_device_info_skl_early = {
 
 static const struct brw_device_info brw_device_info_skl_gt1 = {
    GEN9_FEATURES, .gt = 1,
-   .supports_simd16_3src = true,
 };
 
 static const struct brw_device_info brw_device_info_skl_gt2 = {
    GEN9_FEATURES, .gt = 2,
-   .supports_simd16_3src = true,
 };
 
 static const struct brw_device_info brw_device_info_skl_gt3 = {
    GEN9_FEATURES, .gt = 3,
-   .supports_simd16_3src = true,
 };
 
 static const struct brw_device_info brw_device_info_bxt = {

From 2b916c6e47862d82b5545e962ebb83b811904c3b Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 29 Jul 2015 22:20:04 +0200
Subject: [PATCH 0797/1208] c99_math: (trivial) implement exp2 for MSVC too

Unsurprisingly doesn't build otherwise with old msvc.
---
 include/c99_math.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/c99_math.h b/include/c99_math.h
index 0ca5a73b58a..8a67fb133d6 100644
--- a/include/c99_math.h
+++ b/include/c99_math.h
@@ -146,6 +146,12 @@ exp2f(float f)
    return powf(2.0f, f);
 }
 
+static inline double
+exp2(double d)
+{
+   return pow(2.0, d);
+}
+
 #endif /* C99 */
 
 

From 8413822c8cfaf9110625c1a4a66ee916c2a916e3 Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Wed, 29 Jul 2015 20:45:09 +0100
Subject: [PATCH 0798/1208] gallium/auxiliary: Ensure c99_math.h is included.

As it is needed for exp2.

Trivial.
---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index 9e4b1d607d7..59fc291e917 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -26,9 +26,10 @@
 #ifndef RGB9E5_H
 #define RGB9E5_H
 
-#include <math.h>
 #include <assert.h>
 
+#include "c99_math.h"
+
 #define RGB9E5_EXPONENT_BITS          5
 #define RGB9E5_MANTISSA_BITS          9
 #define RGB9E5_EXP_BIAS               15

From 2484263fe97cebc9fa7a5c9de04c757dc6cc7713 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 29 Jul 2015 09:41:18 -0700
Subject: [PATCH 0799/1208] Delete duplicate function is_power_of_two() and use
 _mesa_is_pow_two()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/drivers/common/meta_blit.c                   |  6 +++---
 src/mesa/drivers/dri/i915/i915_texstate.c             |  2 +-
 src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp |  2 +-
 src/mesa/drivers/dri/i965/brw_reg.h                   |  2 +-
 src/mesa/drivers/dri/i965/brw_tex_layout.c            |  8 ++++----
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp      |  2 +-
 src/mesa/drivers/dri/i965/intel_blit.c                |  8 ++++----
 src/mesa/main/macros.h                                | 11 -----------
 8 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 317a304bebf..71d18de87db 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -82,7 +82,7 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
    y_scale = samples * 0.5;
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && is_power_of_two(samples));
+   assert(samples > 0 && _mesa_is_pow_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -263,7 +263,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
    }
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && is_power_of_two(samples));
+   assert(samples > 0 && _mesa_is_pow_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -434,7 +434,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
           * (so the floating point exponent just gets increased), rather than
           * doing a naive sum and dividing.
           */
-         assert(is_power_of_two(samples));
+         assert(_mesa_is_pow_two(samples));
          /* Fetch each individual sample. */
          sample_resolve = rzalloc_size(mem_ctx, 1);
          for (i = 0; i < samples; i++) {
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index aef5ff99eb2..f653f441ad8 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -342,7 +342,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
        * Thus, I guess we need do this for other platforms as well.
        */
       if (tObj->Target == GL_TEXTURE_CUBE_MAP_ARB &&
-          !is_power_of_two(firstImage->Height))
+          !_mesa_is_pow_two(firstImage->Height))
          return false;
 
       state[I915_TEXREG_SS3] = ss3;     /* SS3_NORMALIZED_COORDS */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
index 01d3a569858..96d4f375da2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
@@ -173,7 +173,7 @@ ir_vector_reference_visitor::visit_enter(ir_assignment *ir)
       return visit_continue_with_parent;
    }
    if (ir->lhs->as_dereference_variable() &&
-       is_power_of_two(ir->write_mask) &&
+       _mesa_is_pow_two(ir->write_mask) &&
        !ir->condition) {
       /* If we're writing just a channel, then channel-splitting the LHS is OK.
        */
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index c8b134103bb..4867148c12f 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -853,7 +853,7 @@ static inline struct brw_reg
 spread(struct brw_reg reg, unsigned s)
 {
    if (s) {
-      assert(is_power_of_two(s));
+      assert(_mesa_is_pow_two(s));
 
       if (reg.hstride)
          reg.hstride += cvt(s) - 1;
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 85c0864d2c8..fb78b08b649 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -63,7 +63,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    int i = 0;
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
 
    switch(mt->target) {
    case GL_TEXTURE_1D:
@@ -95,7 +95,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
                align_yf[i] : align_ys[i];
 
-   assert(is_power_of_two(mt->num_samples));
+   assert(_mesa_is_pow_two(mt->num_samples));
 
    switch (mt->num_samples) {
    case 2:
@@ -199,7 +199,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
           mt->target != GL_TEXTURE_1D_ARRAY);
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)) ;
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
 
    switch(mt->target) {
    case GL_TEXTURE_2D:
@@ -226,7 +226,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
                align_yf[i] : align_ys[i];
 
-   assert(is_power_of_two(mt->num_samples));
+   assert(_mesa_is_pow_two(mt->num_samples));
 
    switch (mt->num_samples) {
    case 4:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 0483afc8fe0..92050b94d33 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1545,7 +1545,7 @@ vec4_generator::generate_code(const cfg_t *cfg)
           *
           * where they pack the four bytes from the low and high four DW.
           */
-         assert(is_power_of_two(dst.dw1.bits.writemask) &&
+         assert(_mesa_is_pow_two(dst.dw1.bits.writemask) &&
                 dst.dw1.bits.writemask != 0);
          unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
 
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 4fc3fa803cb..6d92580e725 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -121,14 +121,14 @@ get_tr_horizontal_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
       return 0;
 
    /* Compute array index. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
    i = ffs(bpp / 8) - 1;
 
    align = tr_mode == INTEL_MIPTREE_TRMODE_YF ?
            align_2d_yf[i] :
            4 * align_2d_yf[i];
 
-   assert(is_power_of_two(align));
+   assert(_mesa_is_pow_two(align));
 
    /* XY_FAST_COPY_BLT doesn't support horizontal alignment of 16. */
    if (align == 16)
@@ -150,14 +150,14 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
       return 0;
 
    /* Compute array index. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
    i = ffs(bpp / 8) - 1;
 
    align = tr_mode == INTEL_MIPTREE_TRMODE_YF ?
            align_2d_yf[i] :
            4 * align_2d_yf[i];
 
-   assert(is_power_of_two(align));
+   assert(_mesa_is_pow_two(align));
 
    /* XY_FAST_COPY_BLT doesn't support vertical alignments of 16 and 32. */
    if (align == 16 || align == 32)
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 0608650aeb4..07919a6e1e4 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -678,17 +678,6 @@ minify(unsigned value, unsigned levels)
     return MAX2(1, value >> levels);
 }
 
-/**
- * Return true if the given value is a power of two.
- *
- * Note that this considers 0 a power of two.
- */
-static inline bool
-is_power_of_two(unsigned value)
-{
-   return (value & (value - 1)) == 0;
-}
-
 /**
  * Align a value up to an alignment value
  *

From 92994742d079bffdc4c25fdc5a22c7438b7da9c7 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 29 Jul 2015 09:57:26 -0700
Subject: [PATCH 0800/1208] i965: Change the type of max_{vs, hs, ...}_threads
 variables to unsigned

Fixes following compiler warning:
brw_cs.cpp:386:27: warning: comparison between signed and unsigned
integer expressions [-Wsign-compare]

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_context.h | 12 ++++++------
 src/mesa/drivers/dri/i965/brw_cs.cpp    |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 8bbeb34075c..cd43ac5114e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1241,12 +1241,12 @@ struct brw_context
     * Platform specific constants containing the maximum number of threads
     * for each pipeline stage.
     */
-   int max_vs_threads;
-   int max_hs_threads;
-   int max_ds_threads;
-   int max_gs_threads;
-   int max_wm_threads;
-   int max_cs_threads;
+   unsigned max_vs_threads;
+   unsigned max_hs_threads;
+   unsigned max_ds_threads;
+   unsigned max_gs_threads;
+   unsigned max_wm_threads;
+   unsigned max_cs_threads;
 
    /* BRW_NEW_URB_ALLOCATIONS:
     */
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index d61bba002c4..29ee75b1e1a 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -82,7 +82,7 @@ brw_cs_emit(struct brw_context *brw,
    prog_data->local_size[0] = cp->LocalSize[0];
    prog_data->local_size[1] = cp->LocalSize[1];
    prog_data->local_size[2] = cp->LocalSize[2];
-   int local_workgroup_size =
+   unsigned local_workgroup_size =
       cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
 
    cfg_t *cfg = NULL;

From c73a13e9538cab1772b71fb5599e4944c540412e Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 29 Jul 2015 10:15:03 -0700
Subject: [PATCH 0801/1208] Delete unused functions in format parser

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Neil Roberts <neil@linux.intel.com>
---
 src/mesa/main/format_parser.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/mesa/main/format_parser.py b/src/mesa/main/format_parser.py
index 11184f78e2c..799b14f0b1c 100755
--- a/src/mesa/main/format_parser.py
+++ b/src/mesa/main/format_parser.py
@@ -40,9 +40,6 @@ SRGB = 'srgb'
 YUV = 'yuv'
 ZS = 'zs'
 
-def is_power_of_two(x):
-   return not bool(x & (x - 1))
-
 VERY_LARGE = 99999999999999999999999
 
 class Channel:
@@ -100,10 +97,6 @@ class Channel:
       else:
          return 1
 
-   def is_power_of_two(self):
-      """Returns true if the size of this channel is a power of two."""
-      return is_power_of_two(self.size)
-
    def datatype(self):
       """Returns the datatype corresponding to a channel type and size"""
       return _get_datatype(self.type, self.size)

From 3c73c418713adec52389e2723e38bf47df13a24b Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 20 Jul 2015 02:37:14 +0100
Subject: [PATCH 0802/1208] radeonsi: add GS multiple streams support (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the final piece for ARB_gpu_shader5,

The code is based on the r600 code from Glenn Kennard,
and myself.

While developing this, I'm not 100% sure of all the calculations
made in the GS registers, this is why the max_stream is worked
out there and used to limit the changes in registers. Otherwise
my initial attempts either regressed GS texelFetch tests
or primitive-id-restart. The current code has no regressions
in piglit.

This commit doesn't enable ARB_gpu_shader5, since that just
bumps the glsl level to 4.00, so I'll just do a separate patch
for 4.10.

v1.1: fix bug introduced in rebase.
v2: Address Marek's review comments,
remove my llvm stream code for simpler C,
move gsvs_ring and gs_next_vertex to arrays.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c |  4 +-
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 74 ++++++++++++++----
 src/gallium/drivers/radeonsi/si_state.c       |  4 -
 src/gallium/drivers/radeonsi/si_state.h       |  7 +-
 .../drivers/radeonsi/si_state_shaders.c       | 75 +++++++++++++++----
 6 files changed, 127 insertions(+), 39 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2e2a35b2280..14bb6e1fa21 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
-			unsigned element_size, unsigned index_stride)
+			unsigned element_size, unsigned index_stride, uint64_t offset)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
@@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 	if (buffer) {
 		uint64_t va;
 
-		va = r600_resource(buffer)->gpu_address;
+		va = r600_resource(buffer)->gpu_address + offset;
 
 		switch (element_size) {
 		default:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 808b9bcdc84..a120282e973 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
 		return 4095;
 	case PIPE_CAP_MAX_VERTEX_STREAMS:
-		return 1;
+		return 4;
 
 	case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
 		return 2048;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index fa31f734ff2..d8bab875adb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -31,6 +31,7 @@
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_flow.h"
 #include "radeon/r600_cs.h"
 #include "radeon/radeon_llvm.h"
@@ -87,8 +88,8 @@ struct si_shader_context
 	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
-	LLVMValueRef gsvs_ring;
-	LLVMValueRef gs_next_vertex;
+	LLVMValueRef gsvs_ring[4];
+	LLVMValueRef gs_next_vertex[4];
 };
 
 static struct si_shader_context * si_shader_context(
@@ -1576,6 +1577,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 	LLVMValueRef can_emit =
 		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
 
+	LLVMValueRef stream_id =
+		unpack_param(shader, shader->param_streamout_config, 24, 2);
+
 	/* Emit the streamout code conditionally. This actually avoids
 	 * out-of-bounds buffer access. The hw tells us via the SGPR
 	 * (so_vtx_count) which threads are allowed to emit streamout data. */
@@ -1615,7 +1619,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 			unsigned reg = so->output[i].register_index;
 			unsigned start = so->output[i].start_component;
 			unsigned num_comps = so->output[i].num_components;
+			unsigned stream = so->output[i].stream;
 			LLVMValueRef out[4];
+			struct lp_build_if_state if_ctx_stream;
 
 			assert(num_comps && num_comps <= 4);
 			if (!num_comps || num_comps > 4)
@@ -1649,11 +1655,18 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 				break;
 			}
 
+			LLVMValueRef can_emit_stream =
+				LLVMBuildICmp(builder, LLVMIntEQ,
+					      stream_id,
+					      lp_build_const_int32(gallivm, stream), "");
+
+			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
 			build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
 						   vdata, num_comps,
 						   so_write_offset[buf_idx],
 						   LLVMConstInt(i32, 0, 0),
 						   so->output[i].dst_offset*4);
+			lp_build_endif(&if_ctx_stream);
 		}
 	}
 	lp_build_endif(&if_ctx);
@@ -3188,6 +3201,19 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	}
 }
 
+static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
+				       struct lp_build_emit_data *emit_data)
+{
+	LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
+	struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
+	unsigned stream;
+
+	assert(src0.File == TGSI_FILE_IMMEDIATE);
+
+	stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
+	return stream;
+}
+
 /* Emit one vertex from the geometry shader */
 static void si_llvm_emit_vertex(
 	const struct lp_build_tgsi_action *action,
@@ -3207,9 +3233,14 @@ static void si_llvm_emit_vertex(
 	LLVMValueRef args[2];
 	unsigned chan;
 	int i;
+	unsigned stream;
+
+	stream = si_llvm_get_stream(bld_base, emit_data);
 
 	/* Write vertex attribute values to GSVS ring */
-	gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
+	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
+				       si_shader_ctx->gs_next_vertex[stream],
+				       "");
 
 	/* If this thread has already emitted the declared maximum number of
 	 * vertices, kill it: excessive vertex emissions are not supposed to
@@ -3222,6 +3253,7 @@ static void si_llvm_emit_vertex(
 	kill = lp_build_select(&bld_base->base, can_emit,
 			       lp_build_const_float(gallivm, 1.0f),
 			       lp_build_const_float(gallivm, -1.0f));
+
 	build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
 			LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
 
@@ -3241,7 +3273,7 @@ static void si_llvm_emit_vertex(
 			out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
 
 			build_tbuffer_store(si_shader_ctx,
-					    si_shader_ctx->gsvs_ring,
+					    si_shader_ctx->gsvs_ring[stream],
 					    out_val, 1,
 					    voffset, soffset, 0,
 					    V_008F0C_BUF_DATA_FORMAT_32,
@@ -3251,10 +3283,11 @@ static void si_llvm_emit_vertex(
 	}
 	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
 				      lp_build_const_int32(gallivm, 1));
-	LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex);
+
+	LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]);
 
 	/* Signal vertex emission */
-	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS);
+	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
 	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
@@ -3270,9 +3303,11 @@ static void si_llvm_emit_primitive(
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef args[2];
+	unsigned stream;
 
 	/* Signal primitive cut */
-	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS);
+	stream = si_llvm_get_stream(bld_base, emit_data);
+	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
 	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
@@ -3651,13 +3686,21 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
 	}
 
-	if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY ||
-	    si_shader_ctx->shader->is_gs_copy_shader) {
+	if (si_shader_ctx->shader->is_gs_copy_shader) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 
-		si_shader_ctx->gsvs_ring =
+		si_shader_ctx->gsvs_ring[0] =
 			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
 	}
+	if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+		int i;
+		for (i = 0; i < 4; i++) {
+			LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
+
+			si_shader_ctx->gsvs_ring[i] =
+				build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+		}
+	}
 }
 
 void si_shader_binary_read_config(const struct si_screen *sscreen,
@@ -3838,7 +3881,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	preload_streamout_buffers(si_shader_ctx);
 	preload_ring_buffers(si_shader_ctx);
 
-	args[0] = si_shader_ctx->gsvs_ring;
+	args[0] = si_shader_ctx->gsvs_ring[0];
 	args[1] = lp_build_mul_imm(uint,
 				   LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 						si_shader_ctx->param_vertex_id),
@@ -4076,9 +4119,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	preload_ring_buffers(&si_shader_ctx);
 
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-		si_shader_ctx.gs_next_vertex =
-			lp_build_alloca(bld_base->base.gallivm,
-					bld_base->uint_bld.elem_type, "");
+		int i;
+		for (i = 0; i < 4; i++) {
+			si_shader_ctx.gs_next_vertex[i] =
+				lp_build_alloca(bld_base->base.gallivm,
+						bld_base->uint_bld.elem_type, "");
+		}
 	}
 
 	if (!lp_build_tgsi_llvm(bld_base, tokens)) {
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index ab5c3ca2a03..86e1624f3d3 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3138,10 +3138,6 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0);
 	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
-	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0);
-	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0);
-	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0);
-
 	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
 	si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 25220534eb4..e4d859a4fb7 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -148,7 +148,10 @@ struct si_shader_data {
 #define SI_RING_TESS_FACTOR	0 /* for HS (TCS)  */
 #define SI_RING_ESGS		0 /* for ES, GS */
 #define SI_RING_GSVS		1 /* for GS, VS */
-#define SI_NUM_RING_BUFFERS	2
+#define SI_RING_GSVS_1		2 /* 1, 2, 3 for GS */
+#define SI_RING_GSVS_2		3
+#define SI_RING_GSVS_3		4
+#define SI_NUM_RING_BUFFERS	5
 #define SI_SO_BUF_OFFSET	SI_NUM_RING_BUFFERS
 #define SI_NUM_RW_BUFFERS	(SI_SO_BUF_OFFSET + 4)
 
@@ -249,7 +252,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
-			unsigned element_size, unsigned index_stride);
+			unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 18bddfda265..1a6854e069d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -206,16 +206,32 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }
 
+static unsigned si_gs_get_max_stream(struct si_shader *shader)
+{
+	struct pipe_stream_output_info *so = &shader->selector->so;
+	unsigned max_stream = 0, i;
+
+	if (so->num_outputs == 0)
+		return 0;
+
+	for (i = 0; i < so->num_outputs; i++) {
+		if (so->output[i].stream > max_stream)
+			max_stream = so->output[i].stream;
+	}
+	return max_stream;
+}
+
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2);
+	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
+	unsigned max_stream = si_gs_get_max_stream(shader);
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -243,16 +259,19 @@ static void si_shader_gs(struct si_shader *shader)
 		       S_028A40_GS_WRITE_OPTIMIZE(1));
 
 	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
-	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize);
-	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);
+	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
+	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 
 	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
 		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
-	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
+	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
 
-	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);
+	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
+	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
+	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
+	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
 
 	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
 		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
@@ -1001,15 +1020,42 @@ static void si_init_gs_rings(struct si_context *sctx)
 
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
 			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64);
+			   true, true, 4, 64, 0);
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
 			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0);
+			   false, false, 0, 0, 0);
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
 			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0);
+			   false, false, 0, 0, 0);
 }
 
+static void si_update_gs_rings(struct si_context *sctx)
+{
+	unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16;
+	unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices;
+	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+	uint64_t offset;
+
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, 0);
+
+	offset = gsvs_itemsize * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+	offset = (gsvs_itemsize * 2) * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+	offset = (gsvs_itemsize * 3) * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+}
 /**
  * @returns 1 if \p sel has been updated to use a new scratch buffer and 0
  *          otherwise.
@@ -1171,7 +1217,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
 
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
 			   SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
-			   sctx->tf_ring->width0, false, false, 0, 0);
+			   sctx->tf_ring->width0, false, false, 0, 0, 0);
 
 	sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 }
@@ -1252,7 +1298,7 @@ static void si_update_so(struct si_context *sctx, struct si_shader_selector *sha
 	int i;
 
 	for (i = 0; i < so->num_outputs; i++)
-		enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
+		enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4);
 	sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
 	sctx->b.streamout.stride_in_dw = shader->so.stride;
 }
@@ -1311,15 +1357,12 @@ void si_update_shaders(struct si_context *sctx)
 
 		if (!sctx->gs_rings)
 			si_init_gs_rings(sctx);
+
 		if (sctx->emitted.named.gs_rings != sctx->gs_rings)
 			sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 		si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings);
 
-		si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
-				   sctx->gsvs_ring,
-				   sctx->gs_shader->gs_max_out_vertices *
-				   sctx->gs_shader->info.num_outputs * 16,
-				   64, true, true, 4, 16);
+		si_update_gs_rings(sctx);
 	} else {
 		si_pm4_bind_state(sctx, gs_rings, NULL);
 		si_pm4_bind_state(sctx, gs, NULL);

From af1e6aa75b7f518cc6b08717fa8844370be3f05c Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 22 Jul 2015 01:24:39 +0100
Subject: [PATCH 0803/1208] radeonsi: enable GL4.1 and update documentation
 (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This enables GL4.1 for radeonsi, and updates the
docs in the correct places.

v2: enable only for llvm 3.7 which has fixes in place.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                           | 16 ++++++++--------
 docs/relnotes/10.7.0.html              |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 56753fddf4d..dd4728ce843 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -96,18 +96,18 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0
 
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_draw_indirect                                 DONE (i965, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965)
+  GL_ARB_gpu_shader5                                   DONE (i965, radeonsi)
   - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (r600, radeonsi, softpipe)
-  - Dynamically uniform UBO array indices              DONE (r600, radeonsi)
+  - Dynamically uniform sampler array indices          DONE (r600, softpipe)
+  - Dynamically uniform UBO array indices              DONE (r600)
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (r600, radeonsi, softpipe)
-  - Enhanced textureGather                             DONE (r600, radeonsi, softpipe)
-  - Geometry shader instancing                         DONE (r600, radeonsi, llvmpipe, softpipe)
+  - Packing/bitfield/conversion functions              DONE (r600, softpipe)
+  - Enhanced textureGather                             DONE (r600, softpipe)
+  - Geometry shader instancing                         DONE (r600, llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE (r600, radeonsi)
-  - Interpolation functions                            DONE (r600, radeonsi)
+  - Enhanced per-sample shading                        DONE (r600)
+  - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (radeonsi, llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, r600, radeonsi)
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
index 2df18c0532d..92ae20fe964 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/10.7.0.html
@@ -50,6 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
+<li>GL_ARB_gpu_shader5 on radeonsi</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_shader_subroutine on core profile all drivers</li>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index a120282e973..82a6033f694 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -272,7 +272,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 4;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
-		return 330;
+		return HAVE_LLVM >= 0x0307 ? 410 : 330;
 
 	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 		return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF);

From 1b2b0e42ce47bfd1fcb5513ed2c23b9bb7a5a5b8 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 29 Jul 2015 10:51:46 +0100
Subject: [PATCH 0804/1208] docs: consolidate radeonsi in GL3.txt

move into DONE for GL4.0 and GL4.1

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index dd4728ce843..0220f24fa7e 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -92,10 +92,10 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
   GL_ARB_vertex_type_2_10_10_10_rev                     DONE ()
 
 
-GL 4.0, GLSL 4.00 --- all DONE: nvc0
+GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi
 
-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                 DONE (i965, r600, llvmpipe, softpipe)
   GL_ARB_gpu_shader5                                   DONE (i965, radeonsi)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600, softpipe)
@@ -109,26 +109,26 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0
   - Enhanced per-sample shading                        DONE (r600)
   - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (radeonsi, llvmpipe, softpipe)
+  GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50, r600, radeonsi)
-  GL_ARB_shader_subroutine                             DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (radeonsi)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, r600, radeonsi)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_shader_subroutine                             DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                           DONE ()
+  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, r600, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                             DONE (i965, nv50, r600)
+  GL_ARB_transform_feedback2                           DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                           DONE (i965, nv50, r600, llvmpipe, softpipe)
 
 
-GL 4.1, GLSL 4.10 --- all DONE: nvc0
+GL 4.1, GLSL 4.10 --- all DONE: nvc0, radeonsi
 
-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                             DONE (i965, nv50, r600, llvmpipe, softpipe)
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
-  GL_ARB_vertex_attrib_64bit                           DONE (radeonsi, llvmpipe, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, r600, radeonsi, llvmpipe)
+  GL_ARB_vertex_attrib_64bit                           DONE (llvmpipe, softpipe)
+  GL_ARB_viewport_array                                DONE (i965, nv50, r600, llvmpipe)
 
 
 GL 4.2, GLSL 4.20:

From 75a96cedf7b0e5613560be0962dec973a4d2f2fe Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 4 Jul 2015 15:43:15 +1000
Subject: [PATCH 0805/1208] glsl: set stage flag for structs and arrays in
 resource list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes the remaining failing tests in:
ES31-CTS.program_interface_query.uniform-types

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/linker.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a7812116944..a16dab4ddcd 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3129,9 +3129,19 @@ build_stageref(struct gl_shader_program *shProg, const char *name)
        */
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if (var && strcmp(var->name, name) == 0) {
-            stages |= (1 << i);
-            break;
+         if (var) {
+            unsigned baselen = strlen(var->name);
+            if (strncmp(var->name, name, baselen) == 0) {
+               /* Check for exact name matches but also check for arrays and
+                * structs.
+                */
+               if (name[baselen] == '\0' ||
+                   name[baselen] == '[' ||
+                   name[baselen] == '.') {
+                  stages |= (1 << i);
+                  break;
+               }
+            }
          }
       }
    }

From 3a21e4bd263002dd600e7a693536c93f68b285a5 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Wed, 29 Jul 2015 17:40:37 +0100
Subject: [PATCH 0806/1208] i965/bxt: Don't use brw_device_info_skl_early on
 BXT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously it could end up using the “SKL early” device on BXT
depending on the revision number. This would probably break things
because for example has_llc would be wrong.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 6fe6ea2eb56..2c5d778dbd2 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -365,7 +365,9 @@ brw_get_device_info(int devid, int revision)
       return NULL;
    }
 
-   if (devinfo->gen == 9 && (revision == 2 || revision == 3 || revision == -1))
+   if (devinfo->gen == 9 &&
+       !devinfo->is_broxton &&
+       (revision == 2 || revision == 3 || revision == -1))
       return &brw_device_info_skl_early;
 
    return devinfo;

From 8cd2f88845acd45ebcbaae2e68a8a47b3c17e6d5 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 25 Jul 2015 12:33:53 +1000
Subject: [PATCH 0807/1208] mesa: fix and simplify resource query for arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This removes the need for multiple functions designed to validate an array
subscript and replaces them with a call to a single function.

The change also means that validation is now only done once and the index
is retrived at the same time, as a result the getUniformLocation code can
be simplified saving an extra hash table lookup (and yet another
validation call).

This chage also fixes some tests in:
ES31-CTS.program_interface_query.uniform

V3: rebase on subroutines, and move the resource index array == 0
check into _mesa_GetProgramResourceIndex() to simplify things further

V2: Fix bounds checks for program input/output, split unrelated comment fix
and _mesa_get_uniform_location() removal into their own patch.

Cc: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/program_resource.c |  14 ++-
 src/mesa/main/shader_query.cpp   | 174 +++++++++++++++++--------------
 src/mesa/main/shaderapi.c        |   2 +-
 src/mesa/main/shaderapi.h        |   3 +-
 src/mesa/main/uniforms.c         |   5 +-
 5 files changed, 106 insertions(+), 92 deletions(-)

diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 641ef22c1a6..fdbd5b3a361 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -229,6 +229,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+   unsigned array_index = 0;
    struct gl_program_resource *res;
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
@@ -268,13 +269,10 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    case GL_PROGRAM_OUTPUT:
    case GL_UNIFORM:
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      /* Validate name syntax for array variables */
-      if (!valid_program_resource_index_name(name))
-         return GL_INVALID_INDEX;
-      /* fall-through */
    case GL_UNIFORM_BLOCK:
-      res = _mesa_program_resource_find_name(shProg, programInterface, name);
-      if (!res)
+      res = _mesa_program_resource_find_name(shProg, programInterface, name,
+                                             &array_index);
+      if (!res || array_index > 0)
          return GL_INVALID_INDEX;
 
       return _mesa_program_resource_index(shProg, res);
@@ -403,7 +401,7 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocation");
 
-   if (!shProg || !name || invalid_array_element_syntax(name))
+   if (!shProg || !name)
       return -1;
 
    /* Validate programInterface. */
@@ -453,7 +451,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocationIndex");
 
-   if (!shProg || !name || invalid_array_element_syntax(name))
+   if (!shProg || !name)
       return -1;
 
    /* From the GL_ARB_program_interface_query spec:
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 3e52a098a67..dd11b819c1c 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -44,7 +44,8 @@ extern "C" {
 
 static GLint
 program_resource_location(struct gl_shader_program *shProg,
-                          struct gl_program_resource *res, const char *name);
+                          struct gl_program_resource *res, const char *name,
+                          unsigned array_index);
 
 /**
  * Declare convenience functions to return resource data in a given type.
@@ -272,13 +273,15 @@ _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
    if (shProg->_LinkedShaders[MESA_SHADER_VERTEX] == NULL)
       return -1;
 
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name);
+      _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name,
+                                       &array_index);
 
    if (!res)
       return -1;
 
-   GLint loc = program_resource_location(shProg, res, name);
+   GLint loc = program_resource_location(shProg, res, name, array_index);
 
    /* The extra check against against 0 is made because of builtin-attribute
     * locations that have offset applied. Function program_resource_location
@@ -456,13 +459,15 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
    if (shProg->_LinkedShaders[MESA_SHADER_FRAGMENT] == NULL)
       return -1;
 
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name);
+      _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name,
+                                       &array_index);
 
    if (!res)
       return -1;
 
-   GLint loc = program_resource_location(shProg, res, name);
+   GLint loc = program_resource_location(shProg, res, name, array_index);
 
    /* The extra check against against 0 is made because of builtin-attribute
     * locations that have offset applied. Function program_resource_location
@@ -552,39 +557,32 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
    return 0;
 }
 
-static int
-array_index_of_resource(struct gl_program_resource *res,
-                        const char *name)
+/**
+ * Checks if array subscript is valid and if so sets array_index.
+ */
+static bool
+valid_array_index(const GLchar *name, unsigned *array_index)
 {
-   assert(res->Data);
+   unsigned idx = 0;
+   const GLchar *out_base_name_end;
 
-   switch (res->Type) {
-   case GL_PROGRAM_INPUT:
-   case GL_PROGRAM_OUTPUT:
-      return get_matching_index(RESOURCE_VAR(res), name);
-   default:
-      assert(!"support for resource type not implemented");
-      return -1;
-   }
+   idx = parse_program_resource_name(name, &out_base_name_end);
+   if (idx < 0)
+      return false;
+
+   if (array_index)
+      *array_index = idx;
+
+   return true;
 }
 
 /* Find a program resource with specific name in given interface.
  */
 struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
-                                 GLenum programInterface, const char *name)
+                                 GLenum programInterface, const char *name,
+                                 unsigned *array_index)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   const char *full_name = name;
-
-   /* When context has 'VertexID_is_zero_based' set, gl_VertexID has been
-    * lowered to gl_VertexIDMESA.
-    */
-   if (name && ctx->Const.VertexID_is_zero_based) {
-      if (strcmp(name, "gl_VertexID") == 0)
-         full_name = "gl_VertexIDMESA";
-   }
-
    struct gl_program_resource *res = shProg->ProgramResourceList;
    for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) {
       if (res->Type != programInterface)
@@ -594,38 +592,46 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
       const char *rname = _mesa_program_resource_name(res);
       unsigned baselen = strlen(rname);
 
-      switch (programInterface) {
-      case GL_TRANSFORM_FEEDBACK_VARYING:
-      case GL_UNIFORM_BLOCK:
-      case GL_UNIFORM:
-      case GL_VERTEX_SUBROUTINE_UNIFORM:
-      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
-      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
-      case GL_COMPUTE_SUBROUTINE_UNIFORM:
-      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
-      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-      case GL_VERTEX_SUBROUTINE:
-      case GL_GEOMETRY_SUBROUTINE:
-      case GL_FRAGMENT_SUBROUTINE:
-      case GL_COMPUTE_SUBROUTINE:
-      case GL_TESS_CONTROL_SUBROUTINE:
-      case GL_TESS_EVALUATION_SUBROUTINE:
-         if (strncmp(rname, name, baselen) == 0) {
+      if (strncmp(rname, name, baselen) == 0) {
+         switch (programInterface) {
+         case GL_UNIFORM_BLOCK:
             /* Basename match, check if array or struct. */
             if (name[baselen] == '\0' ||
                 name[baselen] == '[' ||
                 name[baselen] == '.') {
                return res;
             }
+            break;
+         case GL_TRANSFORM_FEEDBACK_VARYING:
+         case GL_UNIFORM:
+         case GL_VERTEX_SUBROUTINE_UNIFORM:
+         case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+         case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+         case GL_COMPUTE_SUBROUTINE_UNIFORM:
+         case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+         case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+         case GL_VERTEX_SUBROUTINE:
+         case GL_GEOMETRY_SUBROUTINE:
+         case GL_FRAGMENT_SUBROUTINE:
+         case GL_COMPUTE_SUBROUTINE:
+         case GL_TESS_CONTROL_SUBROUTINE:
+         case GL_TESS_EVALUATION_SUBROUTINE:
+            if (name[baselen] == '.') {
+               return res;
+            }
+            /* fall-through */
+         case GL_PROGRAM_INPUT:
+         case GL_PROGRAM_OUTPUT:
+            if (name[baselen] == '\0') {
+               return res;
+            } else if (name[baselen] == '[' &&
+                valid_array_index(name, array_index)) {
+               return res;
+            }
+            break;
+         default:
+            assert(!"not implemented for given interface");
          }
-         break;
-      case GL_PROGRAM_INPUT:
-      case GL_PROGRAM_OUTPUT:
-         if (array_index_of_resource(res, full_name) >= 0)
-            return res;
-         break;
-      default:
-         assert(!"not implemented for given interface");
       }
    }
    return NULL;
@@ -787,19 +793,9 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg,
 
 static GLint
 program_resource_location(struct gl_shader_program *shProg,
-                          struct gl_program_resource *res, const char *name)
+                          struct gl_program_resource *res, const char *name,
+                          unsigned array_index)
 {
-   unsigned index, offset;
-   int array_index = -1;
-   long offset_ret;
-   const GLchar *base_name_end;
-
-   if (res->Type == GL_PROGRAM_INPUT || res->Type == GL_PROGRAM_OUTPUT) {
-      array_index = array_index_of_resource(res, name);
-      if (array_index < 0)
-         return -1;
-   }
-
    /* Built-in locations should report GL_INVALID_INDEX. */
    if (is_gl_identifier(name))
       return GL_INVALID_INDEX;
@@ -810,13 +806,22 @@ program_resource_location(struct gl_shader_program *shProg,
     */
    switch (res->Type) {
    case GL_PROGRAM_INPUT:
+      /* If the input is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_VAR(res)->type->length) {
+         return -1;
+      }
       return RESOURCE_VAR(res)->data.location + array_index - VERT_ATTRIB_GENERIC0;
    case GL_PROGRAM_OUTPUT:
+      /* If the output is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_VAR(res)->type->length) {
+         return -1;
+      }
       return RESOURCE_VAR(res)->data.location + array_index - FRAG_RESULT_DATA0;
    case GL_UNIFORM:
-      index = _mesa_get_uniform_location(shProg, name, &offset);
-
-      if (index == GL_INVALID_INDEX)
+      /* If the uniform is built-in, fail. */
+      if (RESOURCE_UNI(res)->builtin)
          return -1;
 
       /* From the GL_ARB_uniform_buffer_object spec:
@@ -830,17 +835,21 @@ program_resource_location(struct gl_shader_program *shProg,
           RESOURCE_UNI(res)->atomic_buffer_index != -1)
          return -1;
 
-      /* location in remap table + array element offset */
-      return RESOURCE_UNI(res)->remap_location + offset;
-
+      /* fallthrough */
    case GL_VERTEX_SUBROUTINE_UNIFORM:
    case GL_GEOMETRY_SUBROUTINE_UNIFORM:
    case GL_FRAGMENT_SUBROUTINE_UNIFORM:
    case GL_COMPUTE_SUBROUTINE_UNIFORM:
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-      offset_ret = parse_program_resource_name(name, &base_name_end);
-      return RESOURCE_UNI(res)->remap_location + ((offset_ret != -1) ? offset_ret : 0);
+      /* If the uniform is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_UNI(res)->array_elements) {
+         return -1;
+      }
+
+      /* location in remap table + array element offset */
+      return RESOURCE_UNI(res)->remap_location + array_index;
    default:
       return -1;
    }
@@ -854,14 +863,16 @@ GLint
 _mesa_program_resource_location(struct gl_shader_program *shProg,
                                 GLenum programInterface, const char *name)
 {
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, programInterface, name);
+      _mesa_program_resource_find_name(shProg, programInterface, name,
+                                       &array_index);
 
    /* Resource not found. */
    if (!res)
       return -1;
 
-   return program_resource_location(shProg, res, name);
+   return program_resource_location(shProg, res, name, array_index);
 }
 
 /**
@@ -873,7 +884,7 @@ _mesa_program_resource_location_index(struct gl_shader_program *shProg,
                                       GLenum programInterface, const char *name)
 {
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, programInterface, name);
+      _mesa_program_resource_find_name(shProg, programInterface, name, NULL);
 
    /* Non-existent variable or resource is not referenced by fragment stage. */
    if (!res || !(res->StageReferences & (1 << MESA_SHADER_FRAGMENT)))
@@ -949,7 +960,8 @@ get_buffer_property(struct gl_shader_program *shProg,
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
-               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname);
+               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname,
+                                                NULL);
             if (!uni)
                continue;
             (*val)++;
@@ -959,7 +971,8 @@ get_buffer_property(struct gl_shader_program *shProg,
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
-               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname);
+               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname,
+                                                NULL);
             if (!uni)
                continue;
             *val++ =
@@ -1099,7 +1112,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
          *val = program_resource_location(shProg, res,
-                                          _mesa_program_resource_name(res));
+                                          _mesa_program_resource_name(res),
+                                          0);
          return 1;
       default:
          goto invalid_operation;
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 0887c68763e..b702dcde47a 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2229,7 +2229,7 @@ _mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
    }
 
    resource_type = _mesa_shader_stage_to_subroutine(stage);
-   res = _mesa_program_resource_find_name(shProg, resource_type, name);
+   res = _mesa_program_resource_find_name(shProg, resource_type, name, NULL);
    if (!res) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
      return -1;
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index c081f749f93..0a10191684f 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -232,7 +232,8 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
 
 extern struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
-                                 GLenum programInterface, const char *name);
+                                 GLenum programInterface, const char *name,
+                                 unsigned *array_index);
 
 extern struct gl_program_resource *
 _mesa_program_resource_find_index(struct gl_shader_program *shProg,
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 6ba746e2f58..ff1df72e1d6 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -952,7 +952,7 @@ _mesa_GetUniformBlockIndex(GLuint program,
 
    struct gl_program_resource *res =
       _mesa_program_resource_find_name(shProg, GL_UNIFORM_BLOCK,
-                                       uniformBlockName);
+                                       uniformBlockName, NULL);
    if (!res)
       return GL_INVALID_INDEX;
 
@@ -987,7 +987,8 @@ _mesa_GetUniformIndices(GLuint program,
 
    for (i = 0; i < uniformCount; i++) {
       struct gl_program_resource *res =
-         _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i]);
+         _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i],
+                                          NULL);
       uniformIndices[i] = _mesa_program_resource_index(shProg, res);
    }
 }

From 7f5f7d15fbbd3d306e43e1e9ff215750b8aaa7bf Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 25 Jul 2015 12:39:43 +1000
Subject: [PATCH 0808/1208] mesa: remove now unused subscript validations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/program_resource.c | 51 ----------------------------
 src/mesa/main/shader_query.cpp   | 57 --------------------------------
 2 files changed, 108 deletions(-)

diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index fdbd5b3a361..3864775818c 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -203,27 +203,6 @@ is_xfb_marker(const char *str)
    return false;
 }
 
-/**
- * Checks if given name index is legal for GetProgramResourceIndex,
- * check is written to be compatible with GL_ARB_array_of_arrays.
- */
-static bool
-valid_program_resource_index_name(const GLchar *name)
-{
-   const char *array = strstr(name, "[");
-   const char *close = strrchr(name, ']');
-
-   /* Not array, no need for the check. */
-   if (!array)
-      return true;
-
-   /* Last array index has to be zero. */
-   if (!close || *--close != '0')
-      return false;
-
-   return true;
-}
-
 GLuint GLAPIENTRY
 _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
@@ -345,36 +324,6 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface,
                                 propCount, props, bufSize, length, params);
 }
 
-/**
- * Function verifies syntax of given name for GetProgramResourceLocation
- * and GetProgramResourceLocationIndex for the following cases:
- *
- * "array element portion of a string passed to GetProgramResourceLocation
- * or GetProgramResourceLocationIndex must not have, a "+" sign, extra
- * leading zeroes, or whitespace".
- *
- * Check is written to be compatible with GL_ARB_array_of_arrays.
- */
-static bool
-invalid_array_element_syntax(const GLchar *name)
-{
-   char *first = strchr(name, '[');
-   char *last = strrchr(name, '[');
-
-   if (!first)
-      return false;
-
-   /* No '+' or ' ' allowed anywhere. */
-   if (strchr(first, '+') || strchr(first, ' '))
-      return true;
-
-   /* Check that last array index is 0. */
-   if (last[1] == '0' && last[2] != ']')
-      return true;
-
-   return false;
-}
-
 static struct gl_shader_program *
 lookup_linked_program(GLuint program, const char *caller)
 {
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index dd11b819c1c..b49fd38d64d 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -191,63 +191,6 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
                                   (GLint *) type, "glGetActiveAttrib");
 }
 
-/* Locations associated with shader variables (array or non-array) can be
- * queried using its base name or using the base name appended with the
- * valid array index. For example, in case of below vertex shader, valid
- * queries can be made to know the location of "xyz", "array", "array[0]",
- * "array[1]", "array[2]" and "array[3]". In this example index reurned
- * will be 0, 0, 0, 1, 2, 3 respectively.
- *
- * [Vertex Shader]
- * layout(location=0) in vec4 xyz;
- * layout(location=1) in vec4[4] array;
- * void main()
- * { }
- *
- * This requirement came up with the addition of ARB_program_interface_query
- * to OpenGL 4.3 specification. See page 101 (page 122 of the PDF) for details.
- *
- * This utility function is used by:
- * _mesa_GetAttribLocation
- * _mesa_GetFragDataLocation
- * _mesa_GetFragDataIndex
- *
- * Returns 0:
- *    if the 'name' string matches var->name.
- * Returns 'matched index':
- *    if the 'name' string matches var->name appended with valid array index.
- */
-int static inline
-get_matching_index(const ir_variable *const var, const char *name) {
-   unsigned idx = 0;
-   const char *const paren = strchr(name, '[');
-   const unsigned len = (paren != NULL) ? paren - name : strlen(name);
-
-   if (paren != NULL) {
-      if (!var->type->is_array())
-         return -1;
-
-      char *endptr;
-      idx = (unsigned) strtol(paren + 1, &endptr, 10);
-      const unsigned idx_len = endptr != (paren + 1) ? endptr - paren - 1 : 0;
-
-      /* Validate the sub string representing index in 'name' string */
-      if ((idx > 0 && paren[1] == '0') /* leading zeroes */
-          || (idx == 0 && idx_len > 1) /* all zeroes */
-          || paren[1] == ' ' /* whitespace */
-          || endptr[0] != ']' /* closing brace */
-          || endptr[1] != '\0' /* null char */
-          || idx_len == 0 /* missing index */
-          || idx >= var->type->length) /* exceeding array bound */
-         return -1;
-   }
-
-   if (strncmp(var->name, name, len) == 0 && var->name[len] == '\0')
-      return idx;
-
-   return -1;
-}
-
 GLint GLAPIENTRY
 _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
 {

From fdb84876134ed074563b842eae20fd10dbe9e8d6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 4 Jul 2015 08:35:35 +1000
Subject: [PATCH 0809/1208] mesa: remove now unused _mesa_get_uniform_location
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/uniform_query.cpp | 75 ---------------------------------
 src/mesa/main/uniforms.h        |  4 --
 2 files changed, 79 deletions(-)

diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index b5a94e9473b..036530e91b6 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -978,81 +978,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg,
 }
 
 
-/**
- * Called via glGetUniformLocation().
- *
- * Returns the uniform index into UniformStorage (also the
- * glGetActiveUniformsiv uniform index), and stores the referenced
- * array offset in *offset, or GL_INVALID_INDEX (-1).
- */
-extern "C" unsigned
-_mesa_get_uniform_location(struct gl_shader_program *shProg,
-                           const GLchar *name,
-                           unsigned *out_offset)
-{
-   /* Page 80 (page 94 of the PDF) of the OpenGL 2.1 spec says:
-    *
-    *     "The first element of a uniform array is identified using the
-    *     name of the uniform array appended with "[0]". Except if the last
-    *     part of the string name indicates a uniform array, then the
-    *     location of the first element of that array can be retrieved by
-    *     either using the name of the uniform array, or the name of the
-    *     uniform array appended with "[0]"."
-    *
-    * Note: since uniform names are not allowed to use whitespace, and array
-    * indices within uniform names are not allowed to use "+", "-", or leading
-    * zeros, it follows that each uniform has a unique name up to the possible
-    * ambiguity with "[0]" noted above.  Therefore we don't need to worry
-    * about mal-formed inputs--they will properly fail when we try to look up
-    * the uniform name in shProg->UniformHash.
-    */
-
-   const GLchar *base_name_end;
-   long offset = parse_program_resource_name(name, &base_name_end);
-   bool array_lookup = offset >= 0;
-   char *name_copy;
-
-   if (array_lookup) {
-      name_copy = (char *) malloc(base_name_end - name + 1);
-      memcpy(name_copy, name, base_name_end - name);
-      name_copy[base_name_end - name] = '\0';
-   } else {
-      name_copy = (char *) name;
-      offset = 0;
-   }
-
-   unsigned location = 0;
-   const bool found = shProg->UniformHash->get(location, name_copy);
-
-   assert(!found
-	  || strcmp(name_copy, shProg->UniformStorage[location].name) == 0);
-
-   /* Free the temporary buffer *before* possibly returning an error.
-    */
-   if (name_copy != name)
-      free(name_copy);
-
-   if (!found)
-      return GL_INVALID_INDEX;
-
-   /* If the uniform is built-in, fail. */
-   if (shProg->UniformStorage[location].builtin)
-      return GL_INVALID_INDEX;
-
-   /* If the uniform is an array, fail if the index is out of bounds.
-    * (A negative index is caught above.)  This also fails if the uniform
-    * is not an array, but the user is trying to index it, because
-    * array_elements is zero and offset >= 0.
-    */
-   if (array_lookup
-       && offset >= (long) shProg->UniformStorage[location].array_elements) {
-      return GL_INVALID_INDEX;
-   }
-
-   *out_offset = offset;
-   return location;
-}
-
 extern "C" bool
 _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg,
 				 char *errMsg, size_t errMsgLength)
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index bd7b05e207a..e62eaa53ccc 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -343,10 +343,6 @@ void GLAPIENTRY
 _mesa_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count,
                                 GLboolean transpose, const GLdouble *value);
 
-unsigned
-_mesa_get_uniform_location(struct gl_shader_program *shProg,
-			   const GLchar *name, unsigned *offset);
-
 void
 _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shader_program,
 	      GLint location, GLsizei count,

From 518abd0bbe1886550b43c62679a2ebd41e8199e9 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@gmail.com>
Date: Thu, 30 Jul 2015 20:44:50 +1000
Subject: [PATCH 0810/1208] st/mesa: don't draw instead of asserting in
 transform feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

if we get a request to take the count from feedback, but there
is no buffer to take it from, just draw as if we got 0 vertices
so nothing.

This fixes this assert killing the ogl conform, and a piglit
test I've sent.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_cb_xformfb.c | 6 ++++--
 src/mesa/state_tracker/st_cb_xformfb.h | 2 +-
 src/mesa/state_tracker/st_draw.c       | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 07c118e227b..0708e68ec7b 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -191,7 +191,6 @@ st_transform_feedback_get_draw_target(struct gl_transform_feedback_object *obj)
       }
    }
 
-   assert(0);
    return NULL;
 }
 
@@ -211,14 +210,17 @@ st_end_transform_feedback(struct gl_context *ctx,
 }
 
 
-void
+bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
                                 struct pipe_draw_info *out)
 {
    struct st_transform_feedback_object *sobj =
          st_transform_feedback_object(obj);
 
+   if (sobj->draw_count == NULL)
+      return false;
    out->count_from_stream_output = sobj->draw_count;
+   return true;
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_xformfb.h b/src/mesa/state_tracker/st_cb_xformfb.h
index 998c418257b..fb50deddfae 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.h
+++ b/src/mesa/state_tracker/st_cb_xformfb.h
@@ -38,7 +38,7 @@ struct pipe_draw_info;
 extern void
 st_init_xformfb_functions(struct dd_function_table *functions);
 
-extern void
+extern bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
                                 struct pipe_draw_info *out);
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 66b2f832933..bae8096e080 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -242,7 +242,8 @@ st_draw_vbo(struct gl_context *ctx,
       /* Transform feedback drawing is always non-indexed. */
       /* Set info.count_from_stream_output. */
       if (tfb_vertcount) {
-         st_transform_feedback_draw_init(tfb_vertcount, &info);
+         if (st_transform_feedback_draw_init(tfb_vertcount, &info) == false)
+            return;
       }
    }
 

From a5b3b24958b5e4344e7d8d1e029dbf7e5afb183c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20Wasserb=C3=A4ch?= <kai@dev.carbon-project.org>
Date: Thu, 30 Jul 2015 20:32:36 +0200
Subject: [PATCH 0811/1208] docs: trivial cleanup of GL3.txt, remove redundant
 radeonsi entries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to 1b2b0e42ce47bfd1fcb5513ed2c23b9bb7a5a5b8

Signed-off-by: Kai Wasserbäch <kai@dev.carbon-project.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 0220f24fa7e..8124383b9ea 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -96,7 +96,7 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi
 
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, llvmpipe, softpipe)
   GL_ARB_draw_indirect                                 DONE (i965, r600, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965, radeonsi)
+  GL_ARB_gpu_shader5                                   DONE (i965)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600, softpipe)
   - Dynamically uniform UBO array indices              DONE (r600)
@@ -110,7 +110,7 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi
   - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
   GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50, r600, radeonsi)
+  GL_ARB_sample_shading                                DONE (i965, nv50, r600)
   GL_ARB_shader_subroutine                             DONE (i965, nv50, r600, llvmpipe, softpipe)
   GL_ARB_tessellation_shader                           DONE ()
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, r600, llvmpipe, softpipe)

From d0173bce371e3aafa732600c1456a9282ff5d900 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 12:20:33 -0700
Subject: [PATCH 0812/1208] vc4: Fix return values from recent validation
 changes.

---
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index e81dd9935ca..cba948a1c67 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -207,7 +207,7 @@ validate_flush(VALIDATE_ARGS)
 {
 	if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) {
 		DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n");
-		return false;
+		return -EINVAL;
 	}
 	exec->found_flush = true;
 
@@ -783,17 +783,17 @@ validate_gl_shader_rec(struct drm_device *dev,
 	for (i = 0; i < shader_reloc_count; i++) {
 		if (src_handles[i] > exec->bo_count) {
 			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
-			return false;
+			return -EINVAL;
 		}
 
 		bo[i] = exec->bo[src_handles[i]];
 		if (!bo[i])
-			return false;
+			return -EINVAL;
 	}
 	for (i = shader_reloc_count; i < nr_relocs; i++) {
 		bo[i] = vc4_use_bo(exec, src_handles[i]);
 		if (!bo[i])
-			return false;
+			return -EINVAL;
 	}
 
 	for (i = 0; i < shader_reloc_count; i++) {

From 86541cf8cea77f4b887dd061e7d6e3e4767f86fd Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 14:40:10 -0700
Subject: [PATCH 0813/1208] vc4: Avoid overflowing various static tables.

---
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 2 +-
 src/gallium/drivers/vc4/vc4_cl_dump.c         | 2 +-
 src/gallium/drivers/vc4/vc4_formats.c         | 2 +-
 src/gallium/drivers/vc4/vc4_qpu_disasm.c      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index cba948a1c67..674ca637864 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -487,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		u8 cmd = *(uint8_t *)src_pkt;
 		const struct cmd_info *info;
 
-		if (cmd > ARRAY_SIZE(cmd_info)) {
+		if (cmd >= ARRAY_SIZE(cmd_info)) {
 			DRM_ERROR("0x%08x: packet %d out of bounds\n",
 				  src_offset, cmd);
 			return -EINVAL;
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index e153a243090..6d748010baf 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -397,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render)
         while (offset < size) {
                 uint8_t header = cmds[offset];
 
-                if (header > ARRAY_SIZE(packet_info) ||
+                if (header >= ARRAY_SIZE(packet_info) ||
                     !packet_info[header].name) {
                         fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
                                 offset, hw_offset, header, header);
diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c
index 004bac70c67..ffce61237de 100644
--- a/src/gallium/drivers/vc4/vc4_formats.c
+++ b/src/gallium/drivers/vc4/vc4_formats.c
@@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = {
 static const struct vc4_format *
 get_format(enum pipe_format f)
 {
-        if (f > ARRAY_SIZE(vc4_format_table) ||
+        if (f >= ARRAY_SIZE(vc4_format_table) ||
             !vc4_format_table[f].present)
                 return NULL;
         else
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 55e0e6139b5..00aeb300a9b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -225,7 +225,7 @@ static const char *qpu_condflags[] = {
 };
 
 #define DESC(array, index)                                        \
-        ((index > ARRAY_SIZE(array) || !(array)[index]) ?         \
+        ((index >= ARRAY_SIZE(array) || !(array)[index]) ?         \
          "???" : (array)[index])
 
 static const char *

From df3005de189f5120bc06f6cba35ecaf5c4503229 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 14:41:22 -0700
Subject: [PATCH 0814/1208] vc4: Avoid leaking indirect array access UBOs.

---
 src/gallium/drivers/vc4/vc4_uniforms.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index d4c71376a55..3bf6672a88a 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -283,6 +283,8 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         }
 
         cl_end(&vc4->uniforms, uniforms);
+
+        vc4_bo_unreference(&ubo);
 }
 
 void

From c93ffd661a46f0f6d20c9ec2e97d4d9393e28111 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 12:16:50 -0700
Subject: [PATCH 0815/1208] vc4: Mark our shaders as single-threaded.

I had my understanding of this bit flipped.  We're using the full register
space, so we need to say so.
---
 src/gallium/drivers/vc4/kernel/vc4_validate.c | 5 +++++
 src/gallium/drivers/vc4/vc4_draw.c            | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 674ca637864..b248831113c 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -780,6 +780,11 @@ validate_gl_shader_rec(struct drm_device *dev,
 	exec->shader_rec_v += roundup(packet_size, 16);
 	exec->shader_rec_size -= packet_size;
 
+	if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
+		DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
+		return -EINVAL;
+	}
+
 	for (i = 0; i < shader_reloc_count; i++) {
 		if (src_handles[i] > exec->bo_count) {
 			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 22ae8f27e4a..a4e5e092b1a 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -139,6 +139,7 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i
         /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
         cl_u16(&shader_rec,
                VC4_SHADER_FLAG_ENABLE_CLIPPING |
+               VC4_SHADER_FLAG_FS_SINGLE_THREAD |
                ((info->mode == PIPE_PRIM_POINTS &&
                  vc4->rasterizer->base.point_size_per_vertex) ?
                 VC4_SHADER_FLAG_VS_POINT_SIZE : 0));

From b85f6ae4b24ee50948f14a9effa982eb0b9b3681 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 15:52:18 -0700
Subject: [PATCH 0816/1208] vc4: Start adding a NIR-based output lowering pass.

For now, this just splits up store_output intrinsics to be scalars, and
drops unused outputs in the coordinate shader.  My goal is to be able to
drop a bunch of my VC4-specific optimization by letting NIR handle it.
---
 src/gallium/drivers/vc4/Makefile.sources   |   1 +
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 130 +++++++++++++++++++++
 src/gallium/drivers/vc4/vc4_program.c      |  12 +-
 src/gallium/drivers/vc4/vc4_qir.h          |   1 +
 4 files changed, 137 insertions(+), 7 deletions(-)
 create mode 100644 src/gallium/drivers/vc4/vc4_nir_lower_io.c

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index c24b01d4fbb..b09ffa60c39 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -19,6 +19,7 @@ C_SOURCES := \
 	vc4_fence.c \
 	vc4_formats.c \
 	vc4_job.c \
+	vc4_nir_lower_io.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
new file mode 100644
index 00000000000..43376888248
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vc4_qir.h"
+#include "tgsi/tgsi_info.h"
+#include "glsl/nir/nir_builder.h"
+
+/**
+ * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
+ * something amenable to the VC4 architecture.
+ *
+ * Currently, it split outputs into scalars, and drops any non-position values
+ * in coordinate shaders.
+ */
+
+static void
+vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+        nir_variable *output_var = NULL;
+        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                if (var->data.driver_location == intr->const_index[0]) {
+                        output_var = var;
+                        break;
+                }
+        }
+        assert(output_var);
+        unsigned semantic_name = output_var->data.location;
+
+        if (c->stage == QSTAGE_COORD &&
+            (semantic_name != TGSI_SEMANTIC_POSITION &&
+             semantic_name != TGSI_SEMANTIC_PSIZE)) {
+                nir_instr_remove(&intr->instr);
+                return;
+        }
+
+        /* All TGSI-to-NIR outputs are VEC4. */
+        assert(intr->num_components == 4);
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+
+        for (unsigned i = 0; i < intr->num_components; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+
+                assert(intr->src[0].is_ssa);
+                intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b,
+                                                                intr->src[0].ssa,
+                                                                &i, 1, false));
+                nir_builder_instr_insert(b, &intr_comp->instr);
+        }
+
+        nir_instr_remove(&intr->instr);
+}
+
+static void
+vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
+                       struct nir_instr *instr)
+{
+        if (instr->type != nir_instr_type_intrinsic)
+                return;
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+        switch (intr->intrinsic) {
+        case nir_intrinsic_store_output:
+                vc4_nir_lower_output(c, b, intr);
+                break;
+
+        default:
+                break;
+        }
+}
+
+static bool
+vc4_nir_lower_io_block(nir_block *block, void *arg)
+{
+        struct vc4_compile *c = arg;
+        nir_function_impl *impl =
+                nir_cf_node_get_function(&block->cf_node);
+
+        nir_builder b;
+        nir_builder_init(&b, impl);
+
+        nir_foreach_instr_safe(block, instr)
+                vc4_nir_lower_io_instr(c, &b, instr);
+
+        return true;
+}
+
+static bool
+vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl)
+{
+        nir_foreach_block(impl, vc4_nir_lower_io_block, c);
+
+        nir_metadata_preserve(impl, nir_metadata_block_index |
+                              nir_metadata_dominance);
+
+        return true;
+}
+
+void
+vc4_nir_lower_io(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl)
+                        vc4_nir_lower_io_impl(c, overload->impl);
+        }
+}
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index a35b50cd39b..85bb1c48780 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1895,13 +1895,10 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_store_output:
-                for (int i = 0; i < instr->num_components; i++) {
-                        c->outputs[instr->const_index[0] * 4 + i] =
-                                qir_MOV(c, ntq_get_src(c, instr->src[0], i));
-                }
-                c->num_outputs = MAX2(c->num_outputs,
-                                      instr->const_index[0] * 4 +
-                                      instr->num_components + 1);
+                assert(instr->num_components == 1);
+                c->outputs[instr->const_index[0]] =
+                        qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
+                c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
                 break;
 
         case nir_intrinsic_discard:
@@ -2102,6 +2099,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         c->s = tgsi_to_nir(tokens, &nir_options);
         nir_opt_global_to_local(c->s);
         nir_convert_to_ssa(c->s);
+        vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
 
         vc4_optimize_nir(c->s);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 2f1e261f880..7e1c95d5cab 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -409,6 +409,7 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
+void vc4_nir_lower_io(struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);

From 27f728cdc5d90f63839fbeb1942e6f27339b102a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 17:16:26 -0700
Subject: [PATCH 0817/1208] vc4: Lower NIR inputs to scalar as well.

For now this is just scalarizing, but it also means we'll get to dump a
bunch of QIR-based lowering in a moment.
---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 44 +++++++++++++++++++++-
 src/gallium/drivers/vc4/vc4_program.c      |  4 +-
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 43376888248..9882b6b8a35 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -29,10 +29,46 @@
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
  * something amenable to the VC4 architecture.
  *
- * Currently, it split outputs into scalars, and drops any non-position values
- * in coordinate shaders.
+ * Currently, it split inputs and outputs into scalars, and drops any
+ * non-position outputs in coordinate shaders.
  */
 
+static void
+vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
+                    nir_intrinsic_instr *intr)
+{
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+
+        /* Generate scalar loads equivalent to the original VEC4. */
+        nir_ssa_def *dests[4];
+        for (unsigned i = 0; i < intr->num_components; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_builder_instr_insert(b, &intr_comp->instr);
+
+                dests[i] = &intr_comp->dest.ssa;
+        }
+
+        /* Batch things back together into a vec4.  This will get split by the
+         * later ALU scalarization pass.
+         */
+        nir_ssa_def *vec_instr = nir_vec4(b, dests[0], dests[1],
+                                          dests[2], dests[3]);
+
+        /* Replace the old intrinsic with a reference to our reconstructed
+         * vec4.
+         */
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec_instr),
+                                 ralloc_parent(b->impl));
+        nir_instr_remove(&intr->instr);
+}
+
 static void
 vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
@@ -84,6 +120,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
         switch (intr->intrinsic) {
+        case nir_intrinsic_load_input:
+                vc4_nir_lower_input(c, b, intr);
+                break;
+
         case nir_intrinsic_store_output:
                 vc4_nir_lower_output(c, b, intr);
                 break;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 85bb1c48780..dfc3815c5c1 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1889,8 +1889,8 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_input:
-                for (int i = 0; i < instr->num_components; i++)
-                        dest[i] = c->inputs[instr->const_index[0] * 4 + i];
+                assert(instr->num_components == 1);
+                *dest = c->inputs[instr->const_index[0]];
 
                 break;
 

From 13ddd48b97474c261ef2d7412629748d6d91f2ad Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 17:29:39 -0700
Subject: [PATCH 0818/1208] vc4: Move program keys to the header file.

I want to be able to inspect them from other files for lowering passes in
NIR.
---
 src/gallium/drivers/vc4/vc4_program.c | 47 -------------------------
 src/gallium/drivers/vc4/vc4_qir.h     | 49 +++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index dfc3815c5c1..b2efd68f39a 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -23,7 +23,6 @@
  */
 
 #include <inttypes.h>
-#include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_hash.h"
 #include "util/u_math.h"
@@ -43,52 +42,6 @@
 #include "simpenrose/simpenrose.h"
 #endif
 
-struct vc4_key {
-        struct vc4_uncompiled_shader *shader_state;
-        struct {
-                enum pipe_format format;
-                unsigned compare_mode:1;
-                unsigned compare_func:3;
-                unsigned wrap_s:3;
-                unsigned wrap_t:3;
-                uint8_t swizzle[4];
-        } tex[VC4_MAX_TEXTURE_SAMPLERS];
-        uint8_t ucp_enables;
-};
-
-struct vc4_fs_key {
-        struct vc4_key base;
-        enum pipe_format color_format;
-        bool depth_enabled;
-        bool stencil_enabled;
-        bool stencil_twoside;
-        bool stencil_full_writemasks;
-        bool is_points;
-        bool is_lines;
-        bool alpha_test;
-        bool point_coord_upper_left;
-        bool light_twoside;
-        uint8_t alpha_test_func;
-        uint8_t logicop_func;
-        uint32_t point_sprite_mask;
-
-        struct pipe_rt_blend_state blend;
-};
-
-struct vc4_vs_key {
-        struct vc4_key base;
-
-        /**
-         * This is a proxy for the array of FS input semantics, which is
-         * larger than we would want to put in the key.
-         */
-        uint64_t compiled_fs_id;
-
-        enum pipe_format attr_formats[8];
-        bool is_coord;
-        bool per_vertex_point_size;
-};
-
 static void
 resize_qreg_array(struct vc4_compile *c,
                   struct qreg **regs,
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 7e1c95d5cab..80a19713a10 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -36,6 +36,9 @@
 #include "util/list.h"
 #include "util/u_math.h"
 
+#include "vc4_screen.h"
+#include "pipe/p_state.h"
+
 enum qfile {
         QFILE_NULL,
         QFILE_TEMP,
@@ -280,6 +283,52 @@ struct vc4_compiler_ubo_range {
         bool used;
 };
 
+struct vc4_key {
+        struct vc4_uncompiled_shader *shader_state;
+        struct {
+                enum pipe_format format;
+                unsigned compare_mode:1;
+                unsigned compare_func:3;
+                unsigned wrap_s:3;
+                unsigned wrap_t:3;
+                uint8_t swizzle[4];
+        } tex[VC4_MAX_TEXTURE_SAMPLERS];
+        uint8_t ucp_enables;
+};
+
+struct vc4_fs_key {
+        struct vc4_key base;
+        enum pipe_format color_format;
+        bool depth_enabled;
+        bool stencil_enabled;
+        bool stencil_twoside;
+        bool stencil_full_writemasks;
+        bool is_points;
+        bool is_lines;
+        bool alpha_test;
+        bool point_coord_upper_left;
+        bool light_twoside;
+        uint8_t alpha_test_func;
+        uint8_t logicop_func;
+        uint32_t point_sprite_mask;
+
+        struct pipe_rt_blend_state blend;
+};
+
+struct vc4_vs_key {
+        struct vc4_key base;
+
+        /**
+         * This is a proxy for the array of FS input semantics, which is
+         * larger than we would want to put in the key.
+         */
+        uint64_t compiled_fs_id;
+
+        enum pipe_format attr_formats[8];
+        bool is_coord;
+        bool per_vertex_point_size;
+};
+
 struct vc4_compile {
         struct vc4_context *vc4;
         nir_shader *s;

From 5a8c57b52287ba2bb8faa4447e7d1cc46ef1a3d4 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 29 Jul 2015 17:27:54 -0700
Subject: [PATCH 0819/1208] vc4: Move some FS input lowering into NIR.

---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 47 ++++++++++++++++++++++
 src/gallium/drivers/vc4/vc4_program.c      | 38 ++---------------
 2 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 9882b6b8a35..fa06c893cfb 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -42,6 +42,17 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
 
         nir_builder_insert_before_instr(b, &intr->instr);
 
+        nir_variable *input_var = NULL;
+        foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
+                if (var->data.driver_location == intr->const_index[0]) {
+                        input_var = var;
+                        break;
+                }
+        }
+        assert(input_var);
+        int semantic_name = input_var->data.location;
+        int semantic_index = input_var->data.index;
+
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
         for (unsigned i = 0; i < intr->num_components; i++) {
@@ -55,6 +66,42 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 dests[i] = &intr_comp->dest.ssa;
         }
 
+        switch (c->stage) {
+        case QSTAGE_FRAG:
+                switch (semantic_name) {
+                case TGSI_SEMANTIC_FACE:
+                        dests[0] = nir_fsub(b,
+                                            nir_imm_float(b, 1.0),
+                                            nir_fmul(b,
+                                                     nir_i2f(b, dests[0]),
+                                                     nir_imm_float(b, 2.0)));
+                        dests[1] = nir_imm_float(b, 0.0);
+                        dests[2] = nir_imm_float(b, 0.0);
+                        dests[3] = nir_imm_float(b, 1.0);
+                        break;
+                case TGSI_SEMANTIC_GENERIC:
+                        if (c->fs_key->point_sprite_mask &
+                            (1 << semantic_index)) {
+                                if (!c->fs_key->is_points) {
+                                        dests[0] = nir_imm_float(b, 0.0);
+                                        dests[1] = nir_imm_float(b, 0.0);
+                                }
+                                if (c->fs_key->point_coord_upper_left) {
+                                        dests[1] = nir_fsub(b,
+                                                            nir_imm_float(b, 1.0),
+                                                            dests[1]);
+                                }
+                                dests[2] = nir_imm_float(b, 0.0);
+                                dests[3] = nir_imm_float(b, 1.0);
+                        }
+                        break;
+                }
+                break;
+        case QSTAGE_COORD:
+        case QSTAGE_VERT:
+                break;
+        }
+
         /* Batch things back together into a vec4.  This will get split by the
          * later ALU scalarization pass.
          */
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index b2efd68f39a..ddc997003b2 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -707,26 +707,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr)
         c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
 }
 
-static void
-emit_point_coord_input(struct vc4_compile *c, int attr)
-{
-        if (c->point_x.file == QFILE_NULL) {
-                c->point_x = qir_uniform_f(c, 0.0);
-                c->point_y = qir_uniform_f(c, 0.0);
-        }
-
-        c->inputs[attr * 4 + 0] = c->point_x;
-        if (c->fs_key->point_coord_upper_left) {
-                c->inputs[attr * 4 + 1] = qir_FSUB(c,
-                                                   qir_uniform_f(c, 1.0),
-                                                   c->point_y);
-        } else {
-                c->inputs[attr * 4 + 1] = c->point_y;
-        }
-        c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
-}
-
 static struct qreg
 emit_fragment_varying(struct vc4_compile *c, uint8_t semantic,
                       uint8_t index, uint8_t swizzle)
@@ -767,19 +747,6 @@ emit_fragment_input(struct vc4_compile *c, int attr,
         }
 }
 
-static void
-emit_face_input(struct vc4_compile *c, int attr)
-{
-        c->inputs[attr * 4 + 0] = qir_FSUB(c,
-                                           qir_uniform_f(c, 1.0),
-                                           qir_FMUL(c,
-                                                    qir_ITOF(c, qir_FRAG_REV_FLAG(c)),
-                                                    qir_uniform_f(c, 2.0)));
-        c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
-}
-
 static void
 add_output(struct vc4_compile *c,
            uint32_t decl_offset,
@@ -1707,11 +1674,12 @@ ntq_setup_inputs(struct vc4_compile *c)
                         if (semantic_name == TGSI_SEMANTIC_POSITION) {
                                 emit_fragcoord_input(c, loc);
                         } else if (semantic_name == TGSI_SEMANTIC_FACE) {
-                                emit_face_input(c, loc);
+                                c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c);
                         } else if (semantic_name == TGSI_SEMANTIC_GENERIC &&
                                    (c->fs_key->point_sprite_mask &
                                     (1 << semantic_index))) {
-                                emit_point_coord_input(c, loc);
+                                c->inputs[loc * 4 + 0] = c->point_x;
+                                c->inputs[loc * 4 + 1] = c->point_y;
                         } else {
                                 emit_fragment_input(c, loc,
                                                     semantic_name,

From 7830e465a5f446616ce49a7f8219256a5503a68b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 30 Jul 2015 11:16:13 -0700
Subject: [PATCH 0820/1208] vc4: Lower uniform loads to scalar in NIR.

This also moves the vec4-to-byte-addressing math into NIR, so that
algebraic has a chance at it.
---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 86 ++++++++++++++++++----
 src/gallium/drivers/vc4/vc4_program.c      | 26 +++----
 2 files changed, 81 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index fa06c893cfb..ffc120e8865 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -29,10 +29,29 @@
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
  * something amenable to the VC4 architecture.
  *
- * Currently, it split inputs and outputs into scalars, and drops any
- * non-position outputs in coordinate shaders.
+ * Currently, it split inputs, outputs, and uniforms into scalars, drops any
+ * non-position outputs in coordinate shaders, and fixes up the addressing on
+ * indirect uniform loads.
  */
 
+static void
+replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
+                            nir_ssa_def **comps)
+{
+
+        /* Batch things back together into a vec4.  This will get split by the
+         * later ALU scalarization pass.
+         */
+        nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]);
+
+        /* Replace the old intrinsic with a reference to our reconstructed
+         * vec4.
+         */
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec),
+                                 ralloc_parent(b->impl));
+        nir_instr_remove(&intr->instr);
+}
+
 static void
 vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                     nir_intrinsic_instr *intr)
@@ -102,18 +121,7 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 break;
         }
 
-        /* Batch things back together into a vec4.  This will get split by the
-         * later ALU scalarization pass.
-         */
-        nir_ssa_def *vec_instr = nir_vec4(b, dests[0], dests[1],
-                                          dests[2], dests[3]);
-
-        /* Replace the old intrinsic with a reference to our reconstructed
-         * vec4.
-         */
-        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec_instr),
-                                 ralloc_parent(b->impl));
-        nir_instr_remove(&intr->instr);
+        replace_intrinsic_with_vec4(b, intr, dests);
 }
 
 static void
@@ -158,6 +166,51 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
         nir_instr_remove(&intr->instr);
 }
 
+static void
+vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
+                      nir_intrinsic_instr *intr)
+{
+        /* All TGSI-to-NIR uniform loads are vec4. */
+        assert(intr->num_components == 4);
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+
+        /* Generate scalar loads equivalent to the original VEC4. */
+        nir_ssa_def *dests[4];
+        for (unsigned i = 0; i < intr->num_components; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s, intr->intrinsic);
+                intr_comp->num_components = 1;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+
+                if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) {
+                        /* Convert the variable TGSI register index to a byte
+                         * offset.
+                         */
+                        intr_comp->src[0] =
+                                nir_src_for_ssa(nir_ishl(b,
+                                                         intr->src[0].ssa,
+                                                         nir_imm_int(b, 4)));
+
+                        /* Convert the offset to be a byte index, too. */
+                        intr_comp->const_index[0] = (intr->const_index[0] * 16 +
+                                                     i * 4);
+                } else {
+                        /* We want a dword index for non-indirect uniform
+                         * loads.
+                         */
+                        intr_comp->const_index[0] = (intr->const_index[0] * 4 +
+                                                     i);
+                }
+
+                dests[i] = &intr_comp->dest.ssa;
+
+                nir_builder_instr_insert(b, &intr_comp->instr);
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
 static void
 vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
                        struct nir_instr *instr)
@@ -175,6 +228,11 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
                 vc4_nir_lower_output(c, b, intr);
                 break;
 
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_uniform_indirect:
+                vc4_nir_lower_uniform(c, b, intr);
+                break;
+
         default:
                 break;
         }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ddc997003b2..f2742986beb 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -42,6 +42,9 @@
 #include "simpenrose/simpenrose.h"
 #endif
 
+static struct qreg
+ntq_get_src(struct vc4_compile *c, nir_src src, int i);
+
 static void
 resize_qreg_array(struct vc4_compile *c,
                   struct qreg **regs,
@@ -64,10 +67,10 @@ resize_qreg_array(struct vc4_compile *c,
 }
 
 static struct qreg
-indirect_uniform_load(struct vc4_compile *c,
-                      struct qreg indirect_offset,
-                      unsigned offset)
+indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
 {
+        struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
+        uint32_t offset = intr->const_index[0];
         struct vc4_compiler_ubo_range *range = NULL;
         unsigned i;
         for (i = 0; i < c->num_uniform_ranges; i++) {
@@ -89,10 +92,6 @@ indirect_uniform_load(struct vc4_compile *c,
         };
 
         offset -= range->src_offset;
-        /* Translate the user's TGSI register index from the TGSI register
-         * base to a byte offset.
-         */
-        indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4));
 
         /* Adjust for where we stored the TGSI register base. */
         indirect_offset = qir_ADD(c, indirect_offset,
@@ -1793,19 +1792,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
-                for (int i = 0; i < instr->num_components; i++) {
-                        dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
-                                              instr->const_index[0] * 4 + i);
-                }
+                assert(instr->num_components == 1);
+                *dest = qir_uniform(c, QUNIFORM_UNIFORM, instr->const_index[0]);
                 break;
 
         case nir_intrinsic_load_uniform_indirect:
-                for (int i = 0; i < instr->num_components; i++) {
-                        dest[i] = indirect_uniform_load(c,
-                                                        ntq_get_src(c, instr->src[0], 0),
-                                                        (instr->const_index[0] *
-                                                         4 + i) * sizeof(float));
-                }
+                *dest = indirect_uniform_load(c, instr);
 
                 break;
 

From 7eaacc1678195738fab3bb98870828611cae066d Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Wed, 29 Jul 2015 12:35:24 -0700
Subject: [PATCH 0821/1208] i965/skl: Add production thread counts and URB size

This patch adjusts the SKL values to the best known values we have.

v2: Remove HS/DS/CS fields. Adding this makes most sense to add to the
GEN9_FEATURES macro, however, doing that would require updating BXT values, and
Jordan requested I not do that. Conveniently, this request makes a lot of sense
wrt to stable backport as HS, and DS do not even exist there.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 2c5d778dbd2..f313b2b3c53 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -308,13 +308,13 @@ static const struct brw_device_info brw_device_info_chv = {
    .has_llc = true,                                 \
    .has_pln = true,                                 \
    .supports_simd16_3src = true,                    \
-   .max_vs_threads = 280,                           \
-   .max_gs_threads = 256,                           \
-   .max_wm_threads = 408,                           \
+   .max_vs_threads = 336,                           \
+   .max_gs_threads = 336,                           \
+   .max_wm_threads = 64 * 6,                        \
    .urb = {                                         \
-      .size = 128,                                  \
+      .size = 192,                                  \
       .min_vs_entries = 64,                         \
-      .max_vs_entries = 1664,                       \
+      .max_vs_entries = 1856,                       \
       .max_gs_entries = 640,                        \
    }
 

From 3cb58010037bad24890785007fd8f47d67249f2f Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Thu, 30 Jul 2015 19:12:15 -0700
Subject: [PATCH 0822/1208] i965/bxt: Use more conservative thread counts

Since we really do not know what may occur in the future, pick a more
conservative value for thread counts until we know better what values are
correct. As far as I can tell, the old values will work fine, but some of the
registers seem to indicate that going even lower is possible and the purpose of
having early support is to enable as many configurations that can possibly
exist (we can trim things down after platforms begin shipping later).

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index f313b2b3c53..75d1ecb68de 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -340,8 +340,10 @@ static const struct brw_device_info brw_device_info_bxt = {
    .is_broxton = 1,
    .gt = 1,
    .has_llc = false,
-   .max_vs_threads = 112,
-   .max_gs_threads = 112,
+
+   /* XXX: These are preliminary thread counts and URB sizes. */
+   .max_vs_threads = 56,
+   .max_gs_threads = 56,
    .max_wm_threads = 32,
    .urb = {
       .size = 64,

From 383558c56427b0e8b4e56cce8737771ad053f753 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Thu, 30 Jul 2015 19:16:32 -0700
Subject: [PATCH 0823/1208] i965/gen9: Add hs, ds, and cs thread + urb info

For SKL: These are the production values.

For BXT: These are low estimates to enable platforms.

This patch was originally part of
i965/skl: Add production thread counts and URB size
but was split out at Jordan's request (which I found to be reasonable).

Note on stable inclusion: 10.6 does not care about hs, and ds. It does care
about cs, but since Jordan was the one that asked me to extract it, I'll leave
it up to him to deal with a backport to stable is required.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 75d1ecb68de..fcc243e1d4c 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -310,11 +310,16 @@ static const struct brw_device_info brw_device_info_chv = {
    .supports_simd16_3src = true,                    \
    .max_vs_threads = 336,                           \
    .max_gs_threads = 336,                           \
+   .max_hs_threads = 336,                           \
+   .max_ds_threads = 336,                           \
    .max_wm_threads = 64 * 6,                        \
+   .max_cs_threads = 56,                            \
    .urb = {                                         \
       .size = 192,                                  \
       .min_vs_entries = 64,                         \
       .max_vs_entries = 1856,                       \
+      .max_hs_entries = 672,                        \
+      .max_ds_entries = 1120,                       \
       .max_gs_entries = 640,                        \
    }
 
@@ -343,12 +348,17 @@ static const struct brw_device_info brw_device_info_bxt = {
 
    /* XXX: These are preliminary thread counts and URB sizes. */
    .max_vs_threads = 56,
+   .max_hs_threads = 56,
+   .max_ds_threads = 56,
    .max_gs_threads = 56,
    .max_wm_threads = 32,
+   .max_cs_threads = 28,
    .urb = {
       .size = 64,
       .min_vs_entries = 34,
       .max_vs_entries = 640,
+      .max_hs_entries = 80,
+      .max_ds_entries = 80,
       .max_gs_entries = 256,
    }
 };

From 781dc7c0e1f41502f18e07c0940af949a78d2792 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 30 Jul 2015 14:45:57 +0300
Subject: [PATCH 0824/1208] i965/fs: Fix regression with SIMD8 VS since
 b5f1a48e234d47b24df38cb562cffb8941d43795.

With num_direct_uniforms == 0 there's no space allocated in the
param_size array for the one block of direct uniforms -- On the FS
stage this would be a harmless no-op because it would simply re-set
one of the param_size entries allocated for the sampler units to zero,
but on the VS stage it has been reported to cause memory corruption
followed by a crash -- Surprising how a full piglit run on Gen8 didn't
catch it.

Reported-and-reviewed-by: "Lofstedt, Marta" <marta.lofstedt@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 722e4e75a82..9cb7b0db3f6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -182,7 +182,8 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
    /* We split the uniform register file in half.  The first half is
     * entirely direct uniforms.  The second half is indirect.
     */
-   param_size[0] = num_direct_uniforms;
+   if (num_direct_uniforms > 0)
+      param_size[0] = num_direct_uniforms;
    if (shader->num_uniforms > num_direct_uniforms)
       param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms;
 

From b0528118dfb1af00e7d08cdb637191b80c14c2ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 00:53:16 +0200
Subject: [PATCH 0825/1208] radeonsi: completely rework updating descriptors
 without CP DMA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The patch has a better explanation. Just a summary here:
- The CPU always uploads a whole descriptor array to previously-unused memory.
- CP DMA isn't used.
- No caches need to be flushed.
- All descriptors are always up-to-date in memory even after a hang, because
  CP DMA doesn't serve as a middle man to update them.

This should bring:
- better hang recovery (descriptors are always up-to-date)
- better GPU performance (no KCACHE and TC flushes)
- worse CPU performance for partial updates (only whole arrays are uploaded)
- less used IB space (no CP_DMA and WRITE_DATA packets)
- simpler code
- hopefully, some of the corruption issues with SI cards will go away.
  If not, we'll know the issue is not here.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 356 ++++++------------
 src/gallium/drivers/radeonsi/si_pipe.h        |   6 -
 src/gallium/drivers/radeonsi/si_state.h       |  32 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |   7 +-
 4 files changed, 129 insertions(+), 272 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 14bb6e1fa21..48ec9b72043 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -24,14 +24,23 @@
  *      Marek Olšák <marek.olsak@amd.com>
  */
 
-/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
- * live in memory on SI.
+/* Resource binding slots and sampler states (each described with 8 or
+ * 4 dwords) are stored in lists in memory which is accessed by shaders
+ * using scalar load instructions.
  *
- * This file is responsible for managing lists of resources and sampler states
- * in memory and binding them, which means updating those structures in memory.
+ * This file is responsible for managing such lists. It keeps a copy of all
+ * descriptors in CPU memory and re-uploads a whole list if some slots have
+ * been changed.
  *
- * There is also code for updating shader pointers to resources and sampler
- * states. CP DMA functions are here too.
+ * This code is also reponsible for updating shader pointers to those lists.
+ *
+ * Note that CP DMA can't be used for updating the lists, because a GPU hang
+ * could leave the list in a mid-IB state and the next IB would get wrong
+ * descriptors and the whole context would be unusable at that point.
+ * (Note: The register shadowing can't be used due to the same reason)
+ *
+ * Also, uploading descriptors to newly allocated memory doesn't require
+ * a KCACHE flush.
  */
 
 #include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-#define SI_NUM_CONTEXTS 16
 
 /* NULL image and buffer descriptor.
  *
@@ -139,159 +147,62 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 	}
 }
 
-static void si_init_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
+static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements,
-				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
+				unsigned num_elements)
 {
-	assert(num_elements <= sizeof(desc->enabled_mask)*8);
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
+	int i;
 
-	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_offset = shader_userdata_index * 4;
+	assert(num_elements <= sizeof(desc->enabled_mask)*8);
+
+	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
-	desc->context_size = num_elements * element_dw_size * 4;
+	desc->list_dirty = true; /* upload the list before the next draw */
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	desc->buffer = (struct r600_resource*)
-		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				   PIPE_USAGE_DEFAULT,
-				   SI_NUM_CONTEXTS * desc->context_size);
-
-	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
-			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-
-	/* We don't check for CS space here, because this should be called
-	 * only once at context initialization. */
-	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
-				    desc->buffer->b.b.width0, 0,
-				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
+	/* Initialize the array to NULL descriptors if the element size is 8. */
+	if (element_dw_size == 8)
+		for (i = 0; i < num_elements; i++)
+			memcpy(desc->list + i*element_dw_size, null_descriptor,
+			       sizeof(null_descriptor));
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+	FREE(desc->list);
 }
 
-static void si_update_descriptors(struct si_context *sctx,
+static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc)
 {
-	if (desc->dirty_mask) {
-		desc->atom.num_dw =
-			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
+	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	void *ptr;
 
-		desc->atom.dirty = true;
-		desc->pointer_dirty = true;
-		sctx->shader_userdata.atom.dirty = true;
+	if (!desc->list_dirty)
+		return true;
 
-		/* TODO: Investigate if these flushes can be removed after
-		 * adding CE support. */
+	u_upload_alloc(sctx->b.uploader, 0, list_size,
+		       &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, &ptr);
+	if (!desc->buffer)
+		return false; /* skip the draw call */
 
-		/* The descriptors are read with the K cache. */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
+	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-		/* Since SI uses uncached CP DMA to update descriptors,
-		 * we have to flush TC L2, which is used to fetch constants
-		 * along with KCACHE. */
-		if (sctx->b.chip_class == SI)
-			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
-	} else {
-		desc->atom.dirty = false;
-	}
-}
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-static void si_emit_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
-				uint32_t **descriptors)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va_base;
-	int packet_start = 0;
-	int packet_size = 0;
-	int last_index = desc->num_elements; /* point to a non-existing element */
-	uint64_t dirty_mask = desc->dirty_mask;
-	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
-
-	assert(dirty_mask);
-
-	va_base = desc->buffer->gpu_address;
-
-	/* Copy the descriptors to a new context slot. */
-	si_emit_cp_dma_copy_buffer(sctx,
-				   va_base + new_context_id * desc->context_size,
-				   va_base + desc->current_context_id * desc->context_size,
-				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
-
-	va_base += new_context_id * desc->context_size;
-
-	/* Update the descriptors.
-	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
-	 *
-	 * XXX When unbinding lots of resources, consider clearing the memory
-	 *     with CP DMA instead of emitting zeros.
-	 */
-	while (dirty_mask) {
-		int i = u_bit_scan64(&dirty_mask);
-
-		assert(i < desc->num_elements);
-
-		if (last_index+1 == i && packet_size) {
-			/* Append new data at the end of the last packet. */
-			packet_size += desc->element_dw_size;
-			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
-		} else {
-			/* Start a new packet. */
-			uint64_t va = va_base + i * desc->element_dw_size * 4;
-
-			packet_start = cs->cdw;
-			packet_size = 2 + desc->element_dw_size;
-
-			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
-						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
-						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
-					     PKT3_WRITE_DATA_WR_CONFIRM |
-					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
-			radeon_emit(cs, va & 0xFFFFFFFFUL);
-			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
-		}
-
-		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
-
-		last_index = i;
-	}
-
-	desc->dirty_mask = 0;
-	desc->current_context_id = new_context_id;
+	desc->list_dirty = false;
+	desc->pointer_dirty = true;
+	sctx->shader_userdata.atom.dirty = true;
+	return true;
 }
 
 /* SAMPLER VIEWS */
 
-static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_views *views = (struct si_sampler_views*)atom;
-
-	si_emit_descriptors(sctx, &views->desc, views->desc_data);
-}
-
-static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views)
-{
-	int i;
-
-	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
-			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
-
-	for (i = 0; i < views->desc.num_elements; i++) {
-		views->desc_data[i] = null_descriptor;
-		views->desc.dirty_mask |= 1llu << i;
-	}
-	si_update_descriptors(sctx, &views->desc);
-}
-
 static void si_release_sampler_views(struct si_sampler_views *views)
 {
 	int i;
@@ -332,6 +243,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 				      si_get_resource_ro_priority(rview->resource));
 	}
 
+	if (!views->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -354,17 +267,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 				rview->resource, RADEON_USAGE_READ,
 				si_get_resource_ro_priority(rview->resource));
 
-
 		pipe_sampler_view_reference(&views->views[slot], view);
-		views->desc_data[slot] = view_desc;
+		memcpy(views->desc.list + slot*8, view_desc, 8*4);
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		views->desc_data[slot] = null_descriptor;
+		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	views->desc.dirty_mask |= 1llu << slot;
+	views->desc.list_dirty = true;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -423,22 +335,15 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 					    NULL, NULL);
 		}
 	}
-
-	si_update_descriptors(sctx, &samplers->views.desc);
 }
 
 /* SAMPLER STATES */
 
-static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_states *states = (struct si_sampler_states*)atom;
-
-	si_emit_descriptors(sctx, &states->desc, states->desc_data);
-}
-
 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 					   struct si_sampler_states *states)
 {
+	if (!states->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -460,64 +365,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 	for (i = 0; i < count; i++) {
 		unsigned slot = start + i;
 
-		if (!sstates[i]) {
-			samplers->desc.dirty_mask &= ~(1llu << slot);
+		if (!sstates[i])
 			continue;
-		}
 
-		samplers->desc_data[slot] = sstates[i]->val;
-		samplers->desc.dirty_mask |= 1llu << slot;
+		memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
+		samplers->desc.list_dirty = true;
 	}
-
-	si_update_descriptors(sctx, &samplers->desc);
 }
 
 /* BUFFER RESOURCES */
 
-static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
-
-	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
-}
-
-static void si_init_buffer_resources(struct si_context *sctx,
-				     struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
 {
-	int i;
-
-	buffers->num_buffers = num_buffers;
 	buffers->shader_usage = shader_usage;
 	buffers->priority = priority;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
 
-	/* si_emit_descriptors only accepts an array of arrays.
-	 * This adds such an array. */
-	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
-	for (i = 0; i < num_buffers; i++) {
-		buffers->desc_data[i] = &buffers->desc_storage[i*4];
-	}
-
-	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
-			    num_buffers, si_emit_buffer_resources);
+	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
+			    num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
 {
 	int i;
 
-	for (i = 0; i < buffers->num_buffers; i++) {
+	for (i = 0; i < buffers->desc.num_elements; i++) {
 		pipe_resource_reference(&buffers->buffers[i], NULL);
 	}
 
 	FREE(buffers->buffers);
-	FREE(buffers->desc_storage);
-	FREE(buffers->desc_data);
 	si_release_descriptors(&buffers->desc);
 }
 
@@ -535,6 +415,8 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 				      buffers->shader_usage, buffers->priority);
 	}
 
+	if (!buffers->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
@@ -560,12 +442,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 	}
+
+	if (!desc->buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 }
 
-void si_update_vertex_buffers(struct si_context *sctx)
+static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
 	struct si_descriptors *desc = &sctx->vertex_buffers;
 	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -573,8 +458,10 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	uint64_t va;
 	uint32_t *ptr;
 
+	if (!sctx->vertex_buffers_dirty)
+		return true;
 	if (!count || !sctx->vertex_elements)
-		return;
+		return true;
 
 	/* Vertex buffer descriptors are the only ones which are uploaded
 	 * directly through a staging buffer and don't go through
@@ -582,13 +469,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	 */
 	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+	if (!desc->buffer)
+		return false;
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 
 	assert(count <= SI_NUM_VERTEX_BUFFERS);
-	assert(desc->current_context_id == 0);
 
 	for (i = 0; i < count; i++) {
 		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -640,6 +528,8 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	 * cache is needed. */
 	desc->pointer_dirty = true;
 	sctx->shader_userdata.atom.dirty = true;
+	sctx->vertex_buffers_dirty = false;
+	return true;
 }
 
 
@@ -664,7 +554,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -691,7 +581,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(0);
@@ -710,12 +600,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* RING BUFFERS */
@@ -735,7 +624,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 	/* The stride field in the resource descriptor has 14 bits */
 	assert(stride < (1 << 14));
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	if (buffer) {
@@ -780,7 +669,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(stride) |
@@ -803,12 +692,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* STREAMOUT BUFFERS */
@@ -870,7 +758,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			uint64_t va = r600_resource(buffer)->gpu_address;
 
 			/* Set the descriptor. */
-			uint32_t *desc = buffers->desc_data[bufidx];
+			uint32_t *desc = buffers->desc.list + bufidx*4;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
 			desc[2] = 0xffffffff;
@@ -888,24 +776,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			buffers->desc.enabled_mask |= 1llu << bufidx;
 		} else {
 			/* Clear the descriptor and unset the resource. */
-			memset(buffers->desc_data[bufidx], 0,
+			memset(buffers->desc.list + bufidx*4, 0,
 			       sizeof(uint32_t) * 4);
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						NULL);
 			buffers->desc.enabled_mask &= ~(1llu << bufidx);
 		}
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 	for (; i < old_num_targets; i++) {
 		bufidx = SI_SO_BUF_OFFSET + i;
 		/* Clear the descriptor and unset the resource. */
-		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
 		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
 		buffers->desc.enabled_mask &= ~(1llu << bufidx);
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -974,22 +860,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Read/Write buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
-
 				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
 					/* Update the streamout state. */
 					if (sctx->b.streamout.begin_emitted) {
@@ -1001,34 +884,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 				}
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Constant buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
-
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1040,23 +914,20 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Texture buffers - update bindings. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_sampler_views *views = &sctx->samplers[shader].views;
-		bool found = false;
 		uint64_t mask = views->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (views->views[i]->texture == buf) {
+				si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
+							    old_va, buf);
+				views->desc.list_dirty = true;
+
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SHADER_BUFFER_RO);
-
-				views->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &views->desc);
-		}
 	}
 }
 
@@ -1297,11 +1168,10 @@ static void si_emit_shader_pointer(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	uint64_t va;
 
-	if (!desc->pointer_dirty)
+	if (!desc->pointer_dirty || !desc->buffer)
 		return;
 
 	va = desc->buffer->gpu_address +
-	     desc->current_context_id * desc->context_size +
 	     desc->buffer_offset;
 
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
@@ -1351,34 +1221,28 @@ static void si_emit_shader_userdata(struct si_context *sctx,
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
 
-/* INIT/DEINIT */
+/* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
+		si_init_buffer_resources(&sctx->const_buffers[i],
 					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
-		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
+		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views);
-
-		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
-				    si_emit_sampler_states);
-
-		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
-		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
-		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
-		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
+		si_init_descriptors(&sctx->samplers[i].views.desc,
+				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+		si_init_descriptors(&sctx->samplers[i].states.desc,
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1401,6 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
+bool si_upload_shader_descriptors(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
+			return false;
+	}
+	return si_upload_vertex_buffer_descriptors(sctx);
+}
+
 void si_release_all_descriptors(struct si_context *sctx)
 {
 	int i;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7b2263b1162..28cb4e990ae 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -142,12 +142,6 @@ struct si_context {
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *const_buffers[SI_NUM_SHADERS];
-			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
-			struct r600_atom *sampler_views[SI_NUM_SHADERS];
-			struct r600_atom *sampler_states[SI_NUM_SHADERS];
-			/* Caches must be flushed after resource descriptors are
-			 * updated in memory. */
 			struct r600_atom *cache_flush;
 			struct r600_atom *streamout_begin;
 			struct r600_atom *streamout_enable; /* must be after streamout_begin */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index e4d859a4fb7..e6bacdf22fb 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -158,60 +158,48 @@ struct si_shader_data {
 #define SI_NUM_VERTEX_BUFFERS	16
 
 
-/* This represents resource descriptors in memory, such as buffer resources,
+/* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-	struct r600_atom atom;
-
-	/* The size of one resource descriptor. */
+	/* The list of descriptors in malloc'd memory. */
+	uint32_t *list;
+	/* The size of one descriptor. */
 	unsigned element_dw_size;
-	/* The maximum number of resource descriptors. */
+	/* The maximum number of descriptors. */
 	unsigned num_elements;
+	/* Whether the list has been changed and should be re-uploaded. */
+	bool list_dirty;
 
-	/* The buffer where resource descriptors are stored. */
+	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
 	unsigned buffer_offset;
 
-	/* The i-th bit is set if that element is dirty (changed but not emitted). */
-	uint64_t dirty_mask;
 	/* The i-th bit is set if that element is enabled (non-NULL resource). */
 	uint64_t enabled_mask;
 
-	/* We can't update descriptors directly because the GPU might be
-	 * reading them at the same time, so we have to update them
-	 * in a copy-on-write manner. Each such copy is called a context,
-	 * which is just another array descriptors in the same buffer. */
-	unsigned current_context_id;
-	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
-	unsigned context_size;
-
 	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
 	unsigned shader_userdata_offset;
+	/* Whether the pointer should be re-emitted. */
 	bool pointer_dirty;
 };
 
 struct si_sampler_views {
 	struct si_descriptors		desc;
 	struct pipe_sampler_view	*views[SI_NUM_SAMPLER_VIEWS];
-	uint32_t			*desc_data[SI_NUM_SAMPLER_VIEWS];
 };
 
 struct si_sampler_states {
 	struct si_descriptors		desc;
-	uint32_t			*desc_data[SI_NUM_SAMPLER_STATES];
 	void				*saved_states[2]; /* saved for u_blitter */
 };
 
 struct si_buffer_resources {
 	struct si_descriptors		desc;
-	unsigned			num_buffers;
 	enum radeon_bo_usage		shader_usage; /* READ, WRITE, or READWRITE */
 	enum radeon_bo_priority		priority;
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
-	uint32_t			*desc_storage; /* this has num_buffers*4 elements */
-	uint32_t			**desc_data; /* an array of pointers pointing to desc_storage */
 };
 
 #define si_pm4_block_idx(member) \
@@ -247,13 +235,13 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 				unsigned start, unsigned count, void **states);
-void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
 			unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
+bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
 void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ec8dd84c9dd..e8faf405afc 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -743,11 +743,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		sctx->current_rast_prim = info->mode;
 
 	si_update_shaders(sctx);
-
-	if (sctx->vertex_buffers_dirty) {
-		si_update_vertex_buffers(sctx);
-		sctx->vertex_buffers_dirty = false;
-	}
+	if (!si_upload_shader_descriptors(sctx))
+		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */

From 488a83637fe726d445775ee301e42003f749cb9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 15:55:45 +0200
Subject: [PATCH 0826/1208] gallium/util: clear up that debug_get_flags_option
 returns a 64-bit mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Kai Wasserbäch <kai@dev.carbon-project.org>
---
 src/gallium/auxiliary/util/u_debug.c | 8 ++++----
 src/gallium/auxiliary/util/u_debug.h | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index cf6eca77a39..b4503deb8f6 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -256,12 +256,12 @@ static boolean str_has_option(const char *str, const char *name)
    return FALSE;
 }
 
-unsigned long
+uint64_t
 debug_get_flags_option(const char *name, 
                        const struct debug_named_value *flags,
-                       unsigned long dfault)
+                       uint64_t dfault)
 {
-   unsigned long result;
+   uint64_t result;
    const char *str;
    const struct debug_named_value *orig = flags;
    unsigned namealign = 0;
@@ -276,7 +276,7 @@ debug_get_flags_option(const char *name,
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
          _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
-                      (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value,
+                      (int)sizeof(uint64_t)*CHAR_BIT/4, flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
    else {
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index b4286d32b32..926063a1918 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -269,7 +269,7 @@ void _debug_assert_fail(const char *expr,
 struct debug_named_value
 {
    const char *name;
-   unsigned long value;
+   uint64_t value;
    const char *desc;
 };
 
@@ -377,10 +377,10 @@ debug_get_bool_option(const char *name, boolean dfault);
 long
 debug_get_num_option(const char *name, long dfault);
 
-unsigned long
+uint64_t
 debug_get_flags_option(const char *name, 
                        const struct debug_named_value *flags,
-                       unsigned long dfault);
+                       uint64_t dfault);
 
 #define DEBUG_GET_ONCE_BOOL_OPTION(sufix, name, dfault) \
 static boolean \

From 0c805b6240769891d55db601f91b8dd84d69d43d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 11:17:48 +0200
Subject: [PATCH 0827/1208] gallivm: add LLVMAttribute parameter to
 lp_build_intrinsic

This will help remove some duplicated code from radeon.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c  |  6 +++---
 src/gallium/auxiliary/gallivm/lp_bld_intr.c  | 12 ++++++++----
 src/gallium/auxiliary/gallivm/lp_bld_intr.h  |  3 ++-
 src/gallium/auxiliary/gallivm/lp_bld_logic.c |  2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c  |  2 +-
 src/gallium/drivers/r600/r600_llvm.c         |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c     | 16 ++++++++--------
 7 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 1d327ce48a9..50ae192325b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1717,7 +1717,7 @@ lp_build_round_sse41(struct lp_build_context *bld,
       args[2] = LLVMConstInt(i32t, mode, 0);
 
       res = lp_build_intrinsic(builder, intrinsic,
-                               vec_type, args, Elements(args));
+                               vec_type, args, Elements(args), 0);
 
       res = LLVMBuildExtractElement(builder, res, index0, "");
    }
@@ -3547,7 +3547,7 @@ lp_build_fpstate_get(struct gallivm_state *gallivm)
       lp_build_intrinsic(builder,
                          "llvm.x86.sse.stmxcsr",
                          LLVMVoidTypeInContext(gallivm->context),
-                         &mxcsr_ptr8, 1);
+                         &mxcsr_ptr8, 1, 0);
       return mxcsr_ptr;
    }
    return 0;
@@ -3594,6 +3594,6 @@ lp_build_fpstate_set(struct gallivm_state *gallivm,
       lp_build_intrinsic(builder,
                          "llvm.x86.sse.ldmxcsr",
                          LLVMVoidTypeInContext(gallivm->context),
-                         &mxcsr_ptr, 1);
+                         &mxcsr_ptr, 1, 0);
    }
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2bf1211bcd7..30f4863ec44 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -81,7 +81,8 @@ lp_build_intrinsic(LLVMBuilderRef builder,
                    const char *name,
                    LLVMTypeRef ret_type,
                    LLVMValueRef *args,
-                   unsigned num_args)
+                   unsigned num_args,
+                   LLVMAttribute attr)
 {
    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
    LLVMValueRef function;
@@ -99,6 +100,9 @@ lp_build_intrinsic(LLVMBuilderRef builder,
       }
 
       function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
+
+      if (attr)
+          LLVMAddFunctionAttr(function, attr);
    }
 
    return LLVMBuildCall(builder, function, args, num_args, "");
@@ -111,7 +115,7 @@ lp_build_intrinsic_unary(LLVMBuilderRef builder,
                          LLVMTypeRef ret_type,
                          LLVMValueRef a)
 {
-   return lp_build_intrinsic(builder, name, ret_type, &a, 1);
+   return lp_build_intrinsic(builder, name, ret_type, &a, 1, 0);
 }
 
 
@@ -127,7 +131,7 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
    args[0] = a;
    args[1] = b;
 
-   return lp_build_intrinsic(builder, name, ret_type, args, 2);
+   return lp_build_intrinsic(builder, name, ret_type, args, 2, 0);
 }
 
 
@@ -242,7 +246,7 @@ lp_build_intrinsic_map(struct gallivm_state *gallivm,
       LLVMValueRef res_elem;
       for(j = 0; j < num_args; ++j)
          arg_elems[j] = LLVMBuildExtractElement(builder, args[j], index, "");
-      res_elem = lp_build_intrinsic(builder, name, ret_elem_type, arg_elems, num_args);
+      res_elem = lp_build_intrinsic(builder, name, ret_elem_type, arg_elems, num_args, 0);
       res = LLVMBuildInsertElement(builder, res, res_elem, index, "");
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index 38c5c29c980..a54b367961a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -59,7 +59,8 @@ lp_build_intrinsic(LLVMBuilderRef builder,
                    const char *name,
                    LLVMTypeRef ret_type,
                    LLVMValueRef *args,
-                   unsigned num_args);
+                   unsigned num_args,
+                   LLVMAttribute attr);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 80b53e5c3f8..19d30d0d63c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -395,7 +395,7 @@ lp_build_select(struct lp_build_context *bld,
       args[2] = mask;
 
       res = lp_build_intrinsic(builder, intrinsic,
-                               arg_type, args, Elements(args));
+                               arg_type, args, Elements(args), 0);
 
       if (arg_type != bld->vec_type) {
          res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index 1887956d1c4..c4ae30461cb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -104,7 +104,7 @@ lp_build_tgsi_intrinsic(
    struct lp_build_context * base = &bld_base->base;
    emit_data->output[emit_data->chan] = lp_build_intrinsic(
                base->gallivm->builder, action->intr_name,
-               emit_data->dst_type, emit_data->args, emit_data->arg_count);
+               emit_data->dst_type, emit_data->args, emit_data->arg_count, 0);
 }
 
 LLVMValueRef
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 72e2dc42f7e..15fc2566fb0 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -332,7 +332,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer);
 			args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component);
 			lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output",
-				LLVMVoidTypeInContext(base->gallivm->context), args, 4);
+				LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0);
 		}
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d8bab875adb..10fc52d3c29 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1528,7 +1528,7 @@ static void build_tbuffer_store(struct si_shader_context *shader,
 
 	lp_build_intrinsic(gallivm->builder, name,
 			   LLVMVoidTypeInContext(gallivm->context),
-			   args, Elements(args));
+			   args, Elements(args), 0);
 }
 
 static void build_tbuffer_store_dwords(struct si_shader_context *shader,
@@ -1757,7 +1757,7 @@ handle_semantic:
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		}
 
 		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
@@ -1845,7 +1845,7 @@ handle_semantic:
 		lp_build_intrinsic(base->gallivm->builder,
 				   "llvm.SI.export",
 				   LLVMVoidTypeInContext(base->gallivm->context),
-				   pos_args[i], 9);
+				   pos_args[i], 9, 0);
 	}
 }
 
@@ -2122,7 +2122,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 				lp_build_intrinsic(base->gallivm->builder,
 						   "llvm.SI.export",
 						   LLVMVoidTypeInContext(base->gallivm->context),
-						   last_args, 9);
+						   last_args, 9, 0);
 			}
 
 			/* This instruction will be emitted at the end of the shader. */
@@ -2139,14 +2139,14 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 					lp_build_intrinsic(base->gallivm->builder,
 							   "llvm.SI.export",
 							   LLVMVoidTypeInContext(base->gallivm->context),
-							   args, 9);
+							   args, 9, 0);
 				}
 			}
 		} else {
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		}
 	}
 
@@ -2208,7 +2208,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		else
 			memcpy(last_args, args, sizeof(args));
 	}
@@ -2239,7 +2239,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	lp_build_intrinsic(base->gallivm->builder,
 			   "llvm.SI.export",
 			   LLVMVoidTypeInContext(base->gallivm->context),
-			   last_args, 9);
+			   last_args, 9, 0);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,

From b9dad585e66b1031bdcbb148a19524ee2705baf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 11:26:18 +0200
Subject: [PATCH 0828/1208] gallium/radeon: remove build_intrinsic and
 build_tgsi_intrinsic

duplicated now

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_llvm.c          | 34 +++++-----
 src/gallium/drivers/radeon/radeon_llvm.h      |  8 ---
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 68 ++++---------------
 src/gallium/drivers/radeonsi/si_shader.c      | 56 +++++++--------
 4 files changed, 58 insertions(+), 108 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 15fc2566fb0..686c92cda06 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -84,7 +84,7 @@ static void llvm_load_system_value(
 #else
 	LLVMValueRef reg = lp_build_const_int32(
 			ctx->soa.bld_base.base.gallivm, chan);
-	ctx->system_values[index] = build_intrinsic(
+	ctx->system_values[index] = lp_build_intrinsic(
 			ctx->soa.bld_base.base.gallivm->builder,
 			"llvm.R600.load.input",
 			ctx->soa.bld_base.base.elem_type, &reg, 1,
@@ -111,9 +111,9 @@ llvm_load_input_vector(
 			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
 				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), "");
 			LLVMValueRef HalfVec[2] = {
-				build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
+				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
 					VecType, Args, ArgCount, LLVMReadNoneAttribute),
-				build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
+				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
 					VecType, Args, ArgCount, LLVMReadNoneAttribute)
 			};
 			LLVMValueRef MaskInputs[4] = {
@@ -127,7 +127,7 @@ llvm_load_input_vector(
 				Mask, "");
 		} else {
 			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4);
-			return build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
+			return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
 				VecType, Args, ArgCount, LLVMReadNoneAttribute);
 		}
 }
@@ -153,7 +153,7 @@ llvm_load_input_helper(
 		arg_count = 1;
 	}
 
-	return build_intrinsic(bb->gallivm->builder, intrinsic,
+	return lp_build_intrinsic(bb->gallivm->builder, intrinsic,
 		bb->elem_type, &arg[0], arg_count, LLVMReadNoneAttribute);
 }
 #endif
@@ -356,7 +356,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -373,7 +373,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
 						args[0] = output;
 						args[1] = base_vector;
-						adjusted_elements[chan] = build_intrinsic(base->gallivm->builder,
+						adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder,
 							"llvm.AMDGPU.dp4", bld_base->base.elem_type,
 							args, 2, LLVMReadNoneAttribute);
 					}
@@ -381,7 +381,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						adjusted_elements, 4);
 					args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 					args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-					build_intrinsic(
+					lp_build_intrinsic(
 						base->gallivm->builder,
 						"llvm.R600.store.swizzle",
 						LLVMVoidTypeInContext(base->gallivm->context),
@@ -394,14 +394,14 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
 					args, 3, 0);
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -418,7 +418,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = lp_build_gather_values(base->gallivm, elements, 4);
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -430,7 +430,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -449,7 +449,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						for (unsigned j = 0; j < ctx->color_buffer_count; j++) {
 							args[1] = lp_build_const_int32(base->gallivm, j);
 							args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-							build_intrinsic(
+							lp_build_intrinsic(
 								base->gallivm->builder,
 								"llvm.R600.store.swizzle",
 								LLVMVoidTypeInContext(base->gallivm->context),
@@ -458,7 +458,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 					} else {
 						args[1] = lp_build_const_int32(base->gallivm, color_count++);
 						args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-						build_intrinsic(
+						lp_build_intrinsic(
 							base->gallivm->builder,
 							"llvm.R600.store.swizzle",
 							LLVMVoidTypeInContext(base->gallivm->context),
@@ -543,7 +543,7 @@ static void llvm_emit_tex(
 		case TGSI_OPCODE_TXF: {
 			args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), "");
 			args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS);
-			emit_data->output[0] = build_intrinsic(gallivm->builder,
+			emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
 							"llvm.R600.load.texbuf",
 							emit_data->dst_type, args, 2, LLVMReadNoneAttribute);
 			if (ctx->chip_class >= EVERGREEN)
@@ -658,7 +658,7 @@ static void llvm_emit_tex(
 				lp_build_const_int32(gallivm, 1),
 				lp_build_const_int32(gallivm, 1)
 			};
-			LLVMValueRef ptr = build_intrinsic(gallivm->builder,
+			LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder,
 				"llvm.R600.ldptr",
 				emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute);
 			LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0],
@@ -679,7 +679,7 @@ static void llvm_emit_tex(
 		}
 	}
 
-	emit_data->output[0] = build_intrinsic(gallivm->builder,
+	emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
 					action->intr_name,
 					emit_data->dst_type, args, c, LLVMReadNoneAttribute);
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 950a51beff3..9e05c2447ee 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -194,14 +194,6 @@ unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan);
 
 void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx);
 
-LLVMValueRef
-build_intrinsic(LLVMBuilderRef builder,
-		const char *name,
-		LLVMTypeRef ret_type,
-		LLVMValueRef *args,
-		unsigned num_args,
-		LLVMAttribute attr);
-
 void
 build_tgsi_intrinsic_nomem(
 		const struct lp_build_tgsi_action * action,
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 2362979cf87..66602c9e311 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -760,14 +760,14 @@ static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base
 	unsigned i;
 
 	cube_vec = lp_build_gather_values(bld_base->base.gallivm, in, 4);
-	v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
+	v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
                             &cube_vec, 1, LLVMReadNoneAttribute);
 
 	for (i = 0; i < 4; ++i)
 		coords[i] = LLVMBuildExtractElement(builder, v,
 						    lp_build_const_int32(gallivm, i), "");
 
-	coords[2] = build_intrinsic(builder, "fabs",
+	coords[2] = lp_build_intrinsic(builder, "fabs",
 			type, &coords[2], 1, LLVMReadNoneAttribute);
 	coords[2] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_RCP, coords[2]);
 
@@ -1358,58 +1358,16 @@ static void emit_immediate(struct lp_build_tgsi_context * bld_base,
 	ctx->soa.num_immediates++;
 }
 
-LLVMValueRef
-build_intrinsic(LLVMBuilderRef builder,
-                   const char *name,
-                   LLVMTypeRef ret_type,
-                   LLVMValueRef *args,
-                   unsigned num_args,
-                   LLVMAttribute attr)
-{
-   LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
-   LLVMValueRef function;
-
-   function = LLVMGetNamedFunction(module, name);
-   if(!function) {
-      LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS];
-      unsigned i;
-
-      assert(num_args <= LP_MAX_FUNC_ARGS);
-
-      for(i = 0; i < num_args; ++i) {
-         assert(args[i]);
-         arg_types[i] = LLVMTypeOf(args[i]);
-      }
-
-      function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
-
-      if (attr)
-          LLVMAddFunctionAttr(function, attr);
-   }
-
-   return LLVMBuildCall(builder, function, args, num_args, "");
-}
-
-static void build_tgsi_intrinsic(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data,
- LLVMAttribute attr)
-{
-   struct lp_build_context * base = &bld_base->base;
-   emit_data->output[emit_data->chan] = build_intrinsic(
-               base->gallivm->builder, action->intr_name,
-               emit_data->dst_type, emit_data->args,
-               emit_data->arg_count, attr);
-}
-
 void
-build_tgsi_intrinsic_nomem(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
+build_tgsi_intrinsic_nomem(const struct lp_build_tgsi_action *action,
+			   struct lp_build_tgsi_context *bld_base,
+			   struct lp_build_emit_data *emit_data)
 {
-	build_tgsi_intrinsic(action, bld_base, emit_data, LLVMReadNoneAttribute);
+	struct lp_build_context * base = &bld_base->base;
+	emit_data->output[emit_data->chan] =
+		lp_build_intrinsic(base->gallivm->builder, action->intr_name,
+				   emit_data->dst_type, emit_data->args,
+				   emit_data->arg_count, LLVMReadNoneAttribute);
 }
 
 static void emit_bfi(const struct lp_build_tgsi_action * action,
@@ -1465,7 +1423,7 @@ static void emit_lsb(const struct lp_build_tgsi_action * action,
 	};
 
 	emit_data->output[emit_data->chan] =
-		build_intrinsic(gallivm->builder, "llvm.cttz.i32",
+		lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32",
 				emit_data->dst_type, args, Elements(args),
 				LLVMReadNoneAttribute);
 }
@@ -1484,7 +1442,7 @@ static void emit_umsb(const struct lp_build_tgsi_action * action,
 	};
 
 	LLVMValueRef msb =
-		build_intrinsic(builder, "llvm.ctlz.i32",
+		lp_build_intrinsic(builder, "llvm.ctlz.i32",
 				emit_data->dst_type, args, Elements(args),
 				LLVMReadNoneAttribute);
 
@@ -1511,7 +1469,7 @@ static void emit_imsb(const struct lp_build_tgsi_action * action,
 	LLVMValueRef arg = emit_data->args[0];
 
 	LLVMValueRef msb =
-		build_intrinsic(builder, "llvm.AMDGPU.flbit.i32",
+		lp_build_intrinsic(builder, "llvm.AMDGPU.flbit.i32",
 				emit_data->dst_type, &arg, 1,
 				LLVMReadNoneAttribute);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 10fc52d3c29..06faba3156e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -457,7 +457,7 @@ static void declare_input_vs(
 	args[0] = t_list;
 	args[1] = attribute_offset;
 	args[2] = buffer_index;
-	input = build_intrinsic(gallivm->builder,
+	input = lp_build_intrinsic(gallivm->builder,
 		"llvm.SI.vs.load.input", vec4_type, args, 3,
 		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
@@ -830,7 +830,7 @@ static LLVMValueRef fetch_input_gs(
 	args[8] = uint->zero; /* TFE */
 
 	return LLVMBuildBitCast(gallivm->builder,
-				build_intrinsic(gallivm->builder,
+				lp_build_intrinsic(gallivm->builder,
 						"llvm.SI.buffer.load.dword.i32.i32",
 						i32, args, 9,
 						LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
@@ -974,12 +974,12 @@ static void declare_input_fs(
 
 			args[0] = llvm_chan;
 			args[1] = attr_number;
-			front = build_intrinsic(gallivm->builder, intr_name,
+			front = lp_build_intrinsic(gallivm->builder, intr_name,
 						input_type, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
 			args[1] = back_attr_number;
-			back = build_intrinsic(gallivm->builder, intr_name,
+			back = lp_build_intrinsic(gallivm->builder, intr_name,
 					       input_type, args, args[3] ? 4 : 3,
 					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
@@ -1000,7 +1000,7 @@ static void declare_input_fs(
 		args[2] = params;
 		args[3] = interp_param;
 		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			build_intrinsic(gallivm->builder, intr_name,
+			lp_build_intrinsic(gallivm->builder, intr_name,
 					input_type, args, args[3] ? 4 : 3,
 					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
@@ -1018,7 +1018,7 @@ static void declare_input_fs(
 			args[2] = params;
 			args[3] = interp_param;
 			radeon_bld->inputs[soa_index] =
-				build_intrinsic(gallivm->builder, intr_name,
+				lp_build_intrinsic(gallivm->builder, intr_name,
 						input_type, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		}
@@ -1039,7 +1039,7 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou
 {
 	LLVMValueRef args[2] = {resource, offset};
 
-	return build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
+	return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
 			       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 }
 
@@ -1290,7 +1290,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 			args[0] = values[2 * chan];
 			args[1] = values[2 * chan + 1];
 			args[chan + 5] =
-				build_intrinsic(base->gallivm->builder,
+				lp_build_intrinsic(base->gallivm->builder,
 						"llvm.SI.packf16",
 						LLVMInt32TypeInContext(base->gallivm->context),
 						args, 2,
@@ -1372,12 +1372,12 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 					lp_build_const_float(gallivm, 1.0f),
 					lp_build_const_float(gallivm, -1.0f));
 
-		build_intrinsic(gallivm->builder,
+		lp_build_intrinsic(gallivm->builder,
 				"llvm.AMDGPU.kill",
 				LLVMVoidTypeInContext(gallivm->context),
 				&arg, 1, 0);
 	} else {
-		build_intrinsic(gallivm->builder,
+		lp_build_intrinsic(gallivm->builder,
 				"llvm.AMDGPU.kilp",
 				LLVMVoidTypeInContext(gallivm->context),
 				NULL, 0, 0);
@@ -1398,7 +1398,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
 				SI_PARAM_SAMPLE_COVERAGE);
 	coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
-	coverage = build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
+	coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
 				   bld_base->int_bld.elem_type,
 				   &coverage, 1, LLVMReadNoneAttribute);
 
@@ -1570,7 +1570,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 	LLVMValueRef so_vtx_count =
 		unpack_param(shader, shader->param_streamout_config, 16, 7);
 
-	LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32,
+	LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32,
 					   NULL, 0, LLVMReadNoneAttribute);
 
 	/* can_emit = tid < so_vtx_count; */
@@ -2031,7 +2031,7 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 
 	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
@@ -2678,7 +2678,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 				emit_data->inst->Texture.NumOffsets > 0 : false;
 
 	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder,
 			"llvm.SI.vs.load.input", emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
@@ -2727,7 +2727,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 			is_shadow ? ".c" : "", infix, has_offset ? ".o" : "",
 			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
 
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder, intr_name, emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
 			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -2774,7 +2774,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 		sprintf(intr_name, "%s.v%ui32", name,
 			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
 
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder, intr_name, emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
 			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -2918,7 +2918,7 @@ static void si_llvm_emit_ddxy(
 	i32 = LLVMInt32TypeInContext(gallivm->context);
 
 	indices[0] = bld_base->uint_bld.zero;
-	indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
 				     NULL, 0, LLVMReadNoneAttribute);
 	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
@@ -2994,8 +2994,8 @@ static LLVMValueRef si_llvm_emit_ddxy_interp(
 	i32 = LLVMInt32TypeInContext(gallivm->context);
 
 	indices[0] = bld_base->uint_bld.zero;
-	indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
-				     NULL, 0, LLVMReadNoneAttribute);
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+					NULL, 0, LLVMReadNoneAttribute);
 	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
@@ -3195,9 +3195,9 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 		args[3] = interp_param;
 
 		emit_data->output[chan] =
-			build_intrinsic(gallivm->builder, intr_name,
-					input_type, args, args[3] ? 4 : 3,
-					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+			lp_build_intrinsic(gallivm->builder, intr_name,
+					   input_type, args, args[3] ? 4 : 3,
+					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 	}
 }
 
@@ -3254,8 +3254,8 @@ static void si_llvm_emit_vertex(
 			       lp_build_const_float(gallivm, 1.0f),
 			       lp_build_const_float(gallivm, -1.0f));
 
-	build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
-			LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+			   LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
 
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
@@ -3289,7 +3289,7 @@ static void si_llvm_emit_vertex(
 	/* Signal vertex emission */
 	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
@@ -3309,7 +3309,7 @@ static void si_llvm_emit_primitive(
 	stream = si_llvm_get_stream(bld_base, emit_data);
 	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
@@ -3320,7 +3320,7 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 {
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-	build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
 			LLVMVoidTypeInContext(gallivm->context), NULL, 0,
 			LLVMNoUnwindAttribute);
 }
@@ -3907,7 +3907,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 			outputs[i].values[chan] =
 				LLVMBuildBitCast(gallivm->builder,
-						 build_intrinsic(gallivm->builder,
+						 lp_build_intrinsic(gallivm->builder,
 								 "llvm.SI.buffer.load.dword.i32.i32",
 								 LLVMInt32TypeInContext(gallivm->context),
 								 args, 9,

From 9a4c57afe48c391bb335f74c88b447f83704b413 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 16:53:29 +0200
Subject: [PATCH 0829/1208] gallium/radeon: remove unused variables and old
 comments

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_llvm.c          |  1 -
 src/gallium/drivers/r600/r600_shader.c        |  1 -
 src/gallium/drivers/radeon/radeon_llvm.h      | 19 -------------------
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 14 --------------
 4 files changed, 35 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 686c92cda06..f86554902e4 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -783,7 +783,6 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
 	bld_base->emit_prologue = llvm_emit_prologue;
 	bld_base->emit_epilogue = llvm_emit_epilogue;
-	ctx->userdata = ctx;
 	ctx->load_input = llvm_load_input;
 	ctx->load_system_value = llvm_load_system_value;
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index dda38f6f905..09f50f5d810 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2091,7 +2091,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
 		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
 		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.clip_vertex = ctx.cv_output;
 		radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
 		radeon_llvm_ctx.has_compressed_msaa_texturing =
 			ctx.bc->has_compressed_msaa_texturing;
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 9e05c2447ee..e967ad2214e 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -58,7 +58,6 @@ struct radeon_llvm_context {
 	unsigned type;
 	unsigned face_gpr;
 	unsigned two_side;
-	unsigned clip_vertex;
 	unsigned inputs_count;
 	struct r600_shader_io * r600_inputs;
 	struct r600_shader_io * r600_outputs;
@@ -72,21 +71,6 @@ struct radeon_llvm_context {
 
 	/*=== Front end configuration ===*/
 
-	/* Special Intrinsics */
-
-	/** Write to an output register: float store_output(float, i32) */
-	const char * store_output_intr;
-
-	/** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32)
-	 * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value
-	 * in 2-bits.
-	 * Swizzle{0-1} = X Channel
-	 * Swizzle{2-3} = Y Channel
-	 * Swizzle{4-5} = Z Channel
-	 * Swizzle{6-7} = W Channel
-	 */
-	const char * swizzle_intr;
-
 	/* Instructions that are not described by any of the TGSI opcodes. */
 
 	/** This function is responsible for initilizing the inputs array and will be
@@ -100,9 +84,6 @@ struct radeon_llvm_context {
 			unsigned index,
 			const struct tgsi_full_declaration *decl);
 
-	/** User data to use with the callbacks */
-	void * userdata;
-
 	/** This array contains the input values for the shader.  Typically these
 	  * values will be in the form of a target intrinsic that will inform the
 	  * backend how to load the actual inputs to the shader. 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 66602c9e311..5a131f0a019 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -918,14 +918,6 @@ static void tex_fetch_args(
 	struct lp_build_tgsi_context * bld_base,
 	struct lp_build_emit_data * emit_data)
 {
-	/* XXX: lp_build_swizzle_aos() was failing with wrong arg types,
-	 * when we used CHAN_ALL.  We should be able to get this to work,
-	 * but for now we will swizzle it ourselves
-	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
-						 0, CHAN_ALL);
-
-	*/
-
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 
 	LLVMValueRef coords[5];
@@ -1508,12 +1500,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 						ctx->gallivm.context);
 	ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context);
 
-	ctx->store_output_intr = "llvm.AMDGPU.store.output.";
-	ctx->swizzle_intr = "llvm.AMDGPU.swizzle";
 	struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base;
 
-	/* XXX: We need to revisit this.I think the correct way to do this is
-	 * to use length = 4 here and use the elem_bld for everything. */
 	type.floating = TRUE;
 	type.fixed = FALSE;
 	type.sign = TRUE;
@@ -1546,8 +1534,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	/* Allocate outputs */
 	ctx->soa.outputs = ctx->outputs;
 
-	/* XXX: Is there a better way to initialize all this ? */
-
 	lp_set_default_actions(bld_base);
 
 	bld_base->op_actions[TGSI_OPCODE_ABS].emit = build_tgsi_intrinsic_nomem;

From 681dbcf69040883e91423df56fcb34f4fee57110 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 17:12:39 +0200
Subject: [PATCH 0830/1208] gallium/radeon: move r600-specific code to r600g

Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
---
 src/gallium/drivers/r600/r600_llvm.c          | 154 +++++++++++++++++-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 150 -----------------
 2 files changed, 151 insertions(+), 153 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index f86554902e4..9cd4357f530 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -754,7 +754,131 @@ static struct lp_build_tgsi_action dot_action = {
 	.intr_name = "llvm.AMDGPU.dp4"
 };
 
+static void txd_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
 
+	LLVMValueRef coords[4];
+	unsigned chan, src;
+	for (src = 0; src < 3; src++) {
+		for (chan = 0; chan < 4; chan++)
+			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
+
+		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
+				coords, 4);
+	}
+	emit_data->arg_count = 3;
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+}
+
+
+static void txp_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	LLVMValueRef src_w;
+	unsigned chan;
+	LLVMValueRef coords[5];
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
+
+	for (chan = 0; chan < 3; chan++ ) {
+		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
+						emit_data->inst, 0, chan);
+		coords[chan] = lp_build_emit_llvm_binary(bld_base,
+					TGSI_OPCODE_DIV, arg, src_w);
+	}
+	coords[3] = bld_base->base.one;
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
+	}
+
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+	emit_data->arg_count = 1;
+}
+
+static void tex_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+
+	LLVMValueRef coords[5];
+	unsigned chan;
+	for (chan = 0; chan < 4; chan++) {
+		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
+	}
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
+		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
+		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
+		/* These instructions have additional operand that should be packed
+		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
+		 * That operand should be passed as a float value in the args array
+		 * right after the coord vector. After packing it's not used anymore,
+		 * that's why arg_count is not increased */
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+	}
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
+	}
+
+	emit_data->arg_count = 1;
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+}
+
+static void txf_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+	const struct tgsi_texture_offset * off = inst->TexOffsets;
+	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
+
+	/* fetch tex coords */
+	tex_fetch_args(bld_base, emit_data);
+
+	/* fetch tex offsets */
+	if (inst->Texture.NumOffsets) {
+		assert(inst->Texture.NumOffsets == 1);
+
+		emit_data->args[1] = LLVMConstBitCast(
+			bld->immediates[off->Index][off->SwizzleX],
+			offset_type);
+		emit_data->args[2] = LLVMConstBitCast(
+			bld->immediates[off->Index][off->SwizzleY],
+			offset_type);
+		emit_data->args[3] = LLVMConstBitCast(
+			bld->immediates[off->Index][off->SwizzleZ],
+			offset_type);
+	} else {
+		emit_data->args[1] = bld_base->int_bld.zero;
+		emit_data->args[2] = bld_base->int_bld.zero;
+		emit_data->args[3] = bld_base->int_bld.zero;
+	}
+
+	emit_data->arg_count = 4;
+}
 
 LLVMModuleRef r600_tgsi_llvm(
 	struct radeon_llvm_context * ctx,
@@ -790,18 +914,42 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
+	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
+	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
+	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
 	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
 	bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
 	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
 	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
+	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt;
 
 	lp_build_tgsi_llvm(bld_base, tokens);
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 5a131f0a019..a7fa7cc5be3 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -860,132 +860,6 @@ void radeon_llvm_emit_prepare_cube_coords(
 	memcpy(coords_arg, coords, sizeof(coords));
 }
 
-static void txd_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[4];
-	unsigned chan, src;
-	for (src = 0; src < 3; src++) {
-		for (chan = 0; chan < 4; chan++)
-			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
-
-		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
-				coords, 4);
-	}
-	emit_data->arg_count = 3;
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[5];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
-
-static void tex_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[5];
-	unsigned chan;
-	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
-	}
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
-		/* These instructions have additional operand that should be packed
-		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
-		 * That operand should be passed as a float value in the args array
-		 * right after the coord vector. After packing it's not used anymore,
-		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
-	}
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->arg_count = 1;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-static void txf_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	const struct tgsi_texture_offset * off = inst->TexOffsets;
-	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
-
-	/* fetch tex coords */
-	tex_fetch_args(bld_base, emit_data);
-
-	/* fetch tex offsets */
-	if (inst->Texture.NumOffsets) {
-		assert(inst->Texture.NumOffsets == 1);
-
-		emit_data->args[1] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleX],
-			offset_type);
-		emit_data->args[2] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleY],
-			offset_type);
-		emit_data->args[3] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleZ],
-			offset_type);
-	} else {
-		emit_data->args[1] = bld_base->int_bld.zero;
-		emit_data->args[2] = bld_base->int_bld.zero;
-		emit_data->args[3] = bld_base->int_bld.zero;
-	}
-
-	emit_data->arg_count = 4;
-}
-
 static void emit_icmp(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1565,10 +1439,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp;
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
-	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64";
 	bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem;
@@ -1640,26 +1510,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32";
 	bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg;
-	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
-	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
-	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
 	bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;

From 12a197b2d58125e4dbe2942204df1bbe3258e54b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 17:24:08 +0200
Subject: [PATCH 0831/1208] gallium/radeon: don't use rsq_action

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index a7fa7cc5be3..319380f9048 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1498,6 +1498,9 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32";
 	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest.";
+	bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name =
+		HAVE_LLVM >= 0x0305 ? "llvm.AMDGPU.rsq.clamped.f32" : "llvm.AMDGPU.rsq";
+	bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp;
 	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp;
 	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
@@ -1529,13 +1532,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
 	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
 	bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;
-
-	bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem;
-#if HAVE_LLVM >= 0x0305
-	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq.clamped.f32";
-#else
-	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq";
-#endif
 }
 
 void radeon_llvm_create_func(struct radeon_llvm_context * ctx,

From 8559f6ce62a9d5b52fa8189ba2352cd48bdabccf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 12:01:07 +0200
Subject: [PATCH 0832/1208] gallium/radeon: re-enable unsafe math for graphics
 shaders

This reverts commit 4db985a5fa9ea985616a726b1770727309502d81.

The grass no longer disappears, which was the reason the commit was reverted.
This might affect tessellation. We'll see.

Totals from affected shaders:
SGPRS: 151672 -> 150232 (-0.95 %)
VGPRS: 90620 -> 89776 (-0.93 %)
Code Size: 3980472 -> 3920836 (-1.50 %) bytes
LDS: 67 -> 67 (0.00 %) blocks
Scratch: 1357824 -> 1202176 (-11.46 %) bytes per wave

Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 973d6edff8a..c442c6566dc 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -82,6 +82,10 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 	sprintf(Str, "%1d", llvm_type);
 
 	LLVMAddTargetDependentFunctionAttr(F, "ShaderType", Str);
+
+	if (type != TGSI_PROCESSOR_COMPUTE) {
+		LLVMAddTargetDependentFunctionAttr(F, "unsafe-fp-math", "true");
+	}
 }
 
 static void init_r600_target()

From 1bbe40836306549414408bb7f30b9288c020db75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 12:17:19 +0200
Subject: [PATCH 0833/1208] radeonsi: don't use llvm.AMDIL.fraction for FRC and
 DFRAC

There are 2 reasons for this:
- LLVM optimization passes can work with floor
- there are patterns to select v_fract from floor anyway

There is no change in the generated code.
---
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 319380f9048..5c08cf5d339 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1170,6 +1170,20 @@ static void emit_dneg(
 			emit_data->args[0], "");
 }
 
+static void emit_frac(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+	LLVMValueRef floor = lp_build_intrinsic(builder, "floor", emit_data->dst_type,
+						&emit_data->args[0], 1,
+						LLVMReadNoneAttribute);
+	emit_data->output[emit_data->chan] = LLVMBuildFSub(builder,
+			emit_data->args[0], floor, "");
+}
+
 static void emit_f2i(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1432,8 +1446,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "fabs";
 	bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64";
-	bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_DFRAC].intr_name = "llvm.AMDIL.fraction.";
+	bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac;
 	bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg;
 	bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp;
@@ -1452,8 +1465,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "floor";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FMA].intr_name = "llvm.fma.f32";
-	bld_base->op_actions[TGSI_OPCODE_FRC].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction.";
+	bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac;
 	bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i;
 	bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u;
 	bld_base->op_actions[TGSI_OPCODE_FSEQ].emit = emit_fcmp;

From 7dd1f45bc41c4a936b0ff84400840524bb9f8871 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 11 Jul 2015 00:17:48 +0200
Subject: [PATCH 0834/1208] radeonsi: store shader disassemblies in memory for
 future users

This will be used by the new ddebug pipe. I'm including it now to avoid
conflicts with other patches.
---
 src/gallium/drivers/r600/evergreen_compute.c  |  2 +-
 src/gallium/drivers/radeon/r600_pipe_common.h |  5 ++---
 src/gallium/drivers/radeon/radeon_elf_util.c  | 13 +++++--------
 src/gallium/drivers/radeon/radeon_elf_util.h  |  2 +-
 src/gallium/drivers/radeon/radeon_llvm_emit.c |  2 +-
 src/gallium/drivers/radeonsi/si_compute.c     |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c      |  9 +++++++--
 7 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 4c3c34cd664..d89e3de9cc6 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -226,7 +226,7 @@ void *evergreen_create_compute_state(
 	}
 #else
 	memset(&shader->binary, 0, sizeof(shader->binary));
-	radeon_elf_read(code, header->num_bytes, &shader->binary, true);
+	radeon_elf_read(code, header->num_bytes, &shader->binary);
 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 
 	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index fcb758b2086..ea165fd0f60 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -129,9 +129,8 @@ struct radeon_shader_binary {
 	struct radeon_shader_reloc *relocs;
 	unsigned reloc_count;
 
-	/** Set to 1 if the disassembly for this binary has been dumped to
-	 *  stderr. */
-	int disassembled;
+	/** Disassembled shader in a string. */
+	char *disasm_string;
 };
 
 struct r600_resource {
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c
index 9b508227fd4..2e45d439e7a 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.c
+++ b/src/gallium/drivers/radeon/radeon_elf_util.c
@@ -103,8 +103,7 @@ static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
 }
 
 void radeon_elf_read(const char *elf_data, unsigned elf_size,
-					struct radeon_shader_binary *binary,
-					unsigned debug)
+		     struct radeon_shader_binary *binary)
 {
 	char *elf_buffer;
 	Elf *elf;
@@ -124,7 +123,6 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size,
 	elf = elf_memory(elf_buffer, elf_size);
 
 	elf_getshdrstrndx(elf, &section_str_index);
-	binary->disassembled = 0;
 
 	while ((section = elf_nextscn(elf, section))) {
 		const char *name;
@@ -145,12 +143,11 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size,
 			binary->config_size = section_data->d_size;
 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
 			memcpy(binary->config, section_data->d_buf, binary->config_size);
-		} else if (debug && !strcmp(name, ".AMDGPU.disasm")) {
-			binary->disassembled = 1;
+		} else if (!strcmp(name, ".AMDGPU.disasm")) {
+			/* Always read disassembly if it's available. */
 			section_data = elf_getdata(section, section_data);
-			fprintf(stderr, "\nShader Disassembly:\n\n");
-			fprintf(stderr, "%.*s\n", (int)section_data->d_size,
-						  (char *)section_data->d_buf);
+			binary->disasm_string = strndup(section_data->d_buf,
+							section_data->d_size);
 		} else if (!strncmp(name, ".rodata", 7)) {
 			section_data = elf_getdata(section, section_data);
 			binary->rodata_size = section_data->d_size;
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h
index ab83f98ea69..ea4ab2f14b2 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.h
+++ b/src/gallium/drivers/radeon/radeon_elf_util.h
@@ -37,7 +37,7 @@ struct radeon_shader_reloc;
  * radeon_shader_binary object.
  */
 void radeon_elf_read(const char *elf_data, unsigned elf_size,
-		struct radeon_shader_binary *binary, unsigned debug);
+		     struct radeon_shader_binary *binary);
 
 /**
  * @returns A pointer to the start of the configuration information for
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index c442c6566dc..04f62d13ca7 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -210,7 +210,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
 
-	radeon_elf_read(buffer_data, buffer_size, binary, dump);
+	radeon_elf_read(buffer_data, buffer_size, binary);
 
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 5e4225bd04c..d4fe5653687 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -137,7 +137,7 @@ static void *si_create_compute_state(
 	}
 #else
 
-	radeon_elf_read(code, header->num_bytes, &program->shader.binary, true);
+	radeon_elf_read(code, header->num_bytes, &program->shader.binary);
 
 	/* init_scratch_buffer patches the shader code with the scratch address,
 	 * so we need to call it before si_shader_binary_read() which uploads
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 06faba3156e..465eecd0346 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3811,7 +3811,10 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 	si_shader_binary_upload(sscreen, shader);
 
 	if (dump) {
-		if (!binary->disassembled) {
+		if (binary->disasm_string) {
+			fprintf(stderr, "\nShader Disassembly:\n\n");
+			fprintf(stderr, "%s\n", binary->disasm_string);
+		} else {
 			fprintf(stderr, "SI CODE:\n");
 			for (i = 0; i < binary->code_size; i+=4 ) {
 				fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
@@ -3849,7 +3852,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 	if (shader->scratch_bytes_per_wave == 0) {
 		FREE(shader->binary.code);
 		FREE(shader->binary.relocs);
-		memset(&shader->binary, 0, sizeof(shader->binary));
+		memset(&shader->binary, 0,
+		       offsetof(struct radeon_shader_binary, disasm_string));
 	}
 	return r;
 }
@@ -4176,4 +4180,5 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
 
 	FREE(shader->binary.code);
 	FREE(shader->binary.relocs);
+	FREE(shader->binary.disasm_string);
 }

From ac19a896d3de13b7d064d01c575f46f4191ef37c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 16:15:48 +0200
Subject: [PATCH 0835/1208] radeonsi: add a debug flag that disables printing
 the LLVM IR in shader dumps

This is for shader-db and should reduce size of shader dumps.
---
 src/gallium/drivers/r600/r600_llvm.c          |  2 +-
 src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.h | 23 ++++++++++---------
 src/gallium/drivers/radeon/radeon_llvm_emit.c |  9 ++++----
 src/gallium/drivers/radeon/radeon_llvm_emit.h | 10 ++++----
 src/gallium/drivers/radeonsi/si_shader.c      | 13 ++++++-----
 6 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 9cd4357f530..faf538ccbb5 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -1028,7 +1028,7 @@ unsigned r600_llvm_compile(
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index fb834b01dcd..c1dbdc740a0 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -335,6 +335,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "cs", DBG_CS, "Print compute shaders" },
 	{ "tcs", DBG_TCS, "Print tessellation control shaders" },
 	{ "tes", DBG_TES, "Print tessellation evaluation shaders" },
+	{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
 
 	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index ea165fd0f60..c2b83859b52 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -81,17 +81,18 @@
 #define DBG_CS			(1 << 9)
 #define DBG_TCS			(1 << 10)
 #define DBG_TES			(1 << 11)
+#define DBG_NO_IR		(1 << 12)
+/* Bits 21-31 are reserved for the r600g driver. */
 /* features */
-#define DBG_NO_ASYNC_DMA	(1 << 12)
-#define DBG_NO_HYPERZ		(1 << 13)
-#define DBG_NO_DISCARD_RANGE	(1 << 14)
-#define DBG_NO_2D_TILING	(1 << 15)
-#define DBG_NO_TILING		(1 << 16)
-#define DBG_SWITCH_ON_EOP	(1 << 17)
-#define DBG_FORCE_DMA		(1 << 18)
-#define DBG_PRECOMPILE		(1 << 19)
-#define DBG_INFO		(1 << 20)
-/* The maximum allowed bit is 20. */
+#define DBG_NO_ASYNC_DMA	(1llu << 32)
+#define DBG_NO_HYPERZ		(1llu << 33)
+#define DBG_NO_DISCARD_RANGE	(1llu << 34)
+#define DBG_NO_2D_TILING	(1llu << 35)
+#define DBG_NO_TILING		(1llu << 36)
+#define DBG_SWITCH_ON_EOP	(1llu << 37)
+#define DBG_FORCE_DMA		(1llu << 38)
+#define DBG_PRECOMPILE		(1llu << 39)
+#define DBG_INFO		(1llu << 40)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
@@ -272,7 +273,7 @@ struct r600_common_screen {
 	enum chip_class			chip_class;
 	struct radeon_info		info;
 	struct r600_tiling_info		tiling_info;
-	unsigned			debug_flags;
+	uint64_t			debug_flags;
 	bool				has_cp_dma;
 	bool				has_streamout;
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 04f62d13ca7..f0112c784d4 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -148,7 +148,8 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			  const char *gpu_family, unsigned dump, LLVMTargetMachineRef tm)
+			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     LLVMTargetMachineRef tm)
 {
 
 	char cpu[CPU_STRING_LEN];
@@ -171,17 +172,15 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 		}
 		strncpy(cpu, gpu_family, CPU_STRING_LEN);
 		memset(fs, 0, sizeof(fs));
-		if (dump) {
+		if (dump_asm)
 			strncpy(fs, "+DumpCode", FS_STRING_LEN);
-		}
 		tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
 				  LLVMCodeGenLevelDefault, LLVMRelocDefault,
 						  LLVMCodeModelDefault);
 		dispose_tm = true;
 	}
-	if (dump) {
+	if (dump_ir)
 		LLVMDumpModule(M);
-	}
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index 3ccef78e36d..e20aed94c6b 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -29,6 +29,7 @@
 
 #include <llvm-c/Core.h>
 #include <llvm-c/TargetMachine.h>
+#include <stdbool.h>
 
 struct radeon_shader_binary;
 
@@ -36,11 +37,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
-unsigned  radeon_llvm_compile(
-	LLVMModuleRef M,
-	struct radeon_shader_binary *binary,
-	const char * gpu_family,
-	unsigned dump,
-	LLVMTargetMachineRef tm);
+unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
+			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     LLVMTargetMachineRef tm);
 
 #endif /* RADEON_LLVM_EMIT_H */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 465eecd0346..61bf1b801ee 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3836,14 +3836,15 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod)
 {
 	int r = 0;
-	bool dump = r600_can_dump_shader(&sscreen->b,
-			shader->selector ? shader->selector->tokens : NULL);
-	r = radeon_llvm_compile(mod, &shader->binary,
-		r600_get_llvm_processor_name(sscreen->b.family), dump, tm);
+	bool dump_asm = r600_can_dump_shader(&sscreen->b,
+				shader->selector ? shader->selector->tokens : NULL);
+	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 
-	if (r) {
+	r = radeon_llvm_compile(mod, &shader->binary,
+		r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
+	if (r)
 		return r;
-	}
+
 	r = si_shader_binary_read(sscreen, shader);
 
 	FREE(shader->binary.config);

From 2dcbd427da74c8f2b6f46e789924a7ced67be260 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 16:15:48 +0200
Subject: [PATCH 0836/1208] radeonsi: add a debug flag that disables printing
 TGSI in shader dumps

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 1 +
 src/gallium/drivers/radeon/r600_pipe_common.h | 1 +
 src/gallium/drivers/radeonsi/si_shader.c      | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index c1dbdc740a0..d0162b514dc 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -336,6 +336,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "tcs", DBG_TCS, "Print tessellation control shaders" },
 	{ "tes", DBG_TES, "Print tessellation evaluation shaders" },
 	{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
+	{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
 
 	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c2b83859b52..72f4715975e 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -82,6 +82,7 @@
 #define DBG_TCS			(1 << 10)
 #define DBG_TES			(1 << 11)
 #define DBG_NO_IR		(1 << 12)
+#define DBG_NO_TGSI		(1 << 13)
 /* Bits 21-31 are reserved for the r600g driver. */
 /* features */
 #define DBG_NO_ASYNC_DMA	(1llu << 32)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 61bf1b801ee..dec68b3a8f0 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4008,7 +4008,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
 	 * conversion fails. */
-	if (dump) {
+	if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
 		si_dump_key(sel->type, &shader->key);
 		tgsi_dump(tokens, 0);
 		si_dump_streamout(&sel->so);

From 3063c5e3d3fefdc5eed7600882bd08f56bf86db8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 16:15:48 +0200
Subject: [PATCH 0837/1208] radeonsi: add a debug flag that disables printing
 ISA in shader dumps

---
 src/gallium/drivers/radeon/r600_pipe_common.c |  1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeonsi/si_shader.c      | 20 ++++++++++---------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index d0162b514dc..75e820144e3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -337,6 +337,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "tes", DBG_TES, "Print tessellation evaluation shaders" },
 	{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
 	{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
+	{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
 
 	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 72f4715975e..e2a60c59c82 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -83,6 +83,7 @@
 #define DBG_TES			(1 << 11)
 #define DBG_NO_IR		(1 << 12)
 #define DBG_NO_TGSI		(1 << 13)
+#define DBG_NO_ASM		(1 << 14)
 /* Bits 21-31 are reserved for the r600g driver. */
 /* features */
 #define DBG_NO_ASYNC_DMA	(1llu << 32)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index dec68b3a8f0..4151e011d58 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3811,15 +3811,17 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 	si_shader_binary_upload(sscreen, shader);
 
 	if (dump) {
-		if (binary->disasm_string) {
-			fprintf(stderr, "\nShader Disassembly:\n\n");
-			fprintf(stderr, "%s\n", binary->disasm_string);
-		} else {
-			fprintf(stderr, "SI CODE:\n");
-			for (i = 0; i < binary->code_size; i+=4 ) {
-				fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-				binary->code[i + 2], binary->code[i + 1],
-				binary->code[i]);
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
+			if (binary->disasm_string) {
+				fprintf(stderr, "\nShader Disassembly:\n\n");
+				fprintf(stderr, "%s\n", binary->disasm_string);
+			} else {
+				fprintf(stderr, "SI CODE:\n");
+				for (i = 0; i < binary->code_size; i+=4 ) {
+					fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
+					binary->code[i + 2], binary->code[i + 1],
+					binary->code[i]);
+				}
 			}
 		}
 

From 2d3ae154ba36546485468b9552e6da905b42aaa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 01:25:07 +0200
Subject: [PATCH 0838/1208] radeonsi: move CP DMA functions to their own file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/Makefile.sources |   1 +
 src/gallium/drivers/radeonsi/si_cp_dma.c      | 265 ++++++++++++++++++
 src/gallium/drivers/radeonsi/si_descriptors.c | 233 ---------------
 src/gallium/drivers/radeonsi/si_pipe.c        |   1 +
 src/gallium/drivers/radeonsi/si_pipe.h        |   7 +
 src/gallium/drivers/radeonsi/si_state.h       |   3 -
 6 files changed, 274 insertions(+), 236 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/si_cp_dma.c

diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 2876c0ae735..a0b1414f4bb 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -3,6 +3,7 @@ C_SOURCES := \
 	si_blit.c \
 	si_commands.c \
 	si_compute.c \
+	si_cp_dma.c \
 	si_descriptors.c \
 	sid.h \
 	si_dma.c \
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
new file mode 100644
index 00000000000..f8a9da45a10
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+#include "radeon/r600_cs.h"
+
+
+/* Set this if you want the 3D engine to wait until CP DMA is done.
+ * It should be set on the last CP DMA packet. */
+#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
+
+/* Set this if the source data was used as a destination in a previous CP DMA
+ * packet. It's for preventing a read-after-write (RAW) hazard between two
+ * CP DMA packets. */
+#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
+#define CIK_CP_DMA_USE_L2	(1 << 2)
+
+/* Emit a CP DMA packet to do a copy from one buffer to another.
+ * The size must fit in bits [20:0].
+ */
+static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
+				       uint64_t dst_va, uint64_t src_va,
+				       unsigned size, unsigned flags)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
+			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;
+
+	assert(size);
+	assert((size & ((1<<21)-1)) == size);
+
+	if (sctx->b.chip_class >= CIK) {
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
+		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
+		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
+		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
+		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	} else {
+		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
+		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
+		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
+		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	}
+}
+
+/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
+static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
+					uint64_t dst_va, unsigned size,
+					uint32_t clear_value, unsigned flags)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;
+
+	assert(size);
+	assert((size & ((1<<21)-1)) == size);
+
+	if (sctx->b.chip_class >= CIK) {
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+		radeon_emit(cs, clear_value);		/* DATA [31:0] */
+		radeon_emit(cs, 0);
+		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
+		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	} else {
+		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+		radeon_emit(cs, clear_value);		/* DATA [31:0] */
+		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
+		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	}
+}
+
+/* The max number of bytes to copy per packet. */
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+
+static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
+			    unsigned offset, unsigned size, unsigned value,
+			    bool is_framebuffer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	unsigned flush_flags, tc_l2_flag;
+
+	if (!size)
+		return;
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
+		       offset + size);
+
+	/* Fallback for unaligned clears. */
+	if (offset % 4 != 0 || size % 4 != 0) {
+		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						       sctx->b.rings.gfx.cs,
+						       PIPE_TRANSFER_WRITE);
+		size /= 4;
+		for (unsigned i = 0; i < size; i++)
+			*map++ = value;
+		return;
+	}
+
+	uint64_t va = r600_resource(dst)->gpu_address + offset;
+
+	/* Flush the caches where the resource is bound. */
+	if (is_framebuffer) {
+		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+		tc_l2_flag = 0;
+	} else {
+		flush_flags = SI_CONTEXT_INV_TC_L1 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
+			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
+
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			 flush_flags;
+
+	while (size) {
+		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+		unsigned dma_flags = tc_l2_flag;
+
+		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
+				 FALSE);
+
+		/* This must be done after need_cs_space. */
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
+				      RADEON_PRIO_MIN);
+
+		/* Flush the caches for the first copy only.
+		 * Also wait for the previous CP DMA operations. */
+		if (sctx->b.flags) {
+			si_emit_cache_flush(&sctx->b, NULL);
+			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
+		}
+
+		/* Do the synchronization after the last copy, so that all data is written to memory. */
+		if (size == byte_count)
+			dma_flags |= R600_CP_DMA_SYNC;
+
+		/* Emit the clear packet. */
+		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
+
+		size -= byte_count;
+		va += byte_count;
+	}
+
+	/* Flush the caches again in case the 3D engine has been prefetching
+	 * the resource. */
+	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
+}
+
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
+		    bool is_framebuffer)
+{
+	unsigned flush_flags, tc_l2_flag;
+
+	if (!size)
+		return;
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
+		       dst_offset + size);
+
+	dst_offset += r600_resource(dst)->gpu_address;
+	src_offset += r600_resource(src)->gpu_address;
+
+	/* Flush the caches where the resource is bound. */
+	if (is_framebuffer) {
+		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+		tc_l2_flag = 0;
+	} else {
+		flush_flags = SI_CONTEXT_INV_TC_L1 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
+			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
+
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			 flush_flags;
+
+	while (size) {
+		unsigned sync_flags = tc_l2_flag;
+		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+
+		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
+
+		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
+		if (sctx->b.flags) {
+			si_emit_cache_flush(&sctx->b, NULL);
+			sync_flags |= SI_CP_DMA_RAW_WAIT;
+		}
+
+		/* Do the synchronization after the last copy, so that all data is written to memory. */
+		if (size == byte_count) {
+			sync_flags |= R600_CP_DMA_SYNC;
+		}
+
+		/* This must be done after r600_need_cs_space. */
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
+				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
+				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+
+		size -= byte_count;
+		src_offset += byte_count;
+		dst_offset += byte_count;
+	}
+
+	/* Flush the caches again in case the 3D engine has been prefetching
+	 * the resource. */
+	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
+}
+
+void si_init_cp_dma_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_clear_buffer;
+}
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 48ec9b72043..fcf4dbfd989 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -72,81 +72,6 @@ static uint32_t null_descriptor[8] = {
 	 * descriptor */
 };
 
-/* Set this if you want the 3D engine to wait until CP DMA is done.
- * It should be set on the last CP DMA packet. */
-#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
-
-/* Set this if the source data was used as a destination in a previous CP DMA
- * packet. It's for preventing a read-after-write (RAW) hazard between two
- * CP DMA packets. */
-#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
-#define CIK_CP_DMA_USE_L2	(1 << 2)
-
-/* Emit a CP DMA packet to do a copy from one buffer to another.
- * The size must fit in bits [20:0].
- */
-static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
-				       uint64_t dst_va, uint64_t src_va,
-				       unsigned size, unsigned flags)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
-	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
-			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;
-
-	assert(size);
-	assert((size & ((1<<21)-1)) == size);
-
-	if (sctx->b.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
-		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
-		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
-		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
-		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
-		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	}
-}
-
-/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
-static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
-					uint64_t dst_va, unsigned size,
-					uint32_t clear_value, unsigned flags)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
-	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;
-
-	assert(size);
-	assert((size & ((1<<21)-1)) == size);
-
-	if (sctx->b.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, 0);
-		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	}
-}
-
 static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
@@ -931,163 +856,6 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	}
 }
 
-/* CP DMA */
-
-/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
-
-static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
-			    unsigned offset, unsigned size, unsigned value,
-			    bool is_framebuffer)
-{
-	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
-
-	if (!size)
-		return;
-
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
-		       offset + size);
-
-	/* Fallback for unaligned clears. */
-	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
-		return;
-	}
-
-	uint64_t va = r600_resource(dst)->gpu_address + offset;
-
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
-
-	while (size) {
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-		unsigned dma_flags = tc_l2_flag;
-
-		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
-				 FALSE);
-
-		/* This must be done after need_cs_space. */
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
-				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_MIN);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(&sctx->b, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
-
-		/* Emit the clear packet. */
-		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
-
-		size -= byte_count;
-		va += byte_count;
-	}
-
-	/* Flush the caches again in case the 3D engine has been prefetching
-	 * the resource. */
-	sctx->b.flags |= flush_flags;
-
-	if (tc_l2_flag)
-		r600_resource(dst)->TC_L2_dirty = true;
-}
-
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-		    bool is_framebuffer)
-{
-	unsigned flush_flags, tc_l2_flag;
-
-	if (!size)
-		return;
-
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
-		       dst_offset + size);
-
-	dst_offset += r600_resource(dst)->gpu_address;
-	src_offset += r600_resource(src)->gpu_address;
-
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
-
-	while (size) {
-		unsigned sync_flags = tc_l2_flag;
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-
-		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
-
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(&sctx->b, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
-
-		/* This must be done after r600_need_cs_space. */
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
-
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
-
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
-	}
-
-	/* Flush the caches again in case the 3D engine has been prefetching
-	 * the resource. */
-	sctx->b.flags |= flush_flags;
-
-	if (tc_l2_flag)
-		r600_resource(dst)->TC_L2_dirty = true;
-}
-
 /* SHADER USER DATA */
 
 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
@@ -1248,7 +1016,6 @@ void si_init_all_descriptors(struct si_context *sctx)
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
-	sctx->b.clear_buffer = si_clear_buffer;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
 
 	/* Shader user data. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 82a6033f694..44735575c1d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -104,6 +104,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
+	si_init_cp_dma_functions(sctx);
 
 	if (sscreen->b.info.has_uvd) {
 		sctx->b.b.create_video_codec = si_uvd_create_decoder;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 28cb4e990ae..a249d317b07 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -268,6 +268,13 @@ void si_resource_copy_region(struct pipe_context *ctx,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
 
+/* si_cp_dma.c */
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
+		    bool is_framebuffer);
+void si_init_cp_dma_functions(struct si_context *sctx);
+
 /* si_dma.c */
 void si_dma_copy(struct pipe_context *ctx,
 		 struct pipe_resource *dst,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index e6bacdf22fb..b8f63c5dd36 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -244,9 +244,6 @@ void si_init_all_descriptors(struct si_context *sctx);
 bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
 void si_shader_change_notify(struct si_context *sctx);

From 3ca21320583a4c0ba9bee755935df5e1f1637fdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 26 Jul 2015 21:08:18 +0200
Subject: [PATCH 0839/1208] radeonsi: fix broken st/nine from merging
 tessellation

st/nine uses GENERIC slots greater than 60.
---
 src/gallium/drivers/radeonsi/si_shader.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4151e011d58..92382e85985 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -136,8 +136,13 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 		assert(index <= 1);
 		return 2 + index;
 	case TGSI_SEMANTIC_GENERIC:
-		assert(index <= 63-4);
-		return 4 + index;
+		if (index <= 63-4)
+			return 4 + index;
+		else
+			/* same explanation as in the default statement,
+			 * the only user hitting this is st/nine.
+			 */
+			return 0;
 
 	/* patch indices are completely separate and thus start from 0 */
 	case TGSI_SEMANTIC_TESSOUTER:

From 190a40580fdfccf00db93f5c8f15bbf16914be2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 27 Jul 2015 19:01:21 +0200
Subject: [PATCH 0840/1208] radeonsi: fix a regression since the
 resource_copy_region cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Broken since:
    46b2b3b - radeonsi: don't change pipe_resource in resource_copy_region

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91444

Reviewed-and-Tested-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 86e1624f3d3..d0269f0fe86 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2511,7 +2511,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 			  S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ?
 						      util_logbase2(texture->nr_samples) :
 						      last_level) |
-			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, 0, false)) |
+			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) |
 			  S_008F1C_POW2_PAD(texture->last_level > 0) |
 			  S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples)));
 	view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));

From f9c4953f99e75e45bc4f0f07315ee643b62b0c23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 28 Jul 2015 11:39:35 +0200
Subject: [PATCH 0841/1208] radeonsi: early exit in si_clear if there's nothing
 to do
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index c3591a7e85a..c8926238940 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -342,6 +342,8 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		evergreen_do_fast_color_clear(&sctx->b, fb, &sctx->framebuffer.atom,
 					      &buffers, color);
+		if (!buffers)
+			return; /* all buffers have been fast cleared */
 	}
 
 	if (buffers & PIPE_CLEAR_COLOR) {

From 64d3130994bde98b0be44a5c54511e376b6d994e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 28 Jul 2015 11:39:35 +0200
Subject: [PATCH 0842/1208] r600g: early exit in r600_clear if there's nothing
 to do
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/r600/r600_blit.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 8e553a8e3b9..1c59230c78e 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -401,6 +401,8 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	    rctx->framebuffer.nr_samples > 1) {
 		evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom,
 					      &buffers, color);
+		if (!buffers)
+			return; /* all buffers have been fast cleared */
 	}
 
 	if (buffers & PIPE_CLEAR_COLOR) {

From 30509788641a413742098f21a4ee0087b1f86e18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Jul 2015 17:38:44 +0200
Subject: [PATCH 0843/1208] radeonsi: copy *8_SNORM bits exactly in
 resource_copy_region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Disabling the FP16 mode didn't help.

If needed, we can use this trick for blits too, but not for scaled blits.

+ 4 piglits

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index c8926238940..61ca2a82195 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -522,7 +522,9 @@ void si_resource_copy_region(struct pipe_context *ctx,
 		src_box = &sbox;
 
 		src_force_level = src_level;
-	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
+	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src) ||
+		   /* also *8_SNORM has precision issues, use UNORM instead */
+		   util_format_is_snorm(src->format)) {
 		if (util_format_is_subsampled_422(src->format)) {
 			src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
 			dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;

From b15aba940a3b6fc7c9bebc692968e7e9b72b9f29 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Tue, 21 Jul 2015 11:43:42 -0400
Subject: [PATCH 0844/1208] glx: Fix image size computation for
 EXT_texture_integer (v2)

Without this this extension basically can't work in indirect contexts,
TexImage2D will compute the image size as 0 and we'll send no image data
to the server.

v2: Add EXT_texture_integer to the client extension list too (Ian)

Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/glx/compsize.c      | 10 ++++++++++
 src/glx/glxextensions.c |  1 +
 2 files changed, 11 insertions(+)

diff --git a/src/glx/compsize.c b/src/glx/compsize.c
index 99c7763c7f9..805591914c5 100644
--- a/src/glx/compsize.c
+++ b/src/glx/compsize.c
@@ -65,6 +65,8 @@ __glElementsPerGroup(GLenum format, GLenum type)
    switch (format) {
    case GL_RGB:
    case GL_BGR:
+   case GL_RGB_INTEGER_EXT:
+   case GL_BGR_INTEGER_EXT:
       return 3;
    case GL_RG:
    case GL_422_EXT:
@@ -74,10 +76,13 @@ __glElementsPerGroup(GLenum format, GLenum type)
    case GL_DEPTH_STENCIL_NV:
    case GL_YCBCR_422_APPLE:
    case GL_LUMINANCE_ALPHA:
+   case GL_LUMINANCE_ALPHA_INTEGER_EXT:
       return 2;
    case GL_RGBA:
    case GL_BGRA:
    case GL_ABGR_EXT:
+   case GL_RGBA_INTEGER_EXT:
+   case GL_BGRA_INTEGER_EXT:
       return 4;
    case GL_COLOR_INDEX:
    case GL_STENCIL_INDEX:
@@ -88,6 +93,11 @@ __glElementsPerGroup(GLenum format, GLenum type)
    case GL_ALPHA:
    case GL_LUMINANCE:
    case GL_INTENSITY:
+   case GL_RED_INTEGER_EXT:
+   case GL_GREEN_INTEGER_EXT:
+   case GL_BLUE_INTEGER_EXT:
+   case GL_ALPHA_INTEGER_EXT:
+   case GL_LUMINANCE_INTEGER_EXT:
       return 1;
    default:
       return 0;
diff --git a/src/glx/glxextensions.c b/src/glx/glxextensions.c
index cb8cd665a88..3b29aef1234 100644
--- a/src/glx/glxextensions.c
+++ b/src/glx/glxextensions.c
@@ -241,6 +241,7 @@ static const struct extension_info known_gl_extensions[] = {
    { GL(EXT_texture_env_combine),        VER(1,3), Y, N, N, N },
    { GL(EXT_texture_env_dot3),           VER(0,0), Y, N, N, N },
    { GL(EXT_texture_filter_anisotropic), VER(0,0), Y, N, N, N },
+   { GL(EXT_texture_integer),            VER(0,0), Y, N, N, N },
    { GL(EXT_texture_lod),                VER(1,2), Y, N, N, N },
    { GL(EXT_texture_lod_bias),           VER(1,4), Y, N, N, N },
    { GL(EXT_texture_mirror_clamp),       VER(0,0), Y, N, N, N },

From 616355160d3ee6edff3429b1abef82f0706dad3d Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 29 Jul 2015 10:47:51 -0700
Subject: [PATCH 0845/1208] glsl: Initialize parse-state in constructor of
 lower_subroutine.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Static analysis tools don't like partial object initializations.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/lower_subroutine.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
index e45ccfea6d9..b29912ad150 100644
--- a/src/glsl/lower_subroutine.cpp
+++ b/src/glsl/lower_subroutine.cpp
@@ -37,7 +37,8 @@ namespace {
 
 class lower_subroutine_visitor : public ir_hierarchical_visitor {
 public:
-   lower_subroutine_visitor()
+   lower_subroutine_visitor(struct _mesa_glsl_parse_state *state)
+      : state(state)
    {
       this->progress = false;
    }
@@ -52,8 +53,7 @@ public:
 bool
 lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state)
 {
-   lower_subroutine_visitor v;
-   v.state = state;
+   lower_subroutine_visitor v(state);
    visit_list_elements(&v, instructions);
    return v.progress;
 }

From bafdafa7b2e6649791188b5acf235ba166ceae50 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Fri, 31 Jul 2015 13:36:21 -0400
Subject: [PATCH 0846/1208] glx: Fix missing bit decl for EXT_texture_integer

Missing from:

    commit b15aba940a3b6fc7c9bebc692968e7e9b72b9f29
    Author: Adam Jackson <ajax@redhat.com>
    Date:   Tue Jul 21 11:43:42 2015 -0400

	glx: Fix image size computation for EXT_texture_integer (v2)

Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/glx/glxextensions.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glx/glxextensions.h b/src/glx/glxextensions.h
index 37e4ccc8303..90b173fc915 100644
--- a/src/glx/glxextensions.h
+++ b/src/glx/glxextensions.h
@@ -146,6 +146,7 @@ enum
    GL_EXT_texture_env_combine_bit,
    GL_EXT_texture_env_dot3_bit,
    GL_EXT_texture_filter_anisotropic_bit,
+   GL_EXT_texture_integer_bit,
    GL_EXT_texture_lod_bit,
    GL_EXT_texture_lod_bias_bit,
    GL_EXT_texture_mirror_clamp_bit,

From 8477dd7c2e4416838c54da75a769109b4c5cc48e Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Wed, 29 Jul 2015 20:17:36 -0700
Subject: [PATCH 0847/1208] gallivm: Fix GCC unused-variable warning.

lp_bld_tgsi_soa.c: In function 'lp_emit_immediate_soa':
lp_bld_tgsi_soa.c:3065:18: warning: unused variable 'size' [-Wunused-variable]
       const uint size = imm->Immediate.NrTokens - 1;
                  ^

Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 55f606f9c59..fae604e2f9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -3062,8 +3062,7 @@ void lp_emit_immediate_soa(
    } else {
       /* simply copy the immediate values into the next immediates[] slot */
       unsigned i;
-      const uint size = imm->Immediate.NrTokens - 1;
-      assert(size <= 4);
+      assert(imm->Immediate.NrTokens - 1 <= 4);
       assert(bld->num_immediates < LP_MAX_INLINED_IMMEDIATES);
 
       for(i = 0; i < 4; ++i )

From a40179f47ba11e78097ae1a839df6f3911a6749f Mon Sep 17 00:00:00 2001
From: EdB <edb+mesa@sigluy.net>
Date: Fri, 31 Jul 2015 19:14:45 +0200
Subject: [PATCH 0848/1208] clover: make dispatch matches functions def

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../state_trackers/clover/api/dispatch.hpp    | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/gallium/state_trackers/clover/api/dispatch.hpp b/src/gallium/state_trackers/clover/api/dispatch.hpp
index ffae1ae6e12..7f622822ef9 100644
--- a/src/gallium/state_trackers/clover/api/dispatch.hpp
+++ b/src/gallium/state_trackers/clover/api/dispatch.hpp
@@ -693,7 +693,13 @@ struct _cl_icd_dispatch {
    CL_API_ENTRY cl_int (CL_API_CALL *clUnloadPlatformCompiler)(
       cl_platform_id platform);
 
-   void *clGetKernelArgInfo;
+   CL_API_ENTRY cl_int (CL_API_CALL *clGetKernelArgInfo)(
+      cl_kernel kernel,
+      cl_uint arg_indx,
+      cl_kernel_arg_info  param_name,
+      size_t param_value_size,
+      void *param_value,
+      size_t *param_value_size_ret);
 
    CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueFillBuffer)(
       cl_command_queue command_queue,
@@ -701,7 +707,7 @@ struct _cl_icd_dispatch {
       const void *pattern,
       size_t pattern_size,
       size_t offset,
-      size_t cb,
+      size_t size,
       cl_uint num_events_in_wait_list,
       const cl_event *event_wait_list,
       cl_event *event);
@@ -710,13 +716,20 @@ struct _cl_icd_dispatch {
       cl_command_queue command_queue,
       cl_mem image,
       const void *fill_color,
-      const size_t origin[3],
-      const size_t region[3],
+      const size_t *origin,
+      const size_t *region,
       cl_uint num_events_in_wait_list,
       const cl_event *event_wait_list,
       cl_event *event);
 
-   void *clEnqueueMigrateMemObjects;
+   CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMigrateMemObjects)(
+      cl_command_queue command_queue,
+      cl_uint num_mem_objects,
+      const cl_mem *mem_objects,
+      cl_mem_migration_flags flags,
+      cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list,
+      cl_event *event);
 
    CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMarkerWithWaitList)(
       cl_command_queue command_queue,

From 5d29eaef85c15663cde317c2df58ea81637c53f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 1 Aug 2015 00:51:00 +0200
Subject: [PATCH 0849/1208] Revert "gallium/radeon: re-enable unsafe math for
 graphics shaders"

This reverts commit 8559f6ce62a9d5b52fa8189ba2352cd48bdabccf.

It causes hangs in DOTA 2 Reborn.
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index f0112c784d4..00025590137 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -82,10 +82,6 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 	sprintf(Str, "%1d", llvm_type);
 
 	LLVMAddTargetDependentFunctionAttr(F, "ShaderType", Str);
-
-	if (type != TGSI_PROCESSOR_COMPUTE) {
-		LLVMAddTargetDependentFunctionAttr(F, "unsafe-fp-math", "true");
-	}
 }
 
 static void init_r600_target()

From b0a929960384ffebf3b4f693fa0db4231ed897d4 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 00:16:21 +0100
Subject: [PATCH 0850/1208] configure.ac: null,android,gdi are not valid
 egl-platforms

... and update the documentation to reflect reality.
null and gdi are gone, and surfaceless is a recent addition.

v2: s/platforms/platform/ (spotted by Thomas)

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
---
 configure.ac  | 3 ---
 docs/egl.html | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index 480018ac536..6636ee4769f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1763,9 +1763,6 @@ for plat in $egl_platforms; do
 			AC_MSG_ERROR([EGL platform surfaceless requires libdrm >= $LIBDRM_REQUIRED])
 		;;
 
-	android|gdi|null)
-		;;
-
 	*)
 		AC_MSG_ERROR([EGL platform '$plat' does not exist])
 		;;
diff --git a/docs/egl.html b/docs/egl.html
index 3ab1a6018fd..bc21c6c4894 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -88,10 +88,10 @@ types such as <code>EGLNativeDisplayType</code> or
 <code>EGLNativeWindowType</code> defined for.</p>
 
 <p>The available platforms are <code>x11</code>, <code>drm</code>,
-<code>wayland</code>, <code>null</code>, <code>android</code>,
-<code>haiku</code>, and <code>gdi</code>.  The <code>android</code> platform
+<code>wayland</code>, <code>surfaceless</code>, <code>android</code>,
+and <code>haiku</code>.  The <code>android</code> platform
 can only be built as a system component, part of AOSP, while the
-<code>haiku</code> and <code>gdi</code> platforms can only be built with SCons.
+<code>haiku</code> platform can only be built with SCons.
 Unless for special needs, the build system should
 select the right platforms automatically.</p>
 

From 57c670a823e55f5dd1fb2eb3d15e7db0a4f5c07a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 11:01:35 +0100
Subject: [PATCH 0851/1208] egl: consolidate ifdef HAVE_LIBDRM blocks

Move the code around rather than having it scattered. No functional
change.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
---
 src/egl/drivers/dri2/egl_dri2.c | 210 ++++++++++++++++----------------
 1 file changed, 102 insertions(+), 108 deletions(-)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index e3afab4fee8..4f6db58a322 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -1448,53 +1448,6 @@ dri2_create_image_khr_renderbuffer(_EGLDisplay *disp, _EGLContext *ctx,
    return dri2_create_image_from_dri(disp, dri_image);
 }
 
-#ifdef HAVE_LIBDRM
-static _EGLImage *
-dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
-				  EGLClientBuffer buffer, const EGLint *attr_list)
-{
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   EGLint format, name, pitch, err;
-   _EGLImageAttribs attrs;
-   __DRIimage *dri_image;
-
-   name = (EGLint) (uintptr_t) buffer;
-
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS)
-      return NULL;
-
-   if (attrs.Width <= 0 || attrs.Height <= 0 ||
-       attrs.DRMBufferStrideMESA <= 0) {
-      _eglError(EGL_BAD_PARAMETER,
-		"bad width, height or stride");
-      return NULL;
-   }
-
-   switch (attrs.DRMBufferFormatMESA) {
-   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
-      format = __DRI_IMAGE_FORMAT_ARGB8888;
-      pitch = attrs.DRMBufferStrideMESA;
-      break;
-   default:
-      _eglError(EGL_BAD_PARAMETER,
-		"dri2_create_image_khr: unsupported pixmap depth");
-      return NULL;
-   }
-
-   dri_image =
-      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
-					   attrs.Width,
-					   attrs.Height,
-					   format,
-					   name,
-					   pitch,
-					   NULL);
-
-   return dri2_create_image_from_dri(disp, dri_image);
-}
-#endif
-
 #ifdef HAVE_WAYLAND_PLATFORM
 
 /* This structure describes how a wl_buffer maps to one or more
@@ -1691,6 +1644,51 @@ dri2_create_wayland_buffer_from_image(_EGLDriver *drv, _EGLDisplay *dpy,
 }
 
 #ifdef HAVE_LIBDRM
+static _EGLImage *
+dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
+				  EGLClientBuffer buffer, const EGLint *attr_list)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   EGLint format, name, pitch, err;
+   _EGLImageAttribs attrs;
+   __DRIimage *dri_image;
+
+   name = (EGLint) (uintptr_t) buffer;
+
+   err = _eglParseImageAttribList(&attrs, disp, attr_list);
+   if (err != EGL_SUCCESS)
+      return NULL;
+
+   if (attrs.Width <= 0 || attrs.Height <= 0 ||
+       attrs.DRMBufferStrideMESA <= 0) {
+      _eglError(EGL_BAD_PARAMETER,
+		"bad width, height or stride");
+      return NULL;
+   }
+
+   switch (attrs.DRMBufferFormatMESA) {
+   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
+      format = __DRI_IMAGE_FORMAT_ARGB8888;
+      pitch = attrs.DRMBufferStrideMESA;
+      break;
+   default:
+      _eglError(EGL_BAD_PARAMETER,
+		"dri2_create_image_khr: unsupported pixmap depth");
+      return NULL;
+   }
+
+   dri_image =
+      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
+					   attrs.Width,
+					   attrs.Height,
+					   format,
+					   name,
+					   pitch,
+					   NULL);
+
+   return dri2_create_image_from_dri(disp, dri_image);
+}
+
 static EGLBoolean
 dri2_check_dma_buf_attribs(const _EGLImageAttribs *attrs)
 {
@@ -1923,67 +1921,6 @@ dri2_create_image_dma_buf(_EGLDisplay *disp, _EGLContext *ctx,
 
    return res;
 }
-#endif
-
-_EGLImage *
-dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
-		      _EGLContext *ctx, EGLenum target,
-		      EGLClientBuffer buffer, const EGLint *attr_list)
-{
-   (void) drv;
-
-   switch (target) {
-   case EGL_GL_TEXTURE_2D_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
-      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
-   case EGL_GL_TEXTURE_3D_KHR:
-      if (disp->Extensions.KHR_gl_texture_3D_image) {
-         return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
-      }
-      else {
-         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
-         return EGL_NO_IMAGE_KHR;
-      }
-   case EGL_GL_RENDERBUFFER_KHR:
-      return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
-#ifdef HAVE_LIBDRM
-   case EGL_DRM_BUFFER_MESA:
-      return dri2_create_image_mesa_drm_buffer(disp, ctx, buffer, attr_list);
-#endif
-#ifdef HAVE_WAYLAND_PLATFORM
-   case EGL_WAYLAND_BUFFER_WL:
-      return dri2_create_image_wayland_wl_buffer(disp, ctx, buffer, attr_list);
-#endif
-#ifdef HAVE_LIBDRM
-   case EGL_LINUX_DMA_BUF_EXT:
-      return dri2_create_image_dma_buf(disp, ctx, buffer, attr_list);
-#endif
-   default:
-      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
-      return EGL_NO_IMAGE_KHR;
-   }
-}
-
-static EGLBoolean
-dri2_destroy_image_khr(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *image)
-{
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   struct dri2_egl_image *dri2_img = dri2_egl_image(image);
-
-   (void) drv;
-
-   dri2_dpy->image->destroyImage(dri2_img->dri_image);
-   free(dri2_img);
-
-   return EGL_TRUE;
-}
-
-#ifdef HAVE_LIBDRM
 static _EGLImage *
 dri2_create_drm_image_mesa(_EGLDriver *drv, _EGLDisplay *disp,
 			   const EGLint *attr_list)
@@ -2143,8 +2080,65 @@ dri2_export_dma_buf_image_mesa(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *im
 
    return EGL_TRUE;
 }
+
 #endif
 
+_EGLImage *
+dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
+		      _EGLContext *ctx, EGLenum target,
+		      EGLClientBuffer buffer, const EGLint *attr_list)
+{
+   (void) drv;
+
+   switch (target) {
+   case EGL_GL_TEXTURE_2D_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
+      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+   case EGL_GL_TEXTURE_3D_KHR:
+      if (disp->Extensions.KHR_gl_texture_3D_image) {
+         return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+      }
+      else {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
+   case EGL_GL_RENDERBUFFER_KHR:
+      return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
+#ifdef HAVE_LIBDRM
+   case EGL_DRM_BUFFER_MESA:
+      return dri2_create_image_mesa_drm_buffer(disp, ctx, buffer, attr_list);
+   case EGL_LINUX_DMA_BUF_EXT:
+      return dri2_create_image_dma_buf(disp, ctx, buffer, attr_list);
+#endif
+#ifdef HAVE_WAYLAND_PLATFORM
+   case EGL_WAYLAND_BUFFER_WL:
+      return dri2_create_image_wayland_wl_buffer(disp, ctx, buffer, attr_list);
+#endif
+   default:
+      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+      return EGL_NO_IMAGE_KHR;
+   }
+}
+
+static EGLBoolean
+dri2_destroy_image_khr(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *image)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   struct dri2_egl_image *dri2_img = dri2_egl_image(image);
+
+   (void) drv;
+
+   dri2_dpy->image->destroyImage(dri2_img->dri_image);
+   free(dri2_img);
+
+   return EGL_TRUE;
+}
+
 #ifdef HAVE_WAYLAND_PLATFORM
 
 static void

From fa109d02dda118f756903b663879375c06353ae7 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 11:01:55 +0100
Subject: [PATCH 0852/1208] egl/wayland: libdrm is a hard requirement, treat it
 as such

Prompt at configure time if it's missing otherwise we'll fail later on
in the build. Remove ambiguous HAVE_LIBDRM guard.

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac                    | 3 +++
 src/egl/drivers/dri2/egl_dri2.c | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index 6636ee4769f..bfe7f80fe6f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1740,6 +1740,9 @@ egl_platforms=`IFS=', '; echo $with_egl_platforms`
 for plat in $egl_platforms; do
 	case "$plat" in
 	wayland)
+		test "x$have_libdrm" != xyes &&
+			AC_MSG_ERROR([EGL platform wayland requires libdrm >= $LIBDRM_REQUIRED])
+
 		PKG_CHECK_MODULES([WAYLAND], [wayland-client >= $WAYLAND_REQUIRED wayland-server >= $WAYLAND_REQUIRED])
 
 		if test "x$WAYLAND_SCANNER" = x; then
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 4f6db58a322..47f52a5e6d8 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -2216,13 +2216,11 @@ dri2_bind_wayland_display_wl(_EGLDriver *drv, _EGLDisplay *disp,
    wl_drm_callbacks.authenticate =
       (int(*)(void *, uint32_t)) dri2_dpy->vtbl->authenticate;
 
-#ifdef HAVE_LIBDRM
    if (drmGetCap(dri2_dpy->fd, DRM_CAP_PRIME, &cap) == 0 &&
        cap == (DRM_PRIME_CAP_IMPORT | DRM_PRIME_CAP_EXPORT) &&
        dri2_dpy->image->base.version >= 7 &&
        dri2_dpy->image->createImageFromFds != NULL)
       flags |= WAYLAND_DRM_PRIME;
-#endif
 
    dri2_dpy->wl_server_drm =
 	   wayland_drm_init(wl_dpy, dri2_dpy->device_name,

From 720125ff99a8563d1f5991bd7428b8d884f1f618 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 11:22:13 +0100
Subject: [PATCH 0853/1208] egl: remove ifdef $(egl_extension) compile guards

All of these are already defined in the headers provided.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/main/eglapi.c       | 36 +-----------------------------------
 src/egl/main/eglapi.h       | 24 ------------------------
 src/egl/main/eglfallbacks.c |  6 ------
 3 files changed, 1 insertion(+), 65 deletions(-)

diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 96bd8856c25..323634e4511 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -1013,8 +1013,6 @@ eglSwapBuffers(EGLDisplay dpy, EGLSurface surface)
 }
 
 
-#ifdef EGL_EXT_swap_buffers_with_damage
-
 static EGLBoolean EGLAPIENTRY
 eglSwapBuffersWithDamageEXT(EGLDisplay dpy, EGLSurface surface,
                             EGLint *rects, EGLint n_rects)
@@ -1040,8 +1038,6 @@ eglSwapBuffersWithDamageEXT(EGLDisplay dpy, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif /* EGL_EXT_swap_buffers_with_damage */
-
 EGLBoolean EGLAPIENTRY
 eglCopyBuffers(EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target)
 {
@@ -1202,8 +1198,6 @@ eglGetError(void)
 }
 
 
-#ifdef EGL_MESA_drm_display
-
 static EGLDisplay EGLAPIENTRY
 eglGetDRMDisplayMESA(int fd)
 {
@@ -1211,8 +1205,6 @@ eglGetDRMDisplayMESA(int fd)
    return _eglGetDisplayHandle(dpy);
 }
 
-#endif /* EGL_MESA_drm_display */
-
 /**
  ** EGL 1.2
  **/
@@ -1578,8 +1570,6 @@ eglGetSyncAttribKHR(EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLint *valu
 }
 
 
-#ifdef EGL_NOK_swap_region
-
 static EGLBoolean EGLAPIENTRY
 eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
 			EGLint numRects, const EGLint *rects)
@@ -1605,10 +1595,6 @@ eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif /* EGL_NOK_swap_region */
-
-
-#ifdef EGL_MESA_drm_image
 
 static EGLImage EGLAPIENTRY
 eglCreateDRMImageMESA(EGLDisplay dpy, const EGLint *attr_list)
@@ -1648,9 +1634,7 @@ eglExportDRMImageMESA(EGLDisplay dpy, EGLImage image,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
 struct wl_display;
 
 static EGLBoolean EGLAPIENTRY
@@ -1707,9 +1691,8 @@ eglQueryWaylandBufferWL(EGLDisplay dpy, struct wl_resource *buffer,
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
+
 static struct wl_buffer * EGLAPIENTRY
 eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImage image)
 {
@@ -1730,7 +1713,6 @@ eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImage image)
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
 static EGLBoolean EGLAPIENTRY
 eglPostSubBufferNV(EGLDisplay dpy, EGLSurface surface,
@@ -1773,7 +1755,6 @@ eglGetSyncValuesCHROMIUM(EGLDisplay display, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#ifdef EGL_MESA_image_dma_buf_export
 static EGLBoolean EGLAPIENTRY
 eglExportDMABUFImageQueryMESA(EGLDisplay dpy, EGLImage image,
                               EGLint *fourcc, EGLint *nplanes,
@@ -1815,7 +1796,6 @@ eglExportDMABUFImageMESA(EGLDisplay dpy, EGLImage image,
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
 __eglMustCastToProperFunctionPointerType EGLAPIENTRY
 eglGetProcAddress(const char *procname)
@@ -1872,9 +1852,7 @@ eglGetProcAddress(const char *procname)
       { "eglGetPlatformDisplay", (_EGLProc) eglGetPlatformDisplay },
       { "eglCreatePlatformWindowSurface", (_EGLProc) eglCreatePlatformWindowSurface },
       { "eglCreatePlatformPixmapSurface", (_EGLProc) eglCreatePlatformPixmapSurface },
-#ifdef EGL_MESA_drm_display
       { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA },
-#endif
       { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR },
       { "eglDestroyImageKHR", (_EGLProc) eglDestroyImage },
       { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR },
@@ -1884,33 +1862,21 @@ eglGetProcAddress(const char *procname)
       { "eglWaitSyncKHR", (_EGLProc) eglWaitSyncKHR },
       { "eglSignalSyncKHR", (_EGLProc) eglSignalSyncKHR },
       { "eglGetSyncAttribKHR", (_EGLProc) eglGetSyncAttribKHR },
-#ifdef EGL_NOK_swap_region
       { "eglSwapBuffersRegionNOK", (_EGLProc) eglSwapBuffersRegionNOK },
-#endif
-#ifdef EGL_MESA_drm_image
       { "eglCreateDRMImageMESA", (_EGLProc) eglCreateDRMImageMESA },
       { "eglExportDRMImageMESA", (_EGLProc) eglExportDRMImageMESA },
-#endif
-#ifdef EGL_WL_bind_wayland_display
       { "eglBindWaylandDisplayWL", (_EGLProc) eglBindWaylandDisplayWL },
       { "eglUnbindWaylandDisplayWL", (_EGLProc) eglUnbindWaylandDisplayWL },
       { "eglQueryWaylandBufferWL", (_EGLProc) eglQueryWaylandBufferWL },
-#endif
-#ifdef EGL_WL_create_wayland_buffer_from_image
       { "eglCreateWaylandBufferFromImageWL", (_EGLProc) eglCreateWaylandBufferFromImageWL },
-#endif
       { "eglPostSubBufferNV", (_EGLProc) eglPostSubBufferNV },
-#ifdef EGL_EXT_swap_buffers_with_damage
       { "eglSwapBuffersWithDamageEXT", (_EGLProc) eglSwapBuffersWithDamageEXT },
-#endif
       { "eglGetPlatformDisplayEXT", (_EGLProc) eglGetPlatformDisplayEXT },
       { "eglCreatePlatformWindowSurfaceEXT", (_EGLProc) eglCreatePlatformWindowSurfaceEXT },
       { "eglCreatePlatformPixmapSurfaceEXT", (_EGLProc) eglCreatePlatformPixmapSurfaceEXT },
       { "eglGetSyncValuesCHROMIUM", (_EGLProc) eglGetSyncValuesCHROMIUM },
-#ifdef EGL_MESA_image_dma_buf_export
       { "eglExportDMABUFImageQueryMESA", (_EGLProc) eglExportDMABUFImageQueryMESA },
       { "eglExportDMABUFImageMESA", (_EGLProc) eglExportDMABUFImageMESA },
-#endif
       { NULL, NULL }
    };
    EGLint i;
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index 4e0378d0d5f..6c54c7c410d 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -99,41 +99,29 @@ typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSyn
 typedef EGLBoolean (*GetSyncAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLAttrib *value);
 
 
-#ifdef EGL_NOK_swap_region
 typedef EGLBoolean (*SwapBuffersRegionNOK_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLint numRects, const EGLint *rects);
-#endif
 
-#ifdef EGL_MESA_drm_image
 typedef _EGLImage *(*CreateDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, const EGLint *attr_list);
 typedef EGLBoolean (*ExportDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *name, EGLint *handle, EGLint *stride);
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
 struct wl_display;
 typedef EGLBoolean (*BindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
 typedef EGLBoolean (*UnbindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
 typedef EGLBoolean (*QueryWaylandBufferWL_t)(_EGLDriver *drv, _EGLDisplay *displ, struct wl_resource *buffer, EGLint attribute, EGLint *value);
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
 typedef struct wl_buffer * (*CreateWaylandBufferFromImageWL_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img);
-#endif
 
 typedef EGLBoolean (*PostSubBufferNV_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surface, EGLint x, EGLint y, EGLint width, EGLint height);
 
 typedef EGLint (*QueryBufferAge_t)(_EGLDriver *drv,
                                    _EGLDisplay *dpy, _EGLSurface *surface);
 
-#ifdef EGL_EXT_swap_buffers_with_damage
 typedef EGLBoolean (*SwapBuffersWithDamageEXT_t) (_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, const EGLint *rects, EGLint n_rects);
-#endif
 
 typedef EGLBoolean (*GetSyncValuesCHROMIUM_t) (_EGLDisplay *dpy, _EGLSurface *surface, EGLuint64KHR *ust, EGLuint64KHR *msc, EGLuint64KHR *sbc);
 
-#ifdef EGL_MESA_image_dma_buf_export
 typedef EGLBoolean (*ExportDMABUFImageQueryMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fourcc, EGLint *nplanes, EGLuint64KHR *modifiers);
 typedef EGLBoolean (*ExportDMABUFImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fds, EGLint *strides, EGLint *offsets);
-#endif
 
 /**
  * The API dispatcher jumps through these functions
@@ -180,38 +168,26 @@ struct _egl_api
    SignalSyncKHR_t SignalSyncKHR;
    GetSyncAttrib_t GetSyncAttrib;
 
-#ifdef EGL_NOK_swap_region
    SwapBuffersRegionNOK_t SwapBuffersRegionNOK;
-#endif
 
-#ifdef EGL_MESA_drm_image
    CreateDRMImageMESA_t CreateDRMImageMESA;
    ExportDRMImageMESA_t ExportDRMImageMESA;
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
    BindWaylandDisplayWL_t BindWaylandDisplayWL;
    UnbindWaylandDisplayWL_t UnbindWaylandDisplayWL;
    QueryWaylandBufferWL_t QueryWaylandBufferWL;
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
    CreateWaylandBufferFromImageWL_t CreateWaylandBufferFromImageWL;
-#endif
 
-#ifdef EGL_EXT_swap_buffers_with_damage
    SwapBuffersWithDamageEXT_t SwapBuffersWithDamageEXT;
-#endif /* EGL_EXT_swap_buffers_with_damage */
 
    PostSubBufferNV_t PostSubBufferNV;
 
    QueryBufferAge_t QueryBufferAge;
    GetSyncValuesCHROMIUM_t GetSyncValuesCHROMIUM;
 
-#ifdef EGL_MESA_image_dma_buf_export
    ExportDMABUFImageQueryMESA_t ExportDMABUFImageQueryMESA;
    ExportDMABUFImageMESA_t ExportDMABUFImageMESA;
-#endif
 };
 
 
diff --git a/src/egl/main/eglfallbacks.c b/src/egl/main/eglfallbacks.c
index 3c3701f4ae9..65daf8fd0f5 100644
--- a/src/egl/main/eglfallbacks.c
+++ b/src/egl/main/eglfallbacks.c
@@ -93,17 +93,11 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.SignalSyncKHR = NULL;
    drv->API.GetSyncAttrib = _eglGetSyncAttrib;
 
-#ifdef EGL_MESA_drm_image
    drv->API.CreateDRMImageMESA = NULL;
    drv->API.ExportDRMImageMESA = NULL;
-#endif
 
-#ifdef EGL_NOK_swap_region
    drv->API.SwapBuffersRegionNOK = NULL;
-#endif
 
-#ifdef EGL_MESA_image_dma_buf_export
    drv->API.ExportDMABUFImageQueryMESA = NULL;
    drv->API.ExportDMABUFImageMESA = NULL;
-#endif
 }

From 5567494403938940f61d44888c436a20a6635ef3 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 12:24:11 +0100
Subject: [PATCH 0854/1208] egl/wayland: use designated initializers

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
---
 src/egl/drivers/dri2/platform_wayland.c | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index f4cb1857a37..0feb0567254 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -65,7 +65,7 @@ sync_callback(void *data, struct wl_callback *callback, uint32_t serial)
 }
 
 static const struct wl_callback_listener sync_listener = {
-   sync_callback
+   .done = sync_callback
 };
 
 static int
@@ -104,8 +104,8 @@ wl_buffer_release(void *data, struct wl_buffer *buffer)
    dri2_surf->color_buffers[i].locked = 0;
 }
 
-static struct wl_buffer_listener wl_buffer_listener = {
-   wl_buffer_release
+static const struct wl_buffer_listener wl_buffer_listener = {
+   .release = wl_buffer_release
 };
 
 static void
@@ -601,7 +601,7 @@ wayland_throttle_callback(void *data,
 }
 
 static const struct wl_callback_listener throttle_listener = {
-   wayland_throttle_callback
+   .done = wayland_throttle_callback
 };
 
 static void
@@ -947,10 +947,10 @@ drm_handle_authenticated(void *data, struct wl_drm *drm)
 }
 
 static const struct wl_drm_listener drm_listener = {
-	drm_handle_device,
-	drm_handle_format,
-	drm_handle_authenticated,
-	drm_handle_capabilities
+   .device = drm_handle_device,
+   .format = drm_handle_format,
+   .authenticated = drm_handle_authenticated,
+   .capabilities = drm_handle_capabilities
 };
 
 static void
@@ -975,8 +975,8 @@ registry_handle_global_remove(void *data, struct wl_registry *registry,
 }
 
 static const struct wl_registry_listener registry_listener_drm = {
-   registry_handle_global_drm,
-   registry_handle_global_remove
+   .global = registry_handle_global_drm,
+   .global_remove = registry_handle_global_remove
 };
 
 static EGLBoolean
@@ -1732,7 +1732,7 @@ shm_handle_format(void *data, struct wl_shm *shm, uint32_t format)
 }
 
 static const struct wl_shm_listener shm_listener = {
-   shm_handle_format
+   .format = shm_handle_format
 };
 
 static void
@@ -1749,8 +1749,8 @@ registry_handle_global_swrast(void *data, struct wl_registry *registry, uint32_t
 }
 
 static const struct wl_registry_listener registry_listener_swrast = {
-   registry_handle_global_swrast,
-   registry_handle_global_remove
+   .global = registry_handle_global_swrast,
+   .global_remove = registry_handle_global_remove
 };
 
 static struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {

From 175d9752796bbcc52f1df90b1466c879bccfc406 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 12:27:06 +0100
Subject: [PATCH 0855/1208] egl/wayland: use drmGetNodeTypeFromFd helper
 instead of opencoding it

Cc: Axel Davy <axel.davy@ens.fr>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Boyan Ding <boyan.j.ding@gmail.com>
---
 configure.ac                            |  2 +-
 src/egl/drivers/dri2/platform_wayland.c | 20 ++------------------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/configure.ac b/configure.ac
index bfe7f80fe6f..1976d56769e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,7 +66,7 @@ OSMESA_VERSION=8
 AC_SUBST([OSMESA_VERSION])
 
 dnl Versions for external dependencies
-LIBDRM_REQUIRED=2.4.38
+LIBDRM_REQUIRED=2.4.60
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_INTEL_REQUIRED=2.4.61
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 0feb0567254..dabaf1ebbd1 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -845,22 +845,6 @@ bad_format:
    return NULL;
 }
 
-static char
-is_fd_render_node(int fd)
-{
-   struct stat render;
-
-   if (fstat(fd, &render))
-      return 0;
-
-   if (!S_ISCHR(render.st_mode))
-      return 0;
-
-   if (render.st_rdev & 0x80)
-      return 1;
-   return 0;
-}
-
 static int
 dri2_wl_authenticate(_EGLDisplay *disp, uint32_t id)
 {
@@ -904,7 +888,7 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device)
       return;
    }
 
-   if (is_fd_render_node(dri2_dpy->fd)) {
+   if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) {
       dri2_dpy->authenticated = 1;
    } else {
       drmGetMagic(dri2_dpy->fd, &magic);
@@ -1114,7 +1098,7 @@ dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
     * will return a render-node when the requested gpu is different
     * to the server, but also if the client asks for the same gpu than
     * the server by requesting its pci-id */
-   dri2_dpy->is_render_node = is_fd_render_node(dri2_dpy->fd);
+   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
 
    dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
    if (dri2_dpy->driver_name == NULL) {

From eb3e2562a4bf728082818b46dcae1ab88340786e Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 10 Jul 2015 12:28:23 +0100
Subject: [PATCH 0856/1208] configure.ac: check for mkostemp()

We can make use of it over mkstemp + fcntl in the egl/wayland code.

Cc: Axel Davy <axel.davy@ens.fr>
Suggested-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure.ac b/configure.ac
index 1976d56769e..2fc5a2592d0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -654,6 +654,7 @@ fi
 AC_CHECK_HEADER([xlocale.h], [DEFINES="$DEFINES -DHAVE_XLOCALE_H"])
 AC_CHECK_HEADER([sys/sysctl.h], [DEFINES="$DEFINES -DHAVE_SYS_SYSCTL_H"])
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
+AC_CHECK_FUNC([mkostemp], [DEFINES="$DEFINES -DHAVE_MKOSTEMP"])
 
 dnl Check to see if dlopen is in default libraries (like Solaris, which
 dnl has it in libc), or if libdl is needed to get it.

From 1307be519b8785249ee863a22115930299ff642a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 15:44:32 +0100
Subject: [PATCH 0857/1208] winsys/radeon: don't leak the fd when it is 0

Earlier commit added an extra dup(fd) to fix a ZaphodHeads issue.
Although it did not consider the (very unlikely) case where we might end
up with the valid fd == 0.

Fixes: 28dda47ae4d(winsys/radeon: Use dup fd as key in drm-winsys hash
table to fix ZaphodHeads.)

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Mario Kleiner <mario.kleiner.de@gmail.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 41f8826c39d..f7784fb795e 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -493,7 +493,7 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
         radeon_surface_manager_free(ws->surf_man);
     }
 
-    if (ws->fd)
+    if (ws->fd >= 0)
         close(ws->fd);
 
     FREE(rws);
@@ -786,7 +786,7 @@ fail:
         ws->kman->destroy(ws->kman);
     if (ws->surf_man)
         radeon_surface_manager_free(ws->surf_man);
-    if (ws->fd)
+    if (ws->fd >= 0)
         close(ws->fd);
 
     FREE(ws);

From 6f2d88927a77f902157704d16b70b1265e0ca357 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 18:13:50 +0100
Subject: [PATCH 0858/1208] docs: rename/bump 10.7.0 release notes to 11.0.0

Recently a few drivers have grown OpenGL 4+ support so we might as
well go all the way to... 11 ;-)

v2: Don't forget to update the version file (Ilia)

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 VERSION                                    |  2 +-
 docs/relnotes/{10.7.0.html => 11.0.0.html} | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename docs/relnotes/{10.7.0.html => 11.0.0.html} (85%)

diff --git a/VERSION b/VERSION
index 1edd8fc00e5..2b1181ddc3f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-10.7.0-devel
+11.0.0-devel
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/11.0.0.html
similarity index 85%
rename from docs/relnotes/10.7.0.html
rename to docs/relnotes/11.0.0.html
index 92ae20fe964..1b633315c15 100644
--- a/docs/relnotes/10.7.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -14,19 +14,19 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">
 
-<h1>Mesa 10.7.0 Release Notes / TBD</h1>
+<h1>Mesa 11.0.0 Release Notes / TBD</h1>
 
 <p>
-Mesa 10.7.0 is a new development release.
+Mesa 11.0.0 is a new development release.
 People who are concerned with stability and reliability should stick
-with a previous release or wait for Mesa 10.7.1.
+with a previous release or wait for Mesa 11.0.1.
 </p>
 <p>
-Mesa 10.7.0 implements the OpenGL 3.3 API, but the version reported by
+Mesa 11.0.0 implements the OpenGL 4.1 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
-3.3 is <strong>only</strong> available if requested at context creation
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
 because compatibility contexts are not supported.
 </p>
 

From 2b831334e95e80e1a53dcce2fab21b012d3384c7 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Thu, 30 Jul 2015 15:18:54 +0100
Subject: [PATCH 0859/1208] includes/GL: remove duplicated extension
 declarations from glx.h

All three of GLX_NV_float_buffer, GLX_EXT_texture_from_pixmap and
GLX_MESA_query_renderer have been in glxext.h for a while now.

As such we can drop this workaround/hack from the header.

v2: Remove the comment about GLX_NV_float_buffer.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com> (v1)
---
 include/GL/glx.h | 92 ------------------------------------------------
 1 file changed, 92 deletions(-)

diff --git a/include/GL/glx.h b/include/GL/glx.h
index 78f5052b23a..1e4bb7d7176 100644
--- a/include/GL/glx.h
+++ b/include/GL/glx.h
@@ -368,18 +368,6 @@ extern Bool glXDrawableAttribARB(Display *dpy, GLXDrawable draw, const int *attr
 #endif /* GLX_ARB_render_texture */
 
 
-/*
- * Remove this when glxext.h is updated.
- */
-#ifndef GLX_NV_float_buffer
-#define GLX_NV_float_buffer 1
-
-#define GLX_FLOAT_COMPONENTS_NV         0x20B0
-
-#endif /* GLX_NV_float_buffer */
-
-
-
 /*
  * #?. GLX_MESA_swap_frame_usage
  */
@@ -415,86 +403,6 @@ typedef int (*PFNGLXGETSWAPINTERVALMESAPROC)(void);
 #endif /* GLX_MESA_swap_control */
 
 
-
-/*
- * #?. GLX_EXT_texture_from_pixmap
- * XXX not finished?
- */
-#ifndef GLX_EXT_texture_from_pixmap
-#define GLX_EXT_texture_from_pixmap 1
-
-#define GLX_BIND_TO_TEXTURE_RGB_EXT        0x20D0
-#define GLX_BIND_TO_TEXTURE_RGBA_EXT       0x20D1
-#define GLX_BIND_TO_MIPMAP_TEXTURE_EXT     0x20D2
-#define GLX_BIND_TO_TEXTURE_TARGETS_EXT    0x20D3
-#define GLX_Y_INVERTED_EXT                 0x20D4
-
-#define GLX_TEXTURE_FORMAT_EXT             0x20D5
-#define GLX_TEXTURE_TARGET_EXT             0x20D6
-#define GLX_MIPMAP_TEXTURE_EXT             0x20D7
-
-#define GLX_TEXTURE_FORMAT_NONE_EXT        0x20D8
-#define GLX_TEXTURE_FORMAT_RGB_EXT         0x20D9
-#define GLX_TEXTURE_FORMAT_RGBA_EXT        0x20DA
-
-#define GLX_TEXTURE_1D_BIT_EXT             0x00000001
-#define GLX_TEXTURE_2D_BIT_EXT             0x00000002
-#define GLX_TEXTURE_RECTANGLE_BIT_EXT      0x00000004
-
-#define GLX_TEXTURE_1D_EXT                 0x20DB
-#define GLX_TEXTURE_2D_EXT                 0x20DC
-#define GLX_TEXTURE_RECTANGLE_EXT          0x20DD
-
-#define GLX_FRONT_LEFT_EXT                 0x20DE
-#define GLX_FRONT_RIGHT_EXT                0x20DF
-#define GLX_BACK_LEFT_EXT                  0x20E0
-#define GLX_BACK_RIGHT_EXT                 0x20E1
-#define GLX_FRONT_EXT                      GLX_FRONT_LEFT_EXT
-#define GLX_BACK_EXT                       GLX_BACK_LEFT_EXT
-#define GLX_AUX0_EXT                       0x20E2
-#define GLX_AUX1_EXT                       0x20E3 
-#define GLX_AUX2_EXT                       0x20E4 
-#define GLX_AUX3_EXT                       0x20E5 
-#define GLX_AUX4_EXT                       0x20E6 
-#define GLX_AUX5_EXT                       0x20E7 
-#define GLX_AUX6_EXT                       0x20E8
-#define GLX_AUX7_EXT                       0x20E9 
-#define GLX_AUX8_EXT                       0x20EA 
-#define GLX_AUX9_EXT                       0x20EB
-
-extern void glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer, const int *attrib_list);
-extern void glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer);
-
-#endif /* GLX_EXT_texture_from_pixmap */
-
-
-#ifndef GLX_MESA_query_renderer
-#define GLX_MESA_query_renderer 1
-
-#define GLX_RENDERER_VENDOR_ID_MESA                      0x8183
-#define GLX_RENDERER_DEVICE_ID_MESA                      0x8184
-#define GLX_RENDERER_VERSION_MESA                        0x8185
-#define GLX_RENDERER_ACCELERATED_MESA                    0x8186
-#define GLX_RENDERER_VIDEO_MEMORY_MESA                   0x8187
-#define GLX_RENDERER_UNIFIED_MEMORY_ARCHITECTURE_MESA    0x8188
-#define GLX_RENDERER_PREFERRED_PROFILE_MESA              0x8189
-#define GLX_RENDERER_OPENGL_CORE_PROFILE_VERSION_MESA    0x818A
-#define GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA    0x818B
-#define GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA      0x818C
-#define GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA     0x818D
-#define GLX_RENDERER_ID_MESA                             0x818E
-
-Bool glXQueryRendererIntegerMESA(Display *dpy, int screen, int renderer, int attribute, unsigned int *value);
-Bool glXQueryCurrentRendererIntegerMESA(int attribute, unsigned int *value);
-const char *glXQueryRendererStringMESA(Display *dpy, int screen, int renderer, int attribute);
-const char *glXQueryCurrentRendererStringMESA(int attribute);
-
-typedef Bool (*PFNGLXQUERYRENDERERINTEGERMESAPROC) (Display *dpy, int screen, int renderer, int attribute, unsigned int *value);
-typedef Bool (*PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC) (int attribute, unsigned int *value);
-typedef const char *(*PFNGLXQUERYRENDERERSTRINGMESAPROC) (Display *dpy, int screen, int renderer, int attribute);
-typedef const char *(*PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC) (int attribute);
-#endif /* GLX_MESA_query_renderer */
-
 /*** Should these go here, or in another header? */
 /*
 ** GLX Events

From 4d7e0fa8c731776ad5d630f37b36c535f1907371 Mon Sep 17 00:00:00 2001
From: Igor Gnatenko <i.gnatenko.brain@gmail.com>
Date: Tue, 7 Jul 2015 13:05:04 +0300
Subject: [PATCH 0860/1208] opencl: use versioned .so in mesa.icd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We must have versioned library in mesa.icd, because ICD loader would
fail if the mesa-devel package wasn't installed.

Cc: "10.6" <mesa-stable@lists.freedesktop.org>
Reported-by: Fabian Deutsch <fabian.deutsch@gmx.de>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73512
Signed-off-by: Igor Gnatenko <i.gnatenko.brain@gmail.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Michel Dänzer <michel.daenzer@amd.com>
---
 configure.ac                           | 3 +++
 src/gallium/targets/opencl/Makefile.am | 2 +-
 src/gallium/targets/opencl/mesa.icd    | 1 -
 src/gallium/targets/opencl/mesa.icd.in | 1 +
 4 files changed, 5 insertions(+), 2 deletions(-)
 delete mode 100644 src/gallium/targets/opencl/mesa.icd
 create mode 100644 src/gallium/targets/opencl/mesa.icd.in

diff --git a/configure.ac b/configure.ac
index 2fc5a2592d0..248f6183172 100644
--- a/configure.ac
+++ b/configure.ac
@@ -64,6 +64,8 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 dnl Set internal versions
 OSMESA_VERSION=8
 AC_SUBST([OSMESA_VERSION])
+OPENCL_VERSION=1
+AC_SUBST([OPENCL_VERSION])
 
 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.60
@@ -2336,6 +2338,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/targets/libgl-xlib/Makefile
 		src/gallium/targets/omx/Makefile
 		src/gallium/targets/opencl/Makefile
+		src/gallium/targets/opencl/mesa.icd
 		src/gallium/targets/osmesa/Makefile
 		src/gallium/targets/osmesa/osmesa.pc
 		src/gallium/targets/pipe-loader/Makefile
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 441b438d339..4ab706ef2ac 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -5,7 +5,7 @@ lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la
 lib@OPENCL_LIBNAME@_la_LDFLAGS = \
 	$(LLVM_LDFLAGS) \
 	-no-undefined \
-	-version-number 1:0 \
+	-version-number @OPENCL_VERSION@:0 \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
diff --git a/src/gallium/targets/opencl/mesa.icd b/src/gallium/targets/opencl/mesa.icd
deleted file mode 100644
index 6a6a8706d7c..00000000000
--- a/src/gallium/targets/opencl/mesa.icd
+++ /dev/null
@@ -1 +0,0 @@
-libMesaOpenCL.so
diff --git a/src/gallium/targets/opencl/mesa.icd.in b/src/gallium/targets/opencl/mesa.icd.in
new file mode 100644
index 00000000000..1b77b4e4929
--- /dev/null
+++ b/src/gallium/targets/opencl/mesa.icd.in
@@ -0,0 +1 @@
+lib@OPENCL_LIBNAME@.so.@OPENCL_VERSION@

From 08fd736a45c98bd0acd96dfc1a61e6a695d2703c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 19:55:42 +0200
Subject: [PATCH 0861/1208] radeonsi: flush if the memory usage for an IB is
 too high
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Picked from the amdgpu branch.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 13 +++++++++++++
 src/gallium/drivers/radeonsi/si_state.c      |  4 ++++
 2 files changed, 17 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 1aef37b18b5..581b130caec 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -32,6 +32,19 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 {
 	int i;
 
+	/* There are two memory usage counters in the winsys for all buffers
+	 * that have been added (cs_add_reloc) and two counters in the pipe
+	 * driver for those that haven't been added yet.
+	 * */
+	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
+		ctx->b.gtt = 0;
+		ctx->b.vram = 0;
+		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		return;
+	}
+	ctx->b.gtt = 0;
+	ctx->b.vram = 0;
+
 	/* The number of dwords we already used in the CS so far. */
 	num_dw += ctx->b.rings.gfx.cs->cdw;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d0269f0fe86..867dbc68633 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2059,6 +2059,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (rtex->fmask.size && rtex->cmask.size) {
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 		}
+		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 	/* Set the 16BPC export for possible dual-src blending. */
 	if (i == 1 && surf && surf->export_16bpc) {
@@ -2073,6 +2074,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
 		}
+		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 
 	si_update_fb_rs_state(sctx);
@@ -2815,6 +2817,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
 			pipe_resource_reference(&dsti->buffer, src->buffer);
 			dsti->buffer_offset = src->buffer_offset;
 			dsti->stride = src->stride;
+			r600_context_add_resource_size(ctx, src->buffer);
 		}
 	} else {
 		for (i = 0; i < count; i++) {
@@ -2832,6 +2835,7 @@ static void si_set_index_buffer(struct pipe_context *ctx,
 	if (ib) {
 		pipe_resource_reference(&sctx->index_buffer.buffer, ib->buffer);
 	        memcpy(&sctx->index_buffer, ib, sizeof(*ib));
+		r600_context_add_resource_size(ctx, ib->buffer);
 	} else {
 		pipe_resource_reference(&sctx->index_buffer.buffer, NULL);
 	}

From 828d20bdb79c4b6e6cb761017ad030bd875f9ac2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 15:17:30 +0200
Subject: [PATCH 0862/1208] r600g: fix the single-sample fast clear setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No effect, but this is what we should be doing.

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_state.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 7065af94e7c..5c8fd0d2d35 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1028,7 +1028,10 @@ void evergreen_init_color_surface(struct r600_context *rctx,
 	macro_aspect = rtex->surface.mtilea;
 	bankw = rtex->surface.bankw;
 	bankh = rtex->surface.bankh;
-	fmask_bankh = rtex->fmask.bank_height;
+	if (rtex->fmask.size)
+		fmask_bankh = rtex->fmask.bank_height;
+	else
+		fmask_bankh = rtex->surface.bankh;
 	tile_split = eg_tile_split(tile_split);
 	macro_aspect = eg_macro_tile_aspect(macro_aspect);
 	bankw = eg_bank_wh(bankw);
@@ -1149,10 +1152,11 @@ void evergreen_init_color_surface(struct r600_context *rctx,
 	surf->cb_color_attrib = color_attrib;
 	if (rtex->fmask.size) {
 		surf->cb_color_fmask = (base_offset + rtex->fmask.offset) >> 8;
+		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
 	} else {
 		surf->cb_color_fmask = surf->cb_color_base;
+		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(slice);
 	}
-	surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
 
 	surf->color_initialized = true;
 }

From d4ad4c20617f45f71152e292ee39f020ef352bfd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 15:18:36 +0200
Subject: [PATCH 0863/1208] r600g: fix the CB_SHADER_MASK setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes the single-sample fast clear hang.

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_state.c | 8 ++++----
 src/gallium/drivers/r600/r600_shader.c     | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 5c8fd0d2d35..13ecc46959f 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1736,10 +1736,10 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_
 
 	r600_write_context_reg_seq(cs, R_028238_CB_TARGET_MASK, 2);
 	radeon_emit(cs, a->blend_colormask & fb_colormask); /* R_028238_CB_TARGET_MASK */
-	/* Always enable the first colorbuffer in CB_SHADER_MASK. This
-	 * will assure that the alpha-test will work even if there is
-	 * no colorbuffer bound. */
-	radeon_emit(cs, 0xf | (a->dual_src_blend ? ps_colormask : 0) | fb_colormask); /* R_02823C_CB_SHADER_MASK */
+	/* This must match the used export instructions exactly.
+	 * Other values may lead to undefined behavior and hangs.
+	 */
+	radeon_emit(cs, ps_colormask); /* R_02823C_CB_SHADER_MASK */
 }
 
 static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 09f50f5d810..8d1f95abddc 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2490,6 +2490,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			output[j].array_base = 0;
 			output[j].op = CF_OP_EXPORT;
 			j++;
+			shader->nr_ps_color_exports++;
 		}
 
 		noutput = j;

From de59a40f6898e20a61ac4ea0e5995334f6ed2932 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 15:19:19 +0200
Subject: [PATCH 0864/1208] r600g: re-enable single-sample fast clear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed by the CB_SHADER_MASK fix.

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_blit.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 1c59230c78e..8664e036338 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -393,12 +393,7 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer.state;
 
-	/* Single-sample fast color clear is broken on r600g:
-	 *   https://bugs.freedesktop.org/show_bug.cgi?id=73528
-	 *   https://bugs.freedesktop.org/show_bug.cgi?id=82186
-	 */
-	if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN &&
-	    rctx->framebuffer.nr_samples > 1) {
+	if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) {
 		evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom,
 					      &buffers, color);
 		if (!buffers)

From 2e0179e2b3b9ea369816597f789a5bda7e0c46b5 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:49 +0200
Subject: [PATCH 0865/1208] mesa/es3.1: Allow binding GL_DRAW_INDIRECT_BUFFER
 with gles 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/bufferobj.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 9425b095e81..6ddcc5cad4e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -91,8 +91,9 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
    case GL_COPY_WRITE_BUFFER:
       return &ctx->CopyWriteBuffer;
    case GL_DRAW_INDIRECT_BUFFER:
-      if (ctx->API == API_OPENGL_CORE &&
-          ctx->Extensions.ARB_draw_indirect) {
+      if ((ctx->API == API_OPENGL_CORE &&
+           ctx->Extensions.ARB_draw_indirect) ||
+           _mesa_is_gles31(ctx)) {
          return &ctx->DrawIndirectBuffer;
       }
       break;

From 4f8e4a95dbd806bc735bf93dda245be2bb2ea454 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:50 +0200
Subject: [PATCH 0866/1208] mesa/es3.1: Allow GL_SAMPLE_MASK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 should be allowed to enable GL_SAMPLE_MASK.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/enable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index d0583edc019..adba03576c1 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -1001,7 +1001,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
       /* ARB_texture_multisample */
       case GL_SAMPLE_MASK:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_texture_multisample, cap);
          if (ctx->Multisample.SampleMask == state)

From a4bde371c7172fd775dea4377f9bccc3a38992c0 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:51 +0200
Subject: [PATCH 0867/1208] mesa/es3.1: Allow GL_DEPTH_STENCIL_TEXTURE_MODE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 must support the parameter GL_DEPTH_STENCIL_TEXTURE_MODE.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/texparam.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 88c2c142804..c0611c3e489 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -500,7 +500,9 @@ set_tex_parameteri(struct gl_context *ctx,
       goto invalid_pname;
 
    case GL_DEPTH_STENCIL_TEXTURE_MODE:
-      if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_stencil_texturing) {
+      if ((_mesa_is_desktop_gl(ctx) &&
+           ctx->Extensions.ARB_stencil_texturing) ||
+          _mesa_is_gles31(ctx)) {
          bool stencil = params[0] == GL_STENCIL_INDEX;
          if (!stencil && params[0] != GL_DEPTH_COMPONENT)
             goto invalid_param;

From d74645d3acc815f6129b4cb20e6570c127d5ab2b Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:52 +0200
Subject: [PATCH 0868/1208] mesa/es3.1: Allow textures with target
 GL_TEXTURE_2D_MULTISAMPLE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 should be able to bind a texture with the target
GL_TEXTURE_2D_MULTISAMPLE.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/teximage.c | 2 +-
 src/mesa/main/texobj.c   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 949eef0e285..a719c1c5d4c 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1008,7 +1008,7 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target)
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      return _mesa_is_desktop_gl(ctx)
+      return (_mesa_is_desktop_gl(ctx) || _mesa_is_gles31(ctx))
          && ctx->Extensions.ARB_texture_multisample
          ? 1 : 0;
    case GL_TEXTURE_EXTERNAL_OES:
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index 094c3a61a71..cd7cfd6a4fb 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -1606,8 +1606,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
          ? TEXTURE_CUBE_ARRAY_INDEX : -1;
    case GL_TEXTURE_2D_MULTISAMPLE:
-      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
-         ? TEXTURE_2D_MULTISAMPLE_INDEX: -1;
+      return ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) ||
+              _mesa_is_gles31(ctx)) ? TEXTURE_2D_MULTISAMPLE_INDEX: -1;
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
          ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: -1;

From 0fe81a25f7102d78dbe8f7e89d2b024b1741da1c Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:53 +0200
Subject: [PATCH 0869/1208] mesa/es3.1: Allow enable of GL_SAMPLE_MASK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 must be able to enable GL_SAMPLE_MASK.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/enable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index adba03576c1..42f67990784 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -1603,7 +1603,7 @@ _mesa_IsEnabled( GLenum cap )
 
       /* ARB_texture_multisample */
       case GL_SAMPLE_MASK:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_texture_multisample);
          return ctx->Multisample.SampleMask;

From 2253a296c9ad7b11f9844640024c5f0784e4e528 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:55 +0200
Subject: [PATCH 0870/1208] mesa/es3.1: Allow query of GL_TEXTURE_MULTISAMPLE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 must allow a query for GL_TEXTURE_MULTISAMPLE.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/formatquery.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index fe96d5b29fb..85f7b6b5664 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -74,7 +74,9 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       /* These enums are only valid if ARB_texture_multisample is supported */
-      if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample)
+      if ((_mesa_is_desktop_gl(ctx) &&
+           ctx->Extensions.ARB_texture_multisample) ||
+          _mesa_is_gles31(ctx))
          break;
 
    default:

From 704e764f06e8e6ec75484e28271e502bbc4cf06a Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 11 May 2015 15:03:56 +0200
Subject: [PATCH 0871/1208] mesa/es3.1: Allow multisampled textures for GLES
 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 must be allowed to create multisampled textures.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/teximage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index a719c1c5d4c..841ec364020 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -5589,8 +5589,8 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    GLenum sample_count_error;
    bool dsa = strstr(func, "ture") ? true : false;
 
-   if (!(ctx->Extensions.ARB_texture_multisample
-      && _mesa_is_desktop_gl(ctx))) {
+   if (!((ctx->Extensions.ARB_texture_multisample
+         && _mesa_is_desktop_gl(ctx))) && !_mesa_is_gles31(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported)", func);
       return;
    }

From cf5667108b2cdd6f37e1a561c18fb5c757258f06 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 2 Aug 2015 11:40:26 +1000
Subject: [PATCH 0872/1208] mesa: fix type for array indexing validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

parse_program_resource_name returns -1 when the index is invalid this needs to
be tested before assigning the value to the unsigned array_index.

In link_varyings.cpp (the other place parse_program_resource_name is used) after
the -1 check is done the value is just assigned to an unsigned variable so it
seems long is just used so we can return the -1 rather than actually expecting
index values to be ridiculously large.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/shader_query.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index b49fd38d64d..a85e4c42ae3 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -506,7 +506,7 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
 static bool
 valid_array_index(const GLchar *name, unsigned *array_index)
 {
-   unsigned idx = 0;
+   long idx = 0;
    const GLchar *out_base_name_end;
 
    idx = parse_program_resource_name(name, &out_base_name_end);

From d2cd2c69b20fcb3f1fc3b7671745c5c84ef200cb Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Mon, 27 Jul 2015 11:21:07 +0200
Subject: [PATCH 0873/1208] clover: move find_kernels to functions

---
 .../state_trackers/clover/llvm/invocation.cpp | 28 ++++++++++---------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 967284d8094..924cb36f993 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -269,17 +269,19 @@ namespace {
 #endif
    }
 
-   void
-   find_kernels(llvm::Module *mod, std::vector<llvm::Function *> &kernels) {
+   std::vector<llvm::Function *>
+   find_kernels(const llvm::Module *mod) {
       const llvm::NamedMDNode *kernel_node =
                                  mod->getNamedMetadata("opencl.kernels");
       // This means there are no kernels in the program.  The spec does not
       // require that we return an error here, but there will be an error if
       // the user tries to pass this program to a clCreateKernel() call.
       if (!kernel_node) {
-         return;
+         return std::vector<llvm::Function *>();
       }
 
+      std::vector<llvm::Function *> kernels;
+      kernels.reserve(kernel_node->getNumOperands());
       for (unsigned i = 0; i < kernel_node->getNumOperands(); ++i) {
 #if HAVE_LLVM >= 0x0306
          kernels.push_back(llvm::mdconst::dyn_extract<llvm::Function>(
@@ -288,11 +290,11 @@ namespace {
 #endif
                                     kernel_node->getOperand(i)->getOperand(0)));
       }
+      return kernels;
    }
 
    void
-   optimize(llvm::Module *mod, unsigned optimization_level,
-            const std::vector<llvm::Function *> &kernels) {
+   optimize(llvm::Module *mod, unsigned optimization_level) {
 
 #if HAVE_LLVM >= 0x0307
       llvm::legacy::PassManager PM;
@@ -300,6 +302,8 @@ namespace {
       llvm::PassManager PM;
 #endif
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
+
       // Add a function internalizer pass.
       //
       // By default, the function internalizer pass will look for a function
@@ -435,7 +439,6 @@ namespace {
 
    module
    build_module_llvm(llvm::Module *mod,
-                     const std::vector<llvm::Function *> &kernels,
                      clang::LangAS::Map& address_spaces) {
 
       module m;
@@ -447,6 +450,7 @@ namespace {
       llvm::WriteBitcodeToFile(mod, bitcode_ostream);
       bitcode_ostream.flush();
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
       for (unsigned i = 0; i < kernels.size(); ++i) {
          std::string kernel_name = kernels[i]->getName();
          std::vector<module::argument> args =
@@ -610,10 +614,11 @@ namespace {
    module
    build_module_native(std::vector<char> &code,
                        const llvm::Module *mod,
-                       const std::vector<llvm::Function *> &kernels,
                        const clang::LangAS::Map &address_spaces,
                        std::string &r_log) {
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
+
       std::map<std::string, unsigned> kernel_offsets =
             get_kernel_offsets(code, kernels, r_log);
 
@@ -697,7 +702,6 @@ clover::compile_program_llvm(const std::string &source,
 
    init_targets();
 
-   std::vector<llvm::Function *> kernels;
    size_t processor_str_len = std::string(target).find_first_of("-");
    std::string processor(target, 0, processor_str_len);
    std::string triple(target, processor_str_len + 1,
@@ -717,9 +721,7 @@ clover::compile_program_llvm(const std::string &source,
                                     triple, processor, opts, address_spaces,
                                     optimization_level, r_log);
 
-   find_kernels(mod, kernels);
-
-   optimize(mod, optimization_level, kernels);
+   optimize(mod, optimization_level);
 
    if (get_debug_flags() & DBG_LLVM) {
       std::string log;
@@ -738,13 +740,13 @@ clover::compile_program_llvm(const std::string &source,
          m = module();
          break;
       case PIPE_SHADER_IR_LLVM:
-         m = build_module_llvm(mod, kernels, address_spaces);
+         m = build_module_llvm(mod, address_spaces);
          break;
       case PIPE_SHADER_IR_NATIVE: {
          std::vector<char> code = compile_native(mod, triple, processor,
                                                  get_debug_flags() & DBG_ASM,
                                                  r_log);
-         m = build_module_native(code, mod, kernels, address_spaces, r_log);
+         m = build_module_native(code, mod, address_spaces, r_log);
          break;
       }
    }

From 9ef5b7a23348291893a6bf61fcce7a306e787add Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Mon, 27 Jul 2015 11:34:07 +0200
Subject: [PATCH 0874/1208] clover: pass image attributes to the kernel

Read-only and write-only image arguments are recognized and
distinguished.
Attributes of the image arguments are passed to the kernel as implicit
arguments.
---
 .../state_trackers/clover/core/kernel.cpp     |  28 ++++
 .../state_trackers/clover/core/kernel.hpp     |  15 +-
 .../state_trackers/clover/core/module.hpp     |   4 +-
 .../state_trackers/clover/llvm/invocation.cpp | 135 +++++++++++++++++-
 4 files changed, 171 insertions(+), 11 deletions(-)

diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 0756f068553..955ff7b3955 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -182,6 +182,34 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
          }
          break;
       }
+      case module::argument::image_size: {
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         std::vector<cl_uint> image_size{
+               static_cast<cl_uint>(img->width()),
+               static_cast<cl_uint>(img->height()),
+               static_cast<cl_uint>(img->depth())};
+         for (auto x : image_size) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
+      case module::argument::image_format: {
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         cl_image_format fmt = img->format();
+         std::vector<cl_uint> image_format{
+               static_cast<cl_uint>(fmt.image_channel_data_type),
+               static_cast<cl_uint>(fmt.image_channel_order)};
+         for (auto x : image_format) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
       }
    }
 
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index d6432a4df8d..4ba6ff467b7 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -190,7 +190,16 @@ namespace clover {
          pipe_surface *st;
       };
 
-      class image_rd_argument : public argument {
+      class image_argument : public argument {
+      public:
+         const image *get() const {
+            return img;
+         }
+      protected:
+         image *img;
+      };
+
+      class image_rd_argument : public image_argument {
       public:
          virtual void set(size_t size, const void *value);
          virtual void bind(exec_context &ctx,
@@ -198,11 +207,10 @@ namespace clover {
          virtual void unbind(exec_context &ctx);
 
       private:
-         image *img;
          pipe_sampler_view *st;
       };
 
-      class image_wr_argument : public argument {
+      class image_wr_argument : public image_argument {
       public:
          virtual void set(size_t size, const void *value);
          virtual void bind(exec_context &ctx,
@@ -210,7 +218,6 @@ namespace clover {
          virtual void unbind(exec_context &ctx);
 
       private:
-         image *img;
          pipe_surface *st;
       };
 
diff --git a/src/gallium/state_trackers/clover/core/module.hpp b/src/gallium/state_trackers/clover/core/module.hpp
index 9d656885945..5db0548872c 100644
--- a/src/gallium/state_trackers/clover/core/module.hpp
+++ b/src/gallium/state_trackers/clover/core/module.hpp
@@ -72,7 +72,9 @@ namespace clover {
          enum semantic {
             general,
             grid_dimension,
-            grid_offset
+            grid_offset,
+            image_size,
+            image_format
          };
 
          argument(enum type type, size_t size,
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 924cb36f993..86859af3530 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -344,18 +344,91 @@ namespace {
       PM.run(*mod);
    }
 
+   // Kernel metadata
+
+   const llvm::MDNode *
+   get_kernel_metadata(const llvm::Function *kernel_func) {
+      auto mod = kernel_func->getParent();
+      auto kernels_node = mod->getNamedMetadata("opencl.kernels");
+      if (!kernels_node) {
+         return nullptr;
+      }
+
+      const llvm::MDNode *kernel_node = nullptr;
+      for (unsigned i = 0; i < kernels_node->getNumOperands(); ++i) {
+#if HAVE_LLVM >= 0x0306
+         auto func = llvm::mdconst::dyn_extract<llvm::Function>(
+#else
+         auto func = llvm::dyn_cast<llvm::Function>(
+#endif
+                                    kernels_node->getOperand(i)->getOperand(0));
+         if (func == kernel_func) {
+            kernel_node = kernels_node->getOperand(i);
+            break;
+         }
+      }
+
+      return kernel_node;
+   }
+
+   llvm::MDNode*
+   node_from_op_checked(const llvm::MDOperand &md_operand,
+                        llvm::StringRef expect_name,
+                        unsigned expect_num_args)
+   {
+      auto node = llvm::cast<llvm::MDNode>(md_operand);
+      assert(node->getNumOperands() == expect_num_args &&
+             "Wrong number of operands.");
+
+      auto str_node = llvm::cast<llvm::MDString>(node->getOperand(0));
+      assert(str_node->getString() == expect_name &&
+             "Wrong metadata node name.");
+
+      return node;
+   }
+
+   struct kernel_arg_md {
+      llvm::StringRef type_name;
+      llvm::StringRef access_qual;
+      kernel_arg_md(llvm::StringRef type_name_, llvm::StringRef access_qual_):
+         type_name(type_name_), access_qual(access_qual_) {}
+   };
+
+   std::vector<kernel_arg_md>
+   get_kernel_arg_md(const llvm::Function *kernel_func) {
+      auto num_args = kernel_func->getArgumentList().size();
+
+      auto kernel_node = get_kernel_metadata(kernel_func);
+      auto aq = node_from_op_checked(kernel_node->getOperand(2),
+                                     "kernel_arg_access_qual", num_args + 1);
+      auto ty = node_from_op_checked(kernel_node->getOperand(3),
+                                     "kernel_arg_type", num_args + 1);
+
+      std::vector<kernel_arg_md> res;
+      res.reserve(num_args);
+      for (unsigned i = 0; i < num_args; ++i) {
+         res.push_back(kernel_arg_md(
+            llvm::cast<llvm::MDString>(ty->getOperand(i+1))->getString(),
+            llvm::cast<llvm::MDString>(aq->getOperand(i+1))->getString()));
+      }
+
+      return res;
+   }
+
    std::vector<module::argument>
    get_kernel_args(const llvm::Module *mod, const std::string &kernel_name,
                    const clang::LangAS::Map &address_spaces) {
 
       std::vector<module::argument> args;
       llvm::Function *kernel_func = mod->getFunction(kernel_name);
+      assert(kernel_func && "Kernel name not found in module.");
+      auto arg_md = get_kernel_arg_md(kernel_func);
 
       llvm::DataLayout TD(mod);
+      llvm::Type *size_type =
+         TD.getSmallestLegalIntType(mod->getContext(), sizeof(cl_uint) * 8);
 
-      for (llvm::Function::const_arg_iterator I = kernel_func->arg_begin(),
-                                      E = kernel_func->arg_end(); I != E; ++I) {
-         const llvm::Argument &arg = *I;
+      for (const auto &arg: kernel_func->args()) {
 
          llvm::Type *arg_type = arg.getType();
          const unsigned arg_store_size = TD.getTypeStoreSize(arg_type);
@@ -373,6 +446,59 @@ namespace {
          unsigned target_size = TD.getTypeStoreSize(target_type);
          unsigned target_align = TD.getABITypeAlignment(target_type);
 
+         llvm::StringRef type_name = arg_md[arg.getArgNo()].type_name;
+         llvm::StringRef access_qual = arg_md[arg.getArgNo()].access_qual;
+
+         // Image
+         const bool is_image2d = type_name == "image2d_t";
+         const bool is_image3d = type_name == "image3d_t";
+         if (is_image2d || is_image3d) {
+            const bool is_write_only = access_qual == "write_only";
+            const bool is_read_only = access_qual == "read_only";
+
+            typename module::argument::type marg_type;
+            if (is_image2d && is_read_only) {
+               marg_type = module::argument::image2d_rd;
+            } else if (is_image2d && is_write_only) {
+               marg_type = module::argument::image2d_wr;
+            } else if (is_image3d && is_read_only) {
+               marg_type = module::argument::image3d_rd;
+            } else if (is_image3d && is_write_only) {
+               marg_type = module::argument::image3d_wr;
+            } else {
+               assert(0 && "Wrong image access qualifier");
+            }
+
+            args.push_back(module::argument(marg_type,
+                                            arg_store_size, target_size,
+                                            target_align,
+                                            module::argument::zero_ext));
+            continue;
+         }
+
+         // Image size implicit argument
+         if (type_name == "__llvm_image_size") {
+            args.push_back(module::argument(module::argument::scalar,
+                                            sizeof(cl_uint),
+                                            TD.getTypeStoreSize(size_type),
+                                            TD.getABITypeAlignment(size_type),
+                                            module::argument::zero_ext,
+                                            module::argument::image_size));
+            continue;
+         }
+
+         // Image format implicit argument
+         if (type_name == "__llvm_image_format") {
+            args.push_back(module::argument(module::argument::scalar,
+                                            sizeof(cl_uint),
+                                            TD.getTypeStoreSize(size_type),
+                                            TD.getABITypeAlignment(size_type),
+                                            module::argument::zero_ext,
+                                            module::argument::image_format));
+            continue;
+         }
+
+         // Other types
          if (llvm::isa<llvm::PointerType>(arg_type) && arg.hasByValAttr()) {
             arg_type =
                   llvm::dyn_cast<llvm::PointerType>(arg_type)->getElementType();
@@ -417,9 +543,6 @@ namespace {
       // Append implicit arguments.  XXX - The types, ordering and
       // vector size of the implicit arguments should depend on the
       // target according to the selected calling convention.
-      llvm::Type *size_type =
-         TD.getSmallestLegalIntType(mod->getContext(), sizeof(cl_uint) * 8);
-
       args.push_back(
          module::argument(module::argument::scalar, sizeof(cl_uint),
                           TD.getTypeStoreSize(size_type),

From ab5b7a0fe659ff6f9c1885d5cb047b6531959506 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 2 Aug 2015 14:57:38 +1000
Subject: [PATCH 0875/1208] nir: Use a single bit for the dual-source blend
 index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only values allowed are 0 and 1, and the value is checked before
assigning.

This is a copy of 8eeca7a56c that seems to have been made to the glsl
ir type after it was copied for use in nir but before nir landed.

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/nir/nir.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 62cdbd4a5fb..6c954c0ef7c 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -292,9 +292,13 @@ typedef struct {
       unsigned int driver_location;
 
       /**
-       * output index for dual source blending.
+       * Output index for dual source blending.
+       *
+       * \note
+       * The GLSL spec only allows the values 0 or 1 for the index in \b dual
+       * source blending.
        */
-      int index;
+      unsigned index:1;
 
       /**
        * Initial binding point for a sampler or UBO.

From aa46fba7e61a77bb3b029c7a483b5a2a2a73ff4d Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Mon, 27 Jul 2015 11:27:12 +0200
Subject: [PATCH 0876/1208] clover: fix image resource depth and array_size

---
 src/gallium/state_trackers/clover/core/memory.cpp   | 2 +-
 src/gallium/state_trackers/clover/core/resource.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/state_trackers/clover/core/memory.cpp b/src/gallium/state_trackers/clover/core/memory.cpp
index 055336a3325..b852e6896fe 100644
--- a/src/gallium/state_trackers/clover/core/memory.cpp
+++ b/src/gallium/state_trackers/clover/core/memory.cpp
@@ -189,7 +189,7 @@ image2d::image2d(clover::context &ctx, cl_mem_flags flags,
                  const cl_image_format *format, size_t width,
                  size_t height, size_t row_pitch,
                  void *host_ptr) :
-   image(ctx, flags, format, width, height, 0,
+   image(ctx, flags, format, width, height, 1,
          row_pitch, 0, height * row_pitch, host_ptr) {
 }
 
diff --git a/src/gallium/state_trackers/clover/core/resource.cpp b/src/gallium/state_trackers/clover/core/resource.cpp
index 78ebafb644f..10a29a94eac 100644
--- a/src/gallium/state_trackers/clover/core/resource.cpp
+++ b/src/gallium/state_trackers/clover/core/resource.cpp
@@ -132,6 +132,7 @@ root_resource::root_resource(clover::device &dev, memory_obj &obj,
       info.depth0 = 1;
    }
 
+   info.array_size = 1;
    info.target = translate_target(obj.type());
    info.bind = (PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_COMPUTE_RESOURCE |

From be3622dce383cb930a233b88bb056adb026dce1f Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Thu, 30 Jul 2015 23:35:09 +0200
Subject: [PATCH 0877/1208] clover: handle setKernelArg errors

---
 src/gallium/state_trackers/clover/core/kernel.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 955ff7b3955..a226ec1a752 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -367,6 +367,9 @@ kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
 
 void
 kernel::scalar_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != this->size)
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -435,6 +438,9 @@ kernel::local_argument::set(size_t size, const void *value) {
    if (value)
       throw error(CL_INVALID_ARG_VALUE);
 
+   if (!size)
+      throw error(CL_INVALID_ARG_SIZE);
+
    _storage = size;
    _set = true;
 }
@@ -494,6 +500,9 @@ kernel::constant_argument::unbind(exec_context &ctx) {
 
 void
 kernel::image_rd_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -522,6 +531,9 @@ kernel::image_rd_argument::unbind(exec_context &ctx) {
 
 void
 kernel::image_wr_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -550,6 +562,9 @@ kernel::image_wr_argument::unbind(exec_context &ctx) {
 
 void
 kernel::sampler_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_SAMPLER);
+
    if (size != sizeof(cl_sampler))
       throw error(CL_INVALID_ARG_SIZE);
 

From 44e90f2a556a9b8ede12ae18a7cfa3a71e32d40c Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Thu, 30 Jul 2015 20:11:51 +0200
Subject: [PATCH 0878/1208] r600,compute: force tiling on 2D and 3D texture
 compute resources

To circumvent a problem occuring when LINEAR_ALIGNED array mode is
selected on a TEXTURE_2D RAT.
This configuration causes MEM_RAT STORE_TYPED to write to incorrect
locations.
---
 src/gallium/drivers/radeon/r600_texture.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index e63c37e890c..d3f3e63f219 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -706,6 +706,7 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 				   const struct pipe_resource *templ)
 {
 	const struct util_format_description *desc = util_format_description(templ->format);
+	bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING;
 
 	/* MSAA resources must be 2D tiled. */
 	if (templ->nr_samples > 1)
@@ -715,10 +716,16 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 	if (templ->flags & R600_RESOURCE_FLAG_TRANSFER)
 		return RADEON_SURF_MODE_LINEAR_ALIGNED;
 
+	/* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */
+	if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN &&
+	    (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) &&
+	    (templ->target == PIPE_TEXTURE_2D ||
+	     templ->target == PIPE_TEXTURE_3D))
+		force_tiling = true;
+
 	/* Handle common candidates for the linear mode.
 	 * Compressed textures must always be tiled. */
-	if (!(templ->flags & R600_RESOURCE_FLAG_FORCE_TILING) &&
-	    !util_format_is_compressed(templ->format)) {
+	if (!force_tiling && !util_format_is_compressed(templ->format)) {
 		/* Not everything can be linear, so we cannot enforce it
 		 * for all textures. */
 		if ((rscreen->debug_flags & DBG_NO_TILING) &&

From 875458b778e8d389e00f42269e716a3cb2761fab Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 31 Jul 2015 12:18:37 -0700
Subject: [PATCH 0879/1208] mesa: Add -fno-math-errno to CFLAGS.

Cuts about 9k of .text size.

   text    data     bss     dec     hex filename
4992804  197808   26328 5216940  4f9aac i965_dri.so before
4983676  197808   26328 5207812  4f7704 i965_dri.so after

Also, Darwin's libm does not ever set errno, so if we care about those
systems we shouldn't rely on errno anyway.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 configure.ac | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/configure.ac b/configure.ac
index 248f6183172..dc9272d3016 100644
--- a/configure.ac
+++ b/configure.ac
@@ -286,6 +286,9 @@ if test "x$GCC" = xyes; then
     # Work around aliasing bugs - developers should comment this out
     CFLAGS="$CFLAGS -fno-strict-aliasing"
 
+    # We don't want floating-point math functions to set errno
+    CFLAGS="$CFLAGS -fno-math-errno"
+
     # gcc's builtin memcmp is slower than glibc's
     # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
     CFLAGS="$CFLAGS -fno-builtin-memcmp"

From f55c408067a3ea3529fcf7cbbaa1a041a4a8849d Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 31 Jul 2015 12:19:46 -0700
Subject: [PATCH 0880/1208] mesa: Add -fno-trapping-math to CFLAGS.

Cuts about 1k of .text size.

   text    data     bss     dec     hex filename
4983676  197808   26328 5207812  4f7704 i965_dri.so before
4982522  197800   26328 5206650  4f727a i965_dri.so after

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 configure.ac | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index dc9272d3016..36197d3396a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -286,8 +286,8 @@ if test "x$GCC" = xyes; then
     # Work around aliasing bugs - developers should comment this out
     CFLAGS="$CFLAGS -fno-strict-aliasing"
 
-    # We don't want floating-point math functions to set errno
-    CFLAGS="$CFLAGS -fno-math-errno"
+    # We don't want floating-point math functions to set errno or trap
+    CFLAGS="$CFLAGS -fno-math-errno -fno-trapping-math"
 
     # gcc's builtin memcmp is slower than glibc's
     # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052

From 594fc0f85953d11c455e7ab549308a773b312d70 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 25 Jun 2015 16:47:52 -0700
Subject: [PATCH 0881/1208] mesa: Replace F_TO_I() with _mesa_lroundevenf().

I'm not sure what the true meaning of "The rounding mode may vary." is,
but it is the case that the IROUND() path rounds differently than the
other paths (and does it wrong, at that).

Like _mesa_roundeven{f,}(), just add an use _mesa_lroundeven{f,}() that
has known semantics.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/format_utils.h  |  5 +++--
 src/mesa/main/imports.h       | 28 ----------------------------
 src/mesa/main/macros.h        | 11 ++++++-----
 src/mesa/main/pack.c          |  4 ++--
 src/mesa/main/pixeltransfer.c | 11 ++++++-----
 src/util/rounding.h           | 25 +++++++++++++++++++++++++
 6 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h
index 7f500ec78da..00ec7774dd2 100644
--- a/src/mesa/main/format_utils.h
+++ b/src/mesa/main/format_utils.h
@@ -33,6 +33,7 @@
 
 #include "imports.h"
 #include "macros.h"
+#include "util/rounding.h"
 
 extern const mesa_array_format RGBA32_FLOAT;
 extern const mesa_array_format RGBA8_UBYTE;
@@ -84,7 +85,7 @@ _mesa_float_to_unorm(float x, unsigned dst_bits)
    else if (x > 1.0f)
       return MAX_UINT(dst_bits);
    else
-      return F_TO_I(x * MAX_UINT(dst_bits));
+      return _mesa_lroundevenf(x * MAX_UINT(dst_bits));
 }
 
 static inline unsigned
@@ -128,7 +129,7 @@ _mesa_float_to_snorm(float x, unsigned dst_bits)
    else if (x > 1.0f)
       return MAX_INT(dst_bits);
    else
-      return F_TO_I(x * MAX_INT(dst_bits));
+      return _mesa_lroundevenf(x * MAX_INT(dst_bits));
 }
 
 static inline int
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index 9ffe3decd0f..d61279ac4e5 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -170,34 +170,6 @@ static inline int IROUND_POS(float f)
    return (int) (f + 0.5F);
 }
 
-#ifdef __x86_64__
-#  include <xmmintrin.h>
-#endif
-
-/**
- * Convert float to int using a fast method.  The rounding mode may vary.
- */
-static inline int F_TO_I(float f)
-{
-#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
-   int r;
-   __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st");
-   return r;
-#elif defined(USE_X86_ASM) && defined(_MSC_VER)
-   int r;
-   _asm {
-	 fld f
-	 fistp r
-	}
-   return r;
-#elif defined(__x86_64__)
-   return _mm_cvt_ss2si(_mm_load_ss(&f));
-#else
-   return IROUND(f);
-#endif
-}
-
-
 /** Return (as an integer) floor of float */
 static inline int IFLOOR(float f)
 {
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 07919a6e1e4..54df50c9cfe 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -33,6 +33,7 @@
 
 #include "util/macros.h"
 #include "util/u_math.h"
+#include "util/rounding.h"
 #include "imports.h"
 
 
@@ -131,12 +132,12 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
 #define INT_TO_USHORT(i)   ((i) < 0 ? 0 : ((GLushort) ((i) >> 15)))
 #define UINT_TO_USHORT(i)  ((i) < 0 ? 0 : ((GLushort) ((i) >> 16)))
 #define UNCLAMPED_FLOAT_TO_USHORT(us, f)  \
-        us = ( (GLushort) F_TO_I( CLAMP((f), 0.0F, 1.0F) * 65535.0F) )
+        us = ( (GLushort) _mesa_lroundevenf( CLAMP((f), 0.0F, 1.0F) * 65535.0F) )
 #define CLAMPED_FLOAT_TO_USHORT(us, f)  \
-        us = ( (GLushort) F_TO_I( (f) * 65535.0F) )
+        us = ( (GLushort) _mesa_lroundevenf( (f) * 65535.0F) )
 
 #define UNCLAMPED_FLOAT_TO_SHORT(s, f)  \
-        s = ( (GLshort) F_TO_I( CLAMP((f), -1.0F, 1.0F) * 32767.0F) )
+        s = ( (GLshort) _mesa_lroundevenf( CLAMP((f), -1.0F, 1.0F) * 32767.0F) )
 
 /***
  *** UNCLAMPED_FLOAT_TO_UBYTE: clamp float to [0,1] and map to ubyte in [0,255]
@@ -167,9 +168,9 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
         } while (0)
 #else
 #define UNCLAMPED_FLOAT_TO_UBYTE(ub, f) \
-	ub = ((GLubyte) F_TO_I(CLAMP((f), 0.0F, 1.0F) * 255.0F))
+	ub = ((GLubyte) _mesa_lroundevenf(CLAMP((f), 0.0F, 1.0F) * 255.0F))
 #define CLAMPED_FLOAT_TO_UBYTE(ub, f) \
-	ub = ((GLubyte) F_TO_I((f) * 255.0F))
+	ub = ((GLubyte) _mesa_lroundevenf((f) * 255.0F))
 #endif
 
 static fi_type UINT_AS_UNION(GLuint u)
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index fb642b85be8..7147fd6e4fe 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -470,7 +470,7 @@ extract_uint_indexes(GLuint n, GLuint indexes[],
 static inline GLuint
 clamp_float_to_uint(GLfloat f)
 {
-   return f < 0.0F ? 0 : F_TO_I(f);
+   return f < 0.0F ? 0 : _mesa_lroundevenf(f);
 }
 
 
@@ -478,7 +478,7 @@ static inline GLuint
 clamp_half_to_uint(GLhalfARB h)
 {
    GLfloat f = _mesa_half_to_float(h);
-   return f < 0.0F ? 0 : F_TO_I(f);
+   return f < 0.0F ? 0 : _mesa_lroundevenf(f);
 }
 
 
diff --git a/src/mesa/main/pixeltransfer.c b/src/mesa/main/pixeltransfer.c
index 51f2ebf768f..22eac00a7df 100644
--- a/src/mesa/main/pixeltransfer.c
+++ b/src/mesa/main/pixeltransfer.c
@@ -35,6 +35,7 @@
 #include "pixeltransfer.h"
 #include "imports.h"
 #include "mtypes.h"
+#include "util/rounding.h"
 
 
 /*
@@ -94,10 +95,10 @@ _mesa_map_rgba( const struct gl_context *ctx, GLuint n, GLfloat rgba[][4] )
       GLfloat g = CLAMP(rgba[i][GCOMP], 0.0F, 1.0F);
       GLfloat b = CLAMP(rgba[i][BCOMP], 0.0F, 1.0F);
       GLfloat a = CLAMP(rgba[i][ACOMP], 0.0F, 1.0F);
-      rgba[i][RCOMP] = rMap[F_TO_I(r * rscale)];
-      rgba[i][GCOMP] = gMap[F_TO_I(g * gscale)];
-      rgba[i][BCOMP] = bMap[F_TO_I(b * bscale)];
-      rgba[i][ACOMP] = aMap[F_TO_I(a * ascale)];
+      rgba[i][RCOMP] = rMap[(int)_mesa_lroundevenf(r * rscale)];
+      rgba[i][GCOMP] = gMap[(int)_mesa_lroundevenf(g * gscale)];
+      rgba[i][BCOMP] = bMap[(int)_mesa_lroundevenf(b * bscale)];
+      rgba[i][ACOMP] = aMap[(int)_mesa_lroundevenf(a * ascale)];
    }
 }
 
@@ -236,7 +237,7 @@ _mesa_apply_ci_transfer_ops(const struct gl_context *ctx,
       GLuint i;
       for (i = 0; i < n; i++) {
          const GLuint j = indexes[i] & mask;
-         indexes[i] = F_TO_I(ctx->PixelMaps.ItoI.Map[j]);
+         indexes[i] = _mesa_lroundevenf(ctx->PixelMaps.ItoI.Map[j]);
       }
    }
 }
diff --git a/src/util/rounding.h b/src/util/rounding.h
index 0cbe9269f7b..088cf86dd08 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -21,6 +21,9 @@
  * IN THE SOFTWARE.
  */
 
+#ifndef _ROUNDING_H
+#define _ROUNDING_H
+
 #include <math.h>
 
 #ifdef __SSE4_1__
@@ -76,3 +79,25 @@ _mesa_roundeven(double x)
    return rint(x);
 #endif
 }
+
+/**
+ * \brief Rounds \c x to the nearest integer, with ties to the even integer,
+ * and returns the value as a long int.
+ */
+static inline long
+_mesa_lroundevenf(float x)
+{
+   return lrintf(x);
+}
+
+/**
+ * \brief Rounds \c x to the nearest integer, with ties to the even integer,
+ * and returns the value as a long int.
+ */
+static inline long
+_mesa_lroundeven(double x)
+{
+   return lrint(x);
+}
+
+#endif

From abf4fa3c03ebe5716c90c8a310945c3621cf598f Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 12:08:09 +0200
Subject: [PATCH 0882/1208] i965/nir/vec4: Add implementation placeholders for
 a new NIR->vec4 pass

This patch will add a brw_vec4_nir.cpp file filled with entry point methods to
the main functionality, following a structure similar to brw_fs_nir.cpp.

Subsequent patches in this series will be adding the implementations for these
methods, incrementally.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources |   1 +
 src/mesa/drivers/dri/i965/brw_vec4.h       |  19 ++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 253 +++++++++++++++++++++
 3 files changed, 273 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index e861c2ccad5..3585126526a 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -125,6 +125,7 @@ i965_FILES = \
 	brw_vec4.h \
 	brw_vec4_live_variables.cpp \
 	brw_vec4_live_variables.h \
+	brw_vec4_nir.cpp \
 	brw_vec4_reg_allocate.cpp \
 	brw_vec4_visitor.cpp \
 	brw_vec4_vp.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index bf6183a74a6..f6f76f56fba 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -45,6 +45,7 @@ extern "C" {
 #endif
 
 #include "glsl/ir.h"
+#include "glsl/nir/nir.h"
 
 
 #ifdef __cplusplus
@@ -389,6 +390,24 @@ public:
 
    void visit_atomic_counter_intrinsic(ir_call *ir);
 
+   virtual void emit_nir_code();
+   virtual void nir_setup_inputs(nir_shader *shader);
+   virtual void nir_setup_uniforms(nir_shader *shader);
+   virtual void nir_setup_uniform(nir_variable *var);
+   virtual void nir_setup_builtin_uniform(nir_variable *var);
+   virtual void nir_setup_system_values(nir_shader *shader);
+   virtual void nir_emit_impl(nir_function_impl *impl);
+   virtual void nir_emit_cf_list(exec_list *list);
+   virtual void nir_emit_if(nir_if *if_stmt);
+   virtual void nir_emit_loop(nir_loop *loop);
+   virtual void nir_emit_block(nir_block *block);
+   virtual void nir_emit_instr(nir_instr *instr);
+   virtual void nir_emit_load_const(nir_load_const_instr *instr);
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_emit_alu(nir_alu_instr *instr);
+   virtual void nir_emit_jump(nir_jump_instr *instr);
+   virtual void nir_emit_texture(nir_tex_instr *instr);
+
 protected:
    void emit_vertex();
    void lower_attributes_to_hw_regs(const int *attribute_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
new file mode 100644
index 00000000000..7ec29784737
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4.h"
+#include "glsl/ir_uniform.h"
+
+namespace brw {
+
+void
+vec4_visitor::emit_nir_code()
+{
+   nir_shader *nir = prog->nir;
+
+   if (nir->num_inputs > 0)
+      nir_setup_inputs(nir);
+
+   if (nir->num_uniforms > 0)
+      nir_setup_uniforms(nir);
+
+   nir_setup_system_values(nir);
+
+   /* get the main function and emit it */
+   nir_foreach_overload(nir, overload) {
+      assert(strcmp(overload->function->name, "main") == 0);
+      assert(overload->impl);
+      nir_emit_impl(overload->impl);
+   }
+}
+
+static bool
+setup_system_values_block(nir_block *block, void *void_visitor)
+{
+   /* @TODO: Not yet implemented */
+   return true;
+}
+
+void
+vec4_visitor::nir_setup_system_values(nir_shader *shader)
+{
+   nir_foreach_overload(shader, overload) {
+      assert(strcmp(overload->function->name, "main") == 0);
+      assert(overload->impl);
+      nir_foreach_block(overload->impl, setup_system_values_block, this);
+   }
+}
+
+void
+vec4_visitor::nir_setup_inputs(nir_shader *shader)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_setup_uniforms(nir_shader *shader)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_setup_uniform(nir_variable *var)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_setup_builtin_uniform(nir_variable *var)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_if:
+         nir_emit_if(nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         nir_emit_loop(nir_cf_node_as_loop(node));
+         break;
+
+      case nir_cf_node_block:
+         nir_emit_block(nir_cf_node_as_block(node));
+         break;
+
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+void
+vec4_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_loop(nir_loop *loop)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_block(nir_block *block)
+{
+   nir_foreach_instr(block, instr) {
+      nir_emit_instr(instr);
+   }
+}
+
+void
+vec4_visitor::nir_emit_instr(nir_instr *instr)
+{
+   this->base_ir = instr;
+
+   switch (instr->type) {
+   case nir_instr_type_load_const:
+      nir_emit_load_const(nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_intrinsic:
+      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      break;
+
+   case nir_instr_type_alu:
+      nir_emit_alu(nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_jump:
+      nir_emit_jump(nir_instr_as_jump(instr));
+      break;
+
+   case nir_instr_type_tex:
+      nir_emit_texture(nir_instr_as_tex(instr));
+      break;
+
+   default:
+      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
+      break;
+   }
+}
+
+void
+vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+
+   case nir_intrinsic_load_input_indirect:
+      /* fallthrough */
+   case nir_intrinsic_load_input:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_store_output_indirect:
+      /* fallthrough */
+   case nir_intrinsic_store_output:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_load_base_vertex:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_load_instance_id:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_load_uniform_indirect:
+      /* fallthrough */
+   case nir_intrinsic_load_uniform:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_dec:
+      /* @TODO: Not yet implemented */
+      break;
+
+   case nir_intrinsic_load_ubo_indirect:
+      /* fallthrough */
+   case nir_intrinsic_load_ubo:
+      /* @TODO: Not yet implemented */
+      break;
+
+   default:
+      unreachable("Unknown intrinsic");
+   }
+}
+
+void
+vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
+{
+   /* @TODO: Not yet implemented */
+}
+
+void
+vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
+{
+   /* @TODO: Not yet implemented */
+}
+
+}

From 47d68908f2c3ad3e9011a2cf910b04cd3300673a Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 12:26:39 +0200
Subject: [PATCH 0883/1208] i965/nir/vec4: Select between new nir_vec4 or
 current vec4_visitor code-paths

The NIR->vec4 pass will be activated if both the following conditions are met:

* INTEL_USE_NIR environment variable is defined and is positive (1 or true)
* The stage is vertex shader (support for geometry shaders and
  ARB_vertex_program will be added later).

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 14 ++++++++------
 src/mesa/drivers/dri/i965/brw_vec4.cpp   | 18 ++++++++++++++----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 58587b24403..524798c6ac6 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -122,12 +122,14 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;
    compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
 
-   if (compiler->scalar_vs) {
-      /* If we're using the scalar backend for vertex shaders, we need to
-       * configure these accordingly.
-       */
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+   if (compiler->scalar_vs || brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+      if (compiler->scalar_vs) {
+         /* If we're using the scalar backend for vertex shaders, we need to
+          * configure these accordingly.
+          */
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+      }
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
 
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 53270fb6eba..ce04f1b2173 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1709,6 +1709,9 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
 bool
 vec4_visitor::run(gl_clip_plane *clip_planes)
 {
+   bool use_vec4_nir =
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL;
+
    sanity_param_count = prog->Parameters->NumParameters;
 
    if (shader_time_index >= 0)
@@ -1718,11 +1721,18 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
 
    emit_prolog();
 
-   /* Generate VS IR for main().  (the visitor only descends into
-    * functions called "main").
-    */
    if (shader) {
-      visit_instructions(shader->base.ir);
+      if (use_vec4_nir) {
+         assert(prog->nir != NULL);
+         emit_nir_code();
+         if (failed)
+            return false;
+      } else {
+         /* Generate VS IR for main().  (the visitor only descends into
+          * functions called "main").
+          */
+         visit_instructions(shader->base.ir);
+      }
    } else {
       emit_program_code();
    }

From 78e7ce2b7329f8cc3f771afbf39d3fa662e02d9e Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 13:39:48 +0200
Subject: [PATCH 0884/1208] i965/vec4: Move type_size() method to
 brw_vec4_visitor class

The type_size() method is currently accessible only in the implementation
of vec4_visitor. Since we need to reuse it in the upcoming NIR->vec4 pass,
lets make it a method of the class instead.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h          |  2 ++
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 21 +++++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index f6f76f56fba..8d688f2764a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -390,6 +390,8 @@ public:
 
    void visit_atomic_counter_intrinsic(ir_call *ir);
 
+   int type_size(const struct glsl_type *type);
+
    virtual void emit_nir_code();
    virtual void nir_setup_inputs(nir_shader *shader);
    virtual void nir_setup_uniforms(nir_shader *shader);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index bf17fe6baa7..aeb31fdc0e2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -572,9 +572,18 @@ vec4_visitor::visit_instructions(const exec_list *list)
    }
 }
 
-
-static int
-type_size(const struct glsl_type *type)
+/**
+ * Returns the minimum number of vec4 elements needed to pack a type.
+ *
+ * For simple types, it will return 1 (a single vec4); for matrices, the
+ * number of columns; for array and struct, the sum of the vec4_size of
+ * each of its elements; and for sampler and atomic, zero.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ */
+int
+vec4_visitor::type_size(const struct glsl_type *type)
 {
    unsigned int i;
    int size;
@@ -629,7 +638,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type));
+   this->reg = v->alloc.allocate(v->type_size(type));
 
    if (type->is_array() || type->is_record()) {
       this->swizzle = BRW_SWIZZLE_NOOP;
@@ -647,7 +656,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type) * size);
+   this->reg = v->alloc.allocate(v->type_size(type) * size);
 
    this->swizzle = BRW_SWIZZLE_NOOP;
 
@@ -659,7 +668,7 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type));
+   this->reg = v->alloc.allocate(v->type_size(type));
 
    if (type->is_array() || type->is_record()) {
       this->writemask = WRITEMASK_XYZW;

From b929acb6a8659fdc06623b766bdf59904d8a3558 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 13:50:43 +0200
Subject: [PATCH 0885/1208] i965/nir/vec4: Add setup of input variables in
 NIR->vec4 pass

This implementation sets up a map of input variable offsets to source registers
that are already initialized with the corresponding register offset.

This map will then be queried when processing load_input intrinsic operations,
to obtain the correct register source from which the input data will be loaded.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h       |  2 ++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 8d688f2764a..c286689ed58 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -410,6 +410,8 @@ public:
    virtual void nir_emit_jump(nir_jump_instr *instr);
    virtual void nir_emit_texture(nir_tex_instr *instr);
 
+   src_reg *nir_inputs;
+
 protected:
    void emit_vertex();
    void lower_attributes_to_hw_regs(const int *attribute_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 7ec29784737..15a180519dd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -68,7 +68,16 @@ vec4_visitor::nir_setup_system_values(nir_shader *shader)
 void
 vec4_visitor::nir_setup_inputs(nir_shader *shader)
 {
-   /* @TODO: Not yet implemented */
+   nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs);
+
+   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
+      int offset = var->data.driver_location;
+      unsigned size = type_size(var->type);
+      for (unsigned i = 0; i < size; i++) {
+         src_reg src = src_reg(ATTR, var->data.location + i, var->type);
+         nir_inputs[offset + i] = src;
+      }
+   }
 }
 
 void

From 195156e571e851273c135847f91ed73b3bfc1914 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 16 Jun 2015 14:30:31 +0200
Subject: [PATCH 0886/1208] i965/nir/vec4: Add setup of uniform variables

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h       |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 99 +++++++++++++++++++++-
 2 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index c286689ed58..98bfb7ce847 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -411,6 +411,7 @@ public:
    virtual void nir_emit_texture(nir_tex_instr *instr);
 
    src_reg *nir_inputs;
+   unsigned *nir_uniform_driver_location;
 
 protected:
    void emit_vertex();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 15a180519dd..feafcbe3cce 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -83,19 +83,112 @@ vec4_visitor::nir_setup_inputs(nir_shader *shader)
 void
 vec4_visitor::nir_setup_uniforms(nir_shader *shader)
 {
-   /* @TODO: Not yet implemented */
+   uniforms = 0;
+
+   nir_uniform_driver_location =
+      rzalloc_array(mem_ctx, unsigned, this->uniform_array_size);
+
+   if (shader_prog) {
+      foreach_list_typed(nir_variable, var, node, &shader->uniforms) {
+         /* UBO's, atomics and samplers don't take up space in the
+            uniform file */
+         if (var->interface_type != NULL || var->type->contains_atomic() ||
+             type_size(var->type) == 0) {
+            continue;
+         }
+
+         assert(uniforms < uniform_array_size);
+         this->uniform_size[uniforms] = type_size(var->type);
+
+         if (strncmp(var->name, "gl_", 3) == 0)
+            nir_setup_builtin_uniform(var);
+         else
+            nir_setup_uniform(var);
+      }
+   } else {
+      /* ARB_vertex_program is not supported yet */
+      assert("Not implemented");
+   }
 }
 
 void
 vec4_visitor::nir_setup_uniform(nir_variable *var)
 {
-   /* @TODO: Not yet implemented */
+   int namelen = strlen(var->name);
+
+   /* The data for our (non-builtin) uniforms is stored in a series of
+    * gl_uniform_driver_storage structs for each subcomponent that
+    * glGetUniformLocation() could name.  We know it's been set up in the same
+    * order we'd walk the type, so walk the list of storage and find anything
+    * with our name, or the prefix of a component that starts with our name.
+    */
+    for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
+       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
+
+       if (storage->builtin)
+          continue;
+
+       if (strncmp(var->name, storage->name, namelen) != 0 ||
+           (storage->name[namelen] != 0 &&
+            storage->name[namelen] != '.' &&
+            storage->name[namelen] != '[')) {
+          continue;
+       }
+
+       gl_constant_value *components = storage->storage;
+       unsigned vector_count = (MAX2(storage->array_elements, 1) *
+                                storage->type->matrix_columns);
+
+       for (unsigned s = 0; s < vector_count; s++) {
+          assert(uniforms < uniform_array_size);
+          uniform_vector_size[uniforms] = storage->type->vector_elements;
+
+          int i;
+          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
+             stage_prog_data->param[uniforms * 4 + i] = components;
+             components++;
+          }
+          for (; i < 4; i++) {
+             static const gl_constant_value zero = { 0.0 };
+             stage_prog_data->param[uniforms * 4 + i] = &zero;
+          }
+
+          nir_uniform_driver_location[uniforms] = var->data.driver_location;
+          uniforms++;
+       }
+    }
 }
 
 void
 vec4_visitor::nir_setup_builtin_uniform(nir_variable *var)
 {
-   /* @TODO: Not yet implemented */
+   const nir_state_slot *const slots = var->state_slots;
+   assert(var->state_slots != NULL);
+
+   for (unsigned int i = 0; i < var->num_state_slots; i++) {
+      /* This state reference has already been setup by ir_to_mesa,
+       * but we'll get the same index back here.  We can reference
+       * ParameterValues directly, since unlike brw_fs.cpp, we never
+       * add new state references during compile.
+       */
+      int index = _mesa_add_state_reference(this->prog->Parameters,
+					    (gl_state_index *)slots[i].tokens);
+      gl_constant_value *values =
+         &this->prog->Parameters->ParameterValues[index][0];
+
+      assert(uniforms < uniform_array_size);
+
+      for (unsigned j = 0; j < 4; j++)
+         stage_prog_data->param[uniforms * 4 + j] =
+            &values[GET_SWZ(slots[i].swizzle, j)];
+
+      this->uniform_vector_size[uniforms] =
+         (var->type->is_scalar() || var->type->is_vector() ||
+          var->type->is_matrix() ? var->type->vector_elements : 4);
+
+      nir_uniform_driver_location[uniforms] = var->data.driver_location;
+      uniforms++;
+   }
 }
 
 void

From 01c5617c8edc2f392363e9f8861d62a9fc9aa973 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 16 Jun 2015 17:01:29 +0200
Subject: [PATCH 0887/1208] i965/vec4: Redefine make_reg_for_system_value() to
 allow reuse in NIR->vec4 pass

The new virtual method is more flexible, it has a signature:

dst_reg *make_reg_for_system_value(int location, const glsl_type *type);

v2 (Jason Ekstrand):
   Use the new version in unit tests so make check passes again

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h                      | 4 +++-
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp         | 7 ++++---
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h           | 3 ++-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp            | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp         | 5 +++--
 src/mesa/drivers/dri/i965/brw_vs.h                        | 3 ++-
 src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp  | 3 ++-
 src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp | 3 ++-
 8 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 98bfb7ce847..d429310563f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -410,6 +410,9 @@ public:
    virtual void nir_emit_jump(nir_jump_instr *instr);
    virtual void nir_emit_texture(nir_tex_instr *instr);
 
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type) = 0;
+
    src_reg *nir_inputs;
    unsigned *nir_uniform_driver_location;
 
@@ -419,7 +422,6 @@ protected:
                                     bool interleaved);
    void setup_payload_interference(struct ra_graph *g, int first_payload_node,
                                    int reg_node_count);
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir) = 0;
    virtual void assign_binding_table_offsets();
    virtual void setup_payload() = 0;
    virtual void emit_prolog() = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index d6b350bea3f..704644e7429 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -51,11 +51,12 @@ vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
 
 
 dst_reg *
-vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
+vec4_gs_visitor::make_reg_for_system_value(int location,
+                                           const glsl_type *type)
 {
-   dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
+   dst_reg *reg = new(mem_ctx) dst_reg(this, type);
 
-   switch (ir->data.location) {
+   switch (location) {
    case SYSTEM_VALUE_INVOCATION_ID:
       this->current_annotation = "initialize gl_InvocationID";
       emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index e51399d620d..54cd2a1ff34 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -76,7 +76,8 @@ public:
                    int shader_time_index);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_program_code();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index aeb31fdc0e2..17604cdaecc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1094,7 +1094,7 @@ vec4_visitor::visit(ir_variable *ir)
       break;
 
    case ir_var_system_value:
-      reg = make_reg_for_system_value(ir);
+      reg = make_reg_for_system_value(ir->data.location, ir->type);
       break;
 
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index b7ec8b9c169..620f652d6dc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -143,7 +143,8 @@ vec4_vs_visitor::emit_prolog()
 
 
 dst_reg *
-vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
+vec4_vs_visitor::make_reg_for_system_value(int location,
+                                           const glsl_type *type)
 {
    /* VertexID is stored by the VF as the last vertex element, but
     * we don't represent it with a flag in inputs_read, so we call
@@ -151,7 +152,7 @@ vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
     */
    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 
-   switch (ir->data.location) {
+   switch (location) {
    case SYSTEM_VALUE_BASE_VERTEX:
       reg->writemask = WRITEMASK_X;
       vs_prog_data->uses_vertexid = true;
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 88927eeb3eb..1d9bee11c56 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -95,7 +95,8 @@ public:
                    bool use_legacy_snorm_formula);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_program_code();
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 84e43fa75cd..fbd9fa8f19b 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -53,7 +53,8 @@ public:
    }
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index de2afd39cfe..a3055fcc851 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -56,7 +56,8 @@ public:
    }
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }

From 4023b55fdd7005a8a100637c229a1c40648cdd2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 16 Jun 2015 17:08:04 +0200
Subject: [PATCH 0888/1208] i965/nir/vec4: Add setup for system values

Similar to other variable setups, system values will initialize the
corresponding register inside a 'nir_system_values' map, which will then
be queried later when processing the different system value intrinsics
for the appropriate register.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h       |  2 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 49 +++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index d429310563f..722f9a1f4c5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -397,6 +397,7 @@ public:
    virtual void nir_setup_uniforms(nir_shader *shader);
    virtual void nir_setup_uniform(nir_variable *var);
    virtual void nir_setup_builtin_uniform(nir_variable *var);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void nir_setup_system_values(nir_shader *shader);
    virtual void nir_emit_impl(nir_function_impl *impl);
    virtual void nir_emit_cf_list(exec_list *list);
@@ -415,6 +416,7 @@ public:
 
    src_reg *nir_inputs;
    unsigned *nir_uniform_driver_location;
+   dst_reg *nir_system_values;
 
 protected:
    void emit_vertex();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index feafcbe3cce..989b8e3b6b5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -48,16 +48,63 @@ vec4_visitor::emit_nir_code()
    }
 }
 
+void
+vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg *reg;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id().");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+      reg = &this->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+      if (reg->file == BAD_FILE)
+         *reg =
+            *this->make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
+                                             glsl_type::int_type);
+      break;
+
+   case nir_intrinsic_load_base_vertex:
+      reg = &this->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+      if (reg->file == BAD_FILE)
+         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX,
+                                                 glsl_type::int_type);
+      break;
+
+   case nir_intrinsic_load_instance_id:
+      reg = &this->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+      if (reg->file == BAD_FILE)
+         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID,
+                                                 glsl_type::int_type);
+      break;
+
+   default:
+      break;
+   }
+}
+
 static bool
 setup_system_values_block(nir_block *block, void *void_visitor)
 {
-   /* @TODO: Not yet implemented */
+   vec4_visitor *v = (vec4_visitor *)void_visitor;
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      v->nir_setup_system_value_intrinsic(intrin);
+   }
+
    return true;
 }
 
 void
 vec4_visitor::nir_setup_system_values(nir_shader *shader)
 {
+   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
+
    nir_foreach_overload(shader, overload) {
       assert(strcmp(overload->function->name, "main") == 0);
       assert(overload->impl);

From 59006d3ad3ed5d29e84afa5931f425344e2ef658 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 22 Jul 2015 09:34:35 +0200
Subject: [PATCH 0889/1208] i965/nir/vec4: Add shader function implementation

It basically allocates registers local to a function in a nir_locals map,
then emits all its control-flow blocks.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h       |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 722f9a1f4c5..01f88e641e7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -414,6 +414,7 @@ public:
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type) = 0;
 
+   dst_reg *nir_locals;
    src_reg *nir_inputs;
    unsigned *nir_uniform_driver_location;
    dst_reg *nir_system_values;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 989b8e3b6b5..f8bcbb7cd42 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -241,7 +241,16 @@ vec4_visitor::nir_setup_builtin_uniform(nir_variable *var)
 void
 vec4_visitor::nir_emit_impl(nir_function_impl *impl)
 {
-   /* @TODO: Not yet implemented */
+   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
+
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      unsigned array_elems =
+         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
+
+      nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems));
+   }
+
+   nir_emit_cf_list(&impl->body);
 }
 
 void

From 5e839727ed2378a01d3b657bad83abd4728e8da6 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 22 Jul 2015 09:35:28 +0200
Subject: [PATCH 0890/1208] i965/nir: Pass a is_scalar boolean to
 brw_create_nir()

The upcoming introduction of NIR->vec4 pass will require that some NIR
lowering passes are enabled/disabled depending on the type of shader
(scalar vs. vector).

With this patch we pass a 'is_scalar' variable to the process of
constructing the NIR, to let an external context decide how the shader
should be handled.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c      | 3 ++-
 src/mesa/drivers/dri/i965/brw_nir.h      | 3 ++-
 src/mesa/drivers/dri/i965/brw_program.c  | 5 +++--
 src/mesa/drivers/dri/i965/brw_shader.cpp | 6 ++++--
 src/mesa/drivers/dri/i965/brw_vec4.cpp   | 2 +-
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 3e154c10526..4aa893aff50 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -61,7 +61,8 @@ nir_shader *
 brw_create_nir(struct brw_context *brw,
                const struct gl_shader_program *shader_prog,
                const struct gl_program *prog,
-               gl_shader_stage stage)
+               gl_shader_stage stage,
+               bool is_scalar)
 {
    struct gl_context *ctx = &brw->ctx;
    const nir_shader_compiler_options *options =
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 313110997bf..c76defd86ca 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -77,7 +77,8 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir);
 nir_shader *brw_create_nir(struct brw_context *brw,
                            const struct gl_shader_program *shader_prog,
                            const struct gl_program *prog,
-                           gl_shader_stage stage);
+                           gl_shader_stage stage,
+                           bool is_scalar);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 85e271d2351..467a8934180 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -143,7 +143,7 @@ brwProgramStringNotify(struct gl_context *ctx,
       brw_add_texrect_params(prog);
 
       if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
-         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT);
+         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
       }
 
       brw_fs_precompile(ctx, NULL, prog);
@@ -169,7 +169,8 @@ brwProgramStringNotify(struct gl_context *ctx,
       brw_add_texrect_params(prog);
 
       if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
-         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX);
+         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
+                                    brw->intelScreen->compiler->scalar_vs);
       }
 
       brw_vs_precompile(ctx, NULL, prog);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 524798c6ac6..7c5095ddce3 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -398,8 +398,10 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       brw_add_texrect_params(prog);
 
-      if (options->NirOptions)
-         prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage);
+      if (options->NirOptions) {
+         prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage,
+                                    is_scalar_shader_stage(brw, stage));
+      }
 
       _mesa_reference_program(ctx, &prog, NULL);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index ce04f1b2173..4e5518588c6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1919,7 +1919,7 @@ brw_vs_emit(struct brw_context *brw,
           */
          assert(vp->Base.Id == 0 && prog == NULL);
          vp->Base.nir =
-            brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX);
+            brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX, true);
       }
 
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

From 01f6235020f9f0c2bc1a6e6ea9bd15c22fb2bcf5 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 18 Jun 2015 13:52:21 +0200
Subject: [PATCH 0891/1208] nir/nir_lower_io: Add vec4 support

The current implementation operates in scalar mode only, so add a vec4
mode where types are padded to vec4 sizes.

This will be useful in the i965 driver for its vec4 nir backend
(and possbly other drivers that have vec4-based shaders).

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir.h                  | 16 +++---
 src/glsl/nir/nir_lower_io.c         | 87 ++++++++++++++++++++++-------
 src/mesa/drivers/dri/i965/brw_nir.c | 14 +++--
 3 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 6c954c0ef7c..d9daaa2e7e2 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1641,14 +1641,16 @@ void nir_lower_global_vars_to_local(nir_shader *shader);
 
 void nir_lower_locals_to_regs(nir_shader *shader);
 
-void nir_assign_var_locations_scalar(struct exec_list *var_list,
-                                     unsigned *size);
-void nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
-                                                  struct exec_list *var_list,
-                                                  unsigned *direct_size,
-                                                  unsigned *size);
+void nir_assign_var_locations(struct exec_list *var_list,
+                              unsigned *size,
+                              bool is_scalar);
+void nir_assign_var_locations_direct_first(nir_shader *shader,
+                                           struct exec_list *var_list,
+                                           unsigned *direct_size,
+                                           unsigned *size,
+                                           bool is_scalar);
 
-void nir_lower_io(nir_shader *shader);
+void nir_lower_io(nir_shader *shader, bool is_scalar);
 
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index f6e5e2beda4..3c179299bd9 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -29,19 +29,56 @@
 /*
  * This lowering pass converts references to input/output variables with
  * loads/stores to actual input/output intrinsics.
- *
- * NOTE: This pass really only works for scalar backends at the moment due
- * to the way it packes the input/output data.
  */
 
 #include "nir.h"
 
 struct lower_io_state {
    void *mem_ctx;
+   bool is_scalar;
 };
 
+static int
+type_size_vec4(const struct glsl_type *type)
+{
+   unsigned int i;
+   int size;
+
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      if (glsl_type_is_matrix(type)) {
+         return glsl_get_matrix_columns(type);
+      } else {
+         return 1;
+      }
+   case GLSL_TYPE_ARRAY:
+      return type_size_vec4(glsl_get_array_element(type)) * glsl_get_length(type);
+   case GLSL_TYPE_STRUCT:
+      size = 0;
+      for (i = 0; i <  glsl_get_length(type); i++) {
+         size += type_size_vec4(glsl_get_struct_field(type, i));
+      }
+      return size;
+   case GLSL_TYPE_SAMPLER:
+      return 0;
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+   case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
 static unsigned
-type_size(const struct glsl_type *type)
+type_size_scalar(const struct glsl_type *type)
 {
    unsigned int size, i;
 
@@ -52,11 +89,11 @@ type_size(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return glsl_get_components(type);
    case GLSL_TYPE_ARRAY:
-      return type_size(glsl_get_array_element(type)) * glsl_get_length(type);
+      return type_size_scalar(glsl_get_array_element(type)) * glsl_get_length(type);
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < glsl_get_length(type); i++) {
-         size += type_size(glsl_get_struct_field(type, i));
+         size += type_size_scalar(glsl_get_struct_field(type, i));
       }
       return size;
    case GLSL_TYPE_SUBROUTINE:
@@ -78,8 +115,17 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+static unsigned
+type_size(const struct glsl_type *type, bool is_scalar)
+{
+   if (is_scalar)
+      return type_size_scalar(type);
+   else
+      return type_size_vec4(type);
+}
+
 void
-nir_assign_var_locations_scalar(struct exec_list *var_list, unsigned *size)
+nir_assign_var_locations(struct exec_list *var_list, unsigned *size, bool is_scalar)
 {
    unsigned location = 0;
 
@@ -93,7 +139,7 @@ nir_assign_var_locations_scalar(struct exec_list *var_list, unsigned *size)
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *size = location;
@@ -139,10 +185,11 @@ mark_indirect_uses_block(nir_block *block, void *void_state)
  * assigns locations to variables that are used indirectly.
  */
 void
-nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
-                                             struct exec_list *var_list,
-                                             unsigned *direct_size,
-                                             unsigned *size)
+nir_assign_var_locations_direct_first(nir_shader *shader,
+                                      struct exec_list *var_list,
+                                      unsigned *direct_size,
+                                      unsigned *size,
+                                      bool is_scalar)
 {
    struct set *indirect_set = _mesa_set_create(NULL, _mesa_hash_pointer,
                                                _mesa_key_pointer_equal);
@@ -164,7 +211,7 @@ nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *direct_size = location;
@@ -178,7 +225,7 @@ nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *size = location;
@@ -200,7 +247,7 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
 
       if (tail->deref_type == nir_deref_type_array) {
          nir_deref_array *deref_array = nir_deref_as_array(tail);
-         unsigned size = type_size(tail->type);
+         unsigned size = type_size(tail->type, state->is_scalar);
 
          base_offset += size * deref_array->base_offset;
 
@@ -242,7 +289,8 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
          nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
 
          for (unsigned i = 0; i < deref_struct->index; i++)
-            base_offset += type_size(glsl_get_struct_field(parent_type, i));
+            base_offset += type_size(glsl_get_struct_field(parent_type, i),
+                                     state->is_scalar);
       }
    }
 
@@ -355,11 +403,12 @@ nir_lower_io_block(nir_block *block, void *void_state)
 }
 
 static void
-nir_lower_io_impl(nir_function_impl *impl)
+nir_lower_io_impl(nir_function_impl *impl, bool is_scalar)
 {
    struct lower_io_state state;
 
    state.mem_ctx = ralloc_parent(impl);
+   state.is_scalar = is_scalar;
 
    nir_foreach_block(impl, nir_lower_io_block, &state);
 
@@ -368,10 +417,10 @@ nir_lower_io_impl(nir_function_impl *impl)
 }
 
 void
-nir_lower_io(nir_shader *shader)
+nir_lower_io(nir_shader *shader, bool is_scalar)
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         nir_lower_io_impl(overload->impl);
+         nir_lower_io_impl(overload->impl, is_scalar);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 4aa893aff50..8700cb71ad4 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -101,13 +101,15 @@ brw_create_nir(struct brw_context *brw,
    /* Get rid of split copies */
    nir_optimize(nir);
 
-   nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
-                                                &nir->num_direct_uniforms,
-                                                &nir->num_uniforms);
-   nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs);
-   nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs);
+   nir_assign_var_locations_direct_first(nir, &nir->uniforms,
+                                         &nir->num_direct_uniforms,
+                                         &nir->num_uniforms,
+                                         is_scalar);
+   nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar);
+   nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
+
+   nir_lower_io(nir, is_scalar);
 
-   nir_lower_io(nir);
    nir_validate_shader(nir);
 
    nir_remove_dead_variables(nir);

From 6e58fc56a5a396020cd299db11895120ec3da520 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 3 Jul 2015 08:23:33 +0200
Subject: [PATCH 0892/1208] i965/nir: Dot not assign direct uniform locations
 first for vec4-based shaders

In the vec4 backend we want uniform locations to be assigned consecutively
since that way the offsets produced by nir_lower_io are exactly what we
need to implement nir_intrinsic_load_uniform. Otherwise we would need a
mapping to match the output of nir_lower_io to the actual uniform registers
we need to use.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 8700cb71ad4..d81d82323bb 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -101,10 +101,16 @@ brw_create_nir(struct brw_context *brw,
    /* Get rid of split copies */
    nir_optimize(nir);
 
-   nir_assign_var_locations_direct_first(nir, &nir->uniforms,
-                                         &nir->num_direct_uniforms,
-                                         &nir->num_uniforms,
-                                         is_scalar);
+   if (is_scalar) {
+      nir_assign_var_locations_direct_first(nir, &nir->uniforms,
+                                            &nir->num_direct_uniforms,
+                                            &nir->num_uniforms,
+                                            is_scalar);
+   } else {
+      nir_assign_var_locations(&nir->uniforms,
+                               &nir->num_uniforms,
+                               is_scalar);
+   }
    nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar);
    nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
 

From a5a3287f7392356386aa305c791d94b6d5dde6cc Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 20:53:28 +0200
Subject: [PATCH 0893/1208] i965/vec4: Add auxiliary func to build a writemask
 from a component size

New method brw_writemask_for_size() will return a writemask with the first
'size' components activated.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_reg.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 4867148c12f..31806f769bd 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -950,6 +950,12 @@ brw_set_writemask(struct brw_reg reg, unsigned mask)
    return reg;
 }
 
+static inline unsigned
+brw_writemask_for_size(unsigned n)
+{
+   return (1 << n) - 1;
+}
+
 static inline struct brw_reg
 negate(struct brw_reg reg)
 {

From f7152525374015594e037fa11bb64e1c7174829b Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 1 Jul 2015 16:10:49 +0200
Subject: [PATCH 0894/1208] i965/nir/vec4: Implement load_const intrinsic

Similar to fs_nir backend, a nir_local_values map will be filled with
newly allocated registers as the load_const instrinsic instructions are
processed. Later, get_nir_src() will fetch the registers from this map
for sources that are ssa.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c        |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4.h       |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 19 ++++++++++++++++++-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index d81d82323bb..a4b65d24e27 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -165,7 +165,7 @@ brw_create_nir(struct brw_context *brw,
       nir_print_shader(nir, stderr);
    }
 
-   nir_convert_from_ssa(nir, true);
+   nir_convert_from_ssa(nir, is_scalar);
    nir_validate_shader(nir);
 
    /* This is the last pass we run before we start emitting stuff.  It
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 01f88e641e7..f24d74438ad 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -415,6 +415,7 @@ public:
                                               const glsl_type *type) = 0;
 
    dst_reg *nir_locals;
+   dst_reg *nir_ssa_values;
    src_reg *nir_inputs;
    unsigned *nir_uniform_driver_location;
    dst_reg *nir_system_values;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index f8bcbb7cd42..96e5e7c66e4 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -250,6 +250,8 @@ vec4_visitor::nir_emit_impl(nir_function_impl *impl)
       nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems));
    }
 
+   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
+
    nir_emit_cf_list(&impl->body);
 }
 
@@ -332,7 +334,22 @@ vec4_visitor::nir_emit_instr(nir_instr *instr)
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
-   /* @TODO: Not yet implemented */
+   dst_reg reg = dst_reg(GRF, alloc.allocate(1));
+   reg.type =  BRW_REGISTER_TYPE_F;
+
+   /* @FIXME: consider emitting vector operations to save some MOVs in
+    * cases where the components are representable in 8 bits.
+    * By now, we emit a MOV for each component.
+    */
+   for (unsigned i = 0; i < instr->def.num_components; ++i) {
+      reg.writemask = 1 << i;
+      emit(MOV(reg, src_reg(instr->value.f[i])));
+   }
+
+   /* Set final writemask */
+   reg.writemask = brw_writemask_for_size(instr->def.num_components);
+
+   nir_ssa_values[instr->def.index] = reg;
 }
 
 void

From 97e205fd35bf77fd761caf24c611ff72cc0d85e2 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Fri, 17 Apr 2015 18:10:50 +0200
Subject: [PATCH 0895/1208] i965/nir: Move brw_type_for_nir_type() to brw_nir
 to allow reuse

Upcoming NIR->vec4 pass can benefit from this method, so lets move it up.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 18 ------------------
 src/mesa/drivers/dri/i965/brw_nir.c      | 18 ++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_nir.h      |  3 +++
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 9cb7b0db3f6..b51fe0e5eba 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -476,24 +476,6 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
    }
 }
 
-static brw_reg_type
-brw_type_for_nir_type(nir_alu_type type)
-{
-   switch (type) {
-   case nir_type_unsigned:
-      return BRW_REGISTER_TYPE_UD;
-   case nir_type_bool:
-   case nir_type_int:
-      return BRW_REGISTER_TYPE_D;
-   case nir_type_float:
-      return BRW_REGISTER_TYPE_F;
-   default:
-      unreachable("unknown type");
-   }
-
-   return BRW_REGISTER_TYPE_F;
-}
-
 bool
 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                          const fs_reg &result)
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index a4b65d24e27..b241121dac7 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -186,3 +186,21 @@ brw_create_nir(struct brw_context *brw,
 
    return nir;
 }
+
+enum brw_reg_type
+brw_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_unsigned:
+      return BRW_REGISTER_TYPE_UD;
+   case nir_type_bool:
+   case nir_type_int:
+      return BRW_REGISTER_TYPE_D;
+   case nir_type_float:
+      return BRW_REGISTER_TYPE_F;
+   default:
+      unreachable("unknown type");
+   }
+
+   return BRW_REGISTER_TYPE_F;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index c76defd86ca..6152321521b 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -24,6 +24,7 @@
 #pragma once
 
 #include "brw_context.h"
+#include "brw_reg.h"
 #include "glsl/nir/nir.h"
 
 #ifdef __cplusplus
@@ -80,6 +81,8 @@ nir_shader *brw_create_nir(struct brw_context *brw,
                            gl_shader_stage stage,
                            bool is_scalar);
 
+enum brw_reg_type brw_type_for_nir_type(nir_alu_type type);
+
 #ifdef __cplusplus
 }
 #endif

From f3187ea31ede6bc181ee561573d127aa2e485657 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 17:43:02 +0200
Subject: [PATCH 0896/1208] i965/nir/vec4: Add get_nir_dst() and get_nir_src()
 methods

These methods are essential for the implementation of the NIR->vec4 pass. They
work similar to their fs_nir counter-parts.

When processing instructions, these methods are invoked to resolve the
brw registers (source or destination) corresponding to the NIR sources
or destination. It uses the map of NIR register index to brw register for
all registers locally allocated in a block.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h       | 10 +++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 73 ++++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index f24d74438ad..b6ae926c8e2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -411,6 +411,16 @@ public:
    virtual void nir_emit_jump(nir_jump_instr *instr);
    virtual void nir_emit_texture(nir_tex_instr *instr);
 
+   dst_reg get_nir_dest(nir_dest dest, enum brw_reg_type type);
+   dst_reg get_nir_dest(nir_dest dest, nir_alu_type type);
+   dst_reg get_nir_dest(nir_dest dest);
+   src_reg get_nir_src(nir_src src, enum brw_reg_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(nir_src src, nir_alu_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(nir_src src,
+                       unsigned num_components = 4);
+
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type) = 0;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 96e5e7c66e4..a3dfddf7d3c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -331,6 +331,79 @@ vec4_visitor::nir_emit_instr(nir_instr *instr)
    }
 }
 
+static dst_reg
+dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
+                    unsigned base_offset, nir_src *indirect)
+{
+   dst_reg reg;
+
+   reg = v->nir_locals[nir_reg->index];
+   reg = offset(reg, base_offset);
+   if (indirect) {
+      reg.reladdr =
+         new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
+                                                BRW_REGISTER_TYPE_D,
+                                                1));
+   }
+   return reg;
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest)
+{
+   assert(!dest.is_ssa);
+   return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
+                              dest.reg.indirect);
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest, enum brw_reg_type type)
+{
+   return retype(get_nir_dest(dest), type);
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest, nir_alu_type type)
+{
+   return get_nir_dest(dest, brw_type_for_nir_type(type));
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, enum brw_reg_type type,
+                          unsigned num_components)
+{
+   dst_reg reg;
+
+   if (src.is_ssa) {
+      assert(src.ssa != NULL);
+      reg = nir_ssa_values[src.ssa->index];
+   }
+   else {
+     reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                               src.reg.indirect);
+   }
+
+   reg = retype(reg, type);
+
+   src_reg reg_as_src = src_reg(reg);
+   reg_as_src.swizzle = brw_swizzle_for_size(num_components);
+   return reg_as_src;
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, nir_alu_type type,
+                          unsigned num_components)
+{
+   return get_nir_src(src, brw_type_for_nir_type(type), num_components);
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
+{
+   /* if type is not specified, default to signed int */
+   return get_nir_src(src, nir_type_int, num_components);
+}
+
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {

From 5c0436dbf87fef76ba67456f215d9285c38f1816 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 16 Jun 2015 20:16:15 +0200
Subject: [PATCH 0897/1208] i965/nir/vec4: Implement conditional statements
 (nir_cf_node_if)

The same we do in the FS NIR backend, only that here we need to consider
the number of components in the condition and adjust the swizzle
accordingly.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index a3dfddf7d3c..7ce571a2261 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -282,7 +282,21 @@ vec4_visitor::nir_emit_cf_list(exec_list *list)
 void
 vec4_visitor::nir_emit_if(nir_if *if_stmt)
 {
-   /* @TODO: Not yet implemented */
+   /* First, put the condition in f0 */
+   src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
+   vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   emit(IF(BRW_PREDICATE_NORMAL));
+
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* note: if the else is empty, dead CF elimination will remove it */
+   emit(BRW_OPCODE_ELSE);
+
+   nir_emit_cf_list(&if_stmt->else_list);
+
+   emit(BRW_OPCODE_ENDIF);
 }
 
 void

From afe085a0ca01f659c69456018e5f5076c9dde47d Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 20:25:55 +0200
Subject: [PATCH 0898/1208] i965/nir/vec4: Implement loop statements
 (nir_cf_node_loop)

This is taken as-is from fs_nir.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 7ce571a2261..763c69a9521 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -302,7 +302,11 @@ vec4_visitor::nir_emit_if(nir_if *if_stmt)
 void
 vec4_visitor::nir_emit_loop(nir_loop *loop)
 {
-   /* @TODO: Not yet implemented */
+   emit(BRW_OPCODE_DO);
+
+   nir_emit_cf_list(&loop->body);
+
+   emit(BRW_OPCODE_WHILE);
 }
 
 void

From 167cb9663adc8c7c61807e503f66e85f955e7d5f Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 21:24:21 +0200
Subject: [PATCH 0899/1208] i965/nir/vec4: Implement load_input intrinsic

The source register is fetched from the nir_inputs map built during
nir_setup_inputs stage.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 763c69a9521..5bf1dbb7881 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -446,13 +446,31 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 void
 vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
+   dst_reg dest;
+   src_reg src;
+
+   bool has_indirect = false;
+
    switch (instr->intrinsic) {
 
    case nir_intrinsic_load_input_indirect:
+      has_indirect = true;
       /* fallthrough */
-   case nir_intrinsic_load_input:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_input: {
+      int offset = instr->const_index[0];
+      src = nir_inputs[offset];
+
+      if (has_indirect) {
+         dest.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[0],
+                                                         BRW_REGISTER_TYPE_D,
+                                                         1));
+      }
+      dest = get_nir_dest(instr->dest, src.type);
+      dest.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit(MOV(dest, src));
       break;
+   }
 
    case nir_intrinsic_store_output_indirect:
       /* fallthrough */

From 11ed02e1c81a2aa71b22b1d6847f58e41fd89271 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 21 Jul 2015 20:21:21 +0200
Subject: [PATCH 0900/1208] i965/vec4: Make sure that register types always
 match during emit_urb_slot()

Instead of relying on backends (currently vec4_visitor and soon NIR-vec4) to
store registers in output_reg with the correct type, this patch makes sure
that the common code in emit_urb_slot() always emit MOVs from output registers
using the same type on source and destination.

Since the actual type is not important, only that they match, we default to
float.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 17604cdaecc..8fa1e8d2d48 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1056,8 +1056,6 @@ vec4_visitor::visit(ir_variable *ir)
       for (int i = 0; i < type_size(ir->type); i++) {
 	 output_reg[ir->data.location + i] = *reg;
 	 output_reg[ir->data.location + i].reg_offset = i;
-	 output_reg[ir->data.location + i].type =
-            brw_type_for_base_type(ir->type->get_scalar_type());
 	 output_reg_annotation[ir->data.location + i] = ir->name;
       }
       break;
@@ -3103,6 +3101,7 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
          vec4_instruction *inst;
          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
          inst->predicate = BRW_PREDICATE_NORMAL;
+         output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
@@ -3115,18 +3114,23 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
          dst_reg reg_w = reg;
          reg_w.writemask = WRITEMASK_W;
-         emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
+         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
+         reg_as_src.type = reg_w.type;
+         reg_as_src.swizzle = brw_swizzle_for_size(1);
+         emit(MOV(reg_w, reg_as_src));
       }
       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
          dst_reg reg_y = reg;
          reg_y.writemask = WRITEMASK_Y;
          reg_y.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
       }
       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
          dst_reg reg_z = reg;
          reg_z.writemask = WRITEMASK_Z;
          reg_z.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
       }
    }
@@ -3164,8 +3168,8 @@ vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
 vec4_instruction *
 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
 {
-   assert (varying < VARYING_SLOT_MAX);
-   reg.type = output_reg[varying].type;
+   assert(varying < VARYING_SLOT_MAX);
+   assert(output_reg[varying].type == reg.type);
    current_annotation = output_reg_annotation[varying];
    /* Copy the register, saturating if necessary */
    return emit(MOV(reg, src_reg(output_reg[varying])));
@@ -3175,6 +3179,7 @@ void
 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
 {
    reg.type = BRW_REGISTER_TYPE_F;
+   output_reg[varying].type = reg.type;
 
    switch (varying) {
    case VARYING_SLOT_PSIZ:

From 662c4c99065381b8e265310d176cfdef6698ca57 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Tue, 16 Jun 2015 21:31:49 +0200
Subject: [PATCH 0901/1208] i965/nir/vec4: Implement store_output intrinsic

This implementation is based on the current URB setup in vec4_visitor, which
requires the output register to be stored in the output_reg array at variable's
original shader location index. But since nir_lower_io() pass uses the value
in var->data.driver_location, we need to put there var->data.location instead,
prior to calling nir_lower_io(), so that we end up with the correct index
in const_index[0].

The driver_location is not used at all, so this patch also disables the
nir_assign_var_locations pass on non-scalar shaders.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c        |  5 ++++-
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index b241121dac7..4f7335024a4 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -106,13 +106,16 @@ brw_create_nir(struct brw_context *brw,
                                             &nir->num_direct_uniforms,
                                             &nir->num_uniforms,
                                             is_scalar);
+      nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
    } else {
       nir_assign_var_locations(&nir->uniforms,
                                &nir->num_uniforms,
                                is_scalar);
+
+      foreach_list_typed(nir_variable, var, node, &nir->outputs)
+         var->data.driver_location = var->data.location;
    }
    nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar);
-   nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
 
    nir_lower_io(nir, is_scalar);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 5bf1dbb7881..696122d725e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -473,10 +473,23 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_store_output_indirect:
+      has_indirect = true;
       /* fallthrough */
-   case nir_intrinsic_store_output:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_store_output: {
+      int varying = instr->const_index[0];
+
+      src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
+                        instr->num_components);
+      dest = dst_reg(src);
+
+      if (has_indirect) {
+         dest.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[1],
+                                                         BRW_REGISTER_TYPE_D,
+                                                         1));
+      }
+      output_reg[varying] = dest;
       break;
+   }
 
    case nir_intrinsic_load_vertex_id:
       unreachable("should be lowered by lower_vertex_id()");

From e76e8caecd30799500357a45468329f033a93932 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 16 Jun 2015 21:36:49 +0200
Subject: [PATCH 0902/1208] i965/nir/vec4: Implement intrinsics that load
 system values

These include:

nir_intrinsic_load_vertex_id_zero_base
nir_intrinsic_load_base_vertex
nir_intrinsic_load_instance_id

The source register is fetched from the nir_system_values map initialized
during nir_setup_system_values stage.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 27 +++++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 696122d725e..d9af945e2e4 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -494,17 +494,32 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_vertex_id:
       unreachable("should be lowered by lower_vertex_id()");
 
-   case nir_intrinsic_load_vertex_id_zero_base:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_vertex_id_zero_base: {
+      src_reg vertex_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]);
+      assert(vertex_id.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, vertex_id.type);
+      emit(MOV(dest, vertex_id));
       break;
+   }
 
-   case nir_intrinsic_load_base_vertex:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_base_vertex: {
+      src_reg base_vertex =
+         src_reg(nir_system_values[SYSTEM_VALUE_BASE_VERTEX]);
+      assert(base_vertex.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, base_vertex.type);
+      emit(MOV(dest, base_vertex));
       break;
+   }
 
-   case nir_intrinsic_load_instance_id:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_instance_id: {
+      src_reg instance_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_INSTANCE_ID]);
+      assert(instance_id.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, instance_id.type);
+      emit(MOV(dest, instance_id));
       break;
+   }
 
    case nir_intrinsic_load_uniform_indirect:
       /* fallthrough */

From e6cafb5dfdef8d8d25ee1e3375304cf35897d1f7 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 16 Jun 2015 21:55:14 +0200
Subject: [PATCH 0903/1208] i965/nir/vec4: Implement load_uniform intrinsic

For the indirect case we need to take the index delivered by
NIR and compute the parent uniform that we are accessing (the one
that we uploaded to a surface) and the constant offset into that
surface.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 26 ++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index d9af945e2e4..ff218a50807 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -522,10 +522,32 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_load_uniform_indirect:
+      has_indirect = true;
       /* fallthrough */
-   case nir_intrinsic_load_uniform:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_uniform: {
+      int uniform = instr->const_index[0];
+
+      dest = get_nir_dest(instr->dest);
+
+      if (has_indirect) {
+         /* Split addressing into uniform and offset */
+         int offset = uniform - nir_uniform_driver_location[uniform];
+         assert(offset >= 0);
+
+         uniform -= offset;
+         assert(uniform >= 0);
+
+         src = src_reg(dst_reg(UNIFORM, uniform));
+         src.reg_offset = offset;
+         src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
+         src.reladdr = new(mem_ctx) src_reg(tmp);
+      } else {
+         src = src_reg(dst_reg(UNIFORM, uniform));
+      }
+
+      emit(MOV(dest, src));
       break;
+   }
 
    case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_atomic_counter_inc:

From 98d07022f5312967bdfd54069869c8d6c65117a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 16 Jun 2015 22:03:17 +0200
Subject: [PATCH 0904/1208] i965/nir/vec4: Implement atomic counter intrinsics
 (read, inc and dec)

The implementation is based on its fs_nir counterpart.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 27 ++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index ff218a50807..497732e92df 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -551,9 +551,32 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_atomic_counter_inc:
-   case nir_intrinsic_atomic_counter_dec:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_atomic_counter_dec: {
+      unsigned surf_index = prog_data->base.binding_table.abo_start +
+         (unsigned) instr->const_index[0];
+      src_reg offset = get_nir_src(instr->src[0], nir_type_int,
+                                   instr->num_components);
+      dest = get_nir_dest(instr->dest);
+
+      switch (instr->intrinsic) {
+         case nir_intrinsic_atomic_counter_inc:
+            emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
+                                src_reg(), src_reg());
+            break;
+         case nir_intrinsic_atomic_counter_dec:
+            emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
+                                src_reg(), src_reg());
+            break;
+         case nir_intrinsic_atomic_counter_read:
+            emit_untyped_surface_read(surf_index, dest, offset);
+            break;
+         default:
+            unreachable("Unreachable");
+      }
+
+      brw_mark_surface_used(stage_prog_data, surf_index);
       break;
+   }
 
    case nir_intrinsic_load_ubo_indirect:
       /* fallthrough */

From 168bbfa6ff22a586ad6307c187cfa3b8fff5f227 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 22:10:32 +0200
Subject: [PATCH 0905/1208] i965/nir/vec4: Implement loading values from an UBO

Based on the vec4_visitor IR implementation for the ir_binop_load_ubo
operation. Notice that unlike the vec4_visitor IR, adding the !=0
comparison for UBO bools is not needed here because that comparison is
already added by the nir_visitor when processing the ir_binop_load_ubo
(in UBOs "true" is any value different from zero, but for us is ~0).

Adds NIR instrinsics:

   * nir_intrinsic_load_ubo_indirect
   * nir_intrinsic_load_ubo

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 61 +++++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 497732e92df..bc5ebbea98e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -579,10 +579,67 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_load_ubo_indirect:
+      has_indirect = true;
       /* fallthrough */
-   case nir_intrinsic_load_ubo:
-      /* @TODO: Not yet implemented */
+   case nir_intrinsic_load_ubo: {
+      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
+      src_reg surf_index;
+
+      dest = get_nir_dest(instr->dest);
+
+      if (const_block_index) {
+         /* The block index is a constant, so just emit the binding table entry
+          * as an immediate.
+          */
+         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
+                              const_block_index->u[0]);
+      } else {
+         /* The block index is not a constant. Evaluate the index expression
+          * per-channel and add the base UBO index; we have to select a value
+          * from any live channel.
+          */
+         surf_index = src_reg(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int,
+                                                   instr->num_components),
+                  src_reg(prog_data->base.binding_table.ubo_start)));
+         surf_index = emit_uniformize(surf_index);
+
+         /* Assume this may touch any UBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.ubo_start +
+                               shader_prog->NumUniformBlocks - 1);
+      }
+
+      unsigned const_offset = instr->const_index[0];
+      src_reg offset;
+
+      if (!has_indirect)  {
+         offset = src_reg(const_offset / 16);
+      } else {
+         offset = src_reg(this, glsl_type::uint_type);
+         emit(SHR(dst_reg(offset), get_nir_src(instr->src[1], nir_type_int, 1),
+                  src_reg(4u)));
+      }
+
+      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
+      packed_consts.type = dest.type;
+
+      emit_pull_constant_load_reg(dst_reg(packed_consts),
+                                  surf_index,
+                                  offset,
+                                  NULL, NULL /* before_block/inst */);
+
+      packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
+      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
+                                            const_offset % 16 / 4,
+                                            const_offset % 16 / 4,
+                                            const_offset % 16 / 4);
+
+      emit(MOV(dest, packed_consts));
       break;
+   }
 
    default:
       unreachable("Unknown intrinsic");

From ef1b30ae637e613b384541324c199d2dbe6b44bd Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 22:30:16 +0200
Subject: [PATCH 0906/1208] i965/nir/vec4: Prepare source and destination
 registers for ALU operations

This patch resolves and initializes the destination and the source
registers that are common to most ALU operations.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index bc5ebbea98e..46bfd3b81d9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -646,10 +646,27 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 }
 
+static unsigned
+brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
+{
+   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+}
+
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
-   /* @TODO: Not yet implemented */
+   dst_reg dst = get_nir_dest(instr->dest.dest,
+                              nir_op_infos[instr->op].output_type);
+   dst.writemask = instr->dest.write_mask;
+
+   src_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(instr->src[i].src,
+                          nir_op_infos[instr->op].input_types[i], 4);
+      op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
+      op[i].abs = instr->src[i].abs;
+      op[i].negate = instr->src[i].negate;
+   }
 }
 
 void

From 9e5d827f455f3c72af6cb8d60b97890bab8d5ad0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 25 Jun 2015 09:52:35 +0200
Subject: [PATCH 0907/1208] i965/nir: Disable alu_to_scalar pass on non-scalar
 shaders

Disables nir_lower_alu_to_scalar when the shader stage being processed work
on vec4 vectors, like the upcoming NIR->vec4 backend.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 4f7335024a4..a838ccaed83 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -27,15 +27,19 @@
 #include "program/prog_to_nir.h"
 
 static void
-nir_optimize(nir_shader *nir)
+nir_optimize(nir_shader *nir, bool is_scalar)
 {
    bool progress;
    do {
       progress = false;
       nir_lower_vars_to_ssa(nir);
       nir_validate_shader(nir);
-      nir_lower_alu_to_scalar(nir);
-      nir_validate_shader(nir);
+
+      if (is_scalar) {
+         nir_lower_alu_to_scalar(nir);
+         nir_validate_shader(nir);
+      }
+
       progress |= nir_copy_prop(nir);
       nir_validate_shader(nir);
       nir_lower_phis_to_scalar(nir);
@@ -92,14 +96,14 @@ brw_create_nir(struct brw_context *brw,
    nir_split_var_copies(nir);
    nir_validate_shader(nir);
 
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
    /* Lower a bunch of stuff */
    nir_lower_var_copies(nir);
    nir_validate_shader(nir);
 
    /* Get rid of split copies */
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
    if (is_scalar) {
       nir_assign_var_locations_direct_first(nir, &nir->uniforms,
@@ -135,7 +139,7 @@ brw_create_nir(struct brw_context *brw,
    nir_lower_atomics(nir);
    nir_validate_shader(nir);
 
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
    if (brw->gen >= 6) {
       /* Try and fuse multiply-adds */

From 79154d99d6e760b1daf327b4594dded18f1d4191 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 22:52:29 +0200
Subject: [PATCH 0908/1208] i965/nir/vec4: Implement single-element "mov"
 operations

Adds NIR ALU operations:
   * nir_op_imov
   * nir_op_fmov

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 46bfd3b81d9..74efc576c56 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -655,6 +655,8 @@ brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
+   vec4_instruction *inst;
+
    dst_reg dst = get_nir_dest(instr->dest.dest,
                               nir_op_infos[instr->op].output_type);
    dst.writemask = instr->dest.write_mask;
@@ -667,6 +669,17 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       op[i].abs = instr->src[i].abs;
       op[i].negate = instr->src[i].negate;
    }
+
+   switch (instr->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   default:
+      unreachable("Unimplemented ALU operation");
+   }
 }
 
 void

From e4f02f47e70d384531ac68e6d33a62fdcdbd1f28 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 22:58:15 +0200
Subject: [PATCH 0909/1208] i965/nir/vec4: Lower "vecN" instructions and mark
 them unreachable

This enables NIR pass "lower_vec_to_movs" on shaders that work on vec4.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c        | 5 +++++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index a838ccaed83..beb7d6912c8 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -175,6 +175,11 @@ brw_create_nir(struct brw_context *brw,
    nir_convert_from_ssa(nir, is_scalar);
    nir_validate_shader(nir);
 
+   if (!is_scalar) {
+      nir_lower_vec_to_movs(nir);
+      nir_validate_shader(nir);
+   }
+
    /* This is the last pass we run before we start emitting stuff.  It
     * determines when we need to insert boolean resolves on Gen <= 5.  We
     * run it last because it stashes data in instr->pass_flags and we don't
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 74efc576c56..2829bbc111c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -677,6 +677,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4:
+      unreachable("not reached: should be handled by lower_vec_to_movs()");
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 4f39b547da4f9949d1b1f9f0df07d08951f0358d Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 23:04:32 +0200
Subject: [PATCH 0910/1208] i965/nir/vec4: Implement int<->float format
 conversion ops

Adds NIR ALU operations:
   * nir_op_f2i
   * nir_op_f2u
   * nir_op_i2f
   * nir_op_u2f

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 2829bbc111c..e1ada3bc855 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -682,6 +682,17 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_vec4:
       unreachable("not reached: should be handled by lower_vec_to_movs()");
 
+   case nir_op_i2f:
+   case nir_op_u2f:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_f2i:
+   case nir_op_f2u:
+      inst = emit(MOV(dst, op[0]));
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 0675842b56a956befbac4a3b912823e73a95a500 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 23:48:46 +0200
Subject: [PATCH 0911/1208] i965/nir/vec4: Implement the addition operation

Adds NIR ALU operations:
   * nir_op_fadd
   * nir_op_iadd

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index e1ada3bc855..db136c3e7ab 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -693,6 +693,13 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst = emit(MOV(dst, op[0]));
       break;
 
+   case nir_op_fadd:
+      /* fall through */
+   case nir_op_iadd:
+      inst = emit(ADD(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 9acebf146184c35e6897b91fff414c5295d47996 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 16 Jun 2015 23:50:46 +0200
Subject: [PATCH 0912/1208] i965/nir/vec4: Implement multiplication

Implementation based on the vec4_visitor IR implementation
for the operations ir_binop_mul and ir_binop_imul_high.

Adds NIR ALU operations:
   * nir_op_fmul
   * nir_op_imul
   * nir_op_imul_high
   * nir_op_umul_high

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index db136c3e7ab..1455ebac965 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -700,6 +700,50 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fmul:
+      inst = emit(MUL(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imul: {
+      nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+      nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+      /* For integer multiplication, the MUL uses the low 16 bits of one of
+       * the operands (src0 through SNB, src1 on IVB and later). The MACH
+       * accumulates in the contribution of the upper 16 bits of that
+       * operand. If we can determine that one of the args is in the low
+       * 16 bits, though, we can just emit a single MUL.
+       */
+      if (value0 && value0->u[0] < (1 << 16)) {
+         if (devinfo->gen < 7)
+            emit(MUL(dst, op[0], op[1]));
+         else
+            emit(MUL(dst, op[1], op[0]));
+      } else if (value1 && value1->u[0] < (1 << 16)) {
+         if (devinfo->gen < 7)
+            emit(MUL(dst, op[1], op[0]));
+         else
+            emit(MUL(dst, op[0], op[1]));
+      } else {
+         struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+         emit(MUL(acc, op[0], op[1]));
+         emit(MACH(dst_null_d(), op[0], op[1]));
+         emit(MOV(dst, src_reg(acc)));
+      }
+      break;
+   }
+
+   case nir_op_imul_high:
+   case nir_op_umul_high: {
+      struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+      emit(MUL(acc, op[0], op[1]));
+      emit(MACH(dst, op[0], op[1]));
+      break;
+   }
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 068a41b349e8bc30293c44d96553184f7562949f Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:04:09 +0200
Subject: [PATCH 0913/1208] i965/vec4: Return the last emitted instruction in
 emit_math()

Needed in the NIR backend to set the "saturate" value of the
instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 5 +++--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index b6ae926c8e2..285e2ee8401 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -321,8 +321,9 @@ public:
 
    src_reg fix_3src_operand(src_reg src);
 
-   void emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
-                  const src_reg &src1 = src_reg());
+   vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                               const src_reg &src1 = src_reg());
+
    src_reg fix_math_operand(src_reg src);
 
    void emit_pack_half_2x16(dst_reg dst, src_reg src0);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 8fa1e8d2d48..fa979319374 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -338,7 +338,7 @@ vec4_visitor::fix_math_operand(src_reg src)
    return src_reg(expanded);
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_math(enum opcode opcode,
                         const dst_reg &dst,
                         const src_reg &src0, const src_reg &src1)
@@ -350,11 +350,13 @@ vec4_visitor::emit_math(enum opcode opcode,
       /* MATH on Gen6 must be align1, so we can't do writemasks. */
       math->dst = dst_reg(this, glsl_type::vec4_type);
       math->dst.type = dst.type;
-      emit(MOV(dst, src_reg(math->dst)));
+      math = emit(MOV(dst, src_reg(math->dst)));
    } else if (devinfo->gen < 6) {
       math->base_mrf = 1;
       math->mlen = src1.file == BAD_FILE ? 1 : 2;
    }
+
+   return math;
 }
 
 void

From 62cef7b0723ad6ca49ed06a6899a5852e41359e8 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:10:18 +0200
Subject: [PATCH 0914/1208] i965/nir/vec4: Implement more math operations

Adds NIR ALU operations:
   * nir_op_frcp
   * nir_op_fexp2
   * nir_op_flog2
   * nir_op_fexp
   * nir_op_flog
   * nir_op_fsin
   * nir_op_fcos
   * nir_op_idiv
   * nir_op_udiv
   * nir_op_umod
   * nir_op_ldexp
   * nir_op_fsqrt
   * nir_op_frsq
   * nir_op_fpow

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 52 ++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 1455ebac965..dff39709ab0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -744,6 +744,58 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_frcp:
+      inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fexp2:
+      inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flog2:
+      inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fsin:
+      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fcos:
+      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_idiv:
+   case nir_op_udiv:
+      emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
+      break;
+
+   case nir_op_umod:
+      emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+      break;
+
+   case nir_op_ldexp:
+      unreachable("not reached: should be handled by ldexp_to_arith()");
+
+   case nir_op_fsqrt:
+      inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_frsq:
+      inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fpow:
+      inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 0ce159ec7fbcdf00c488b77f63e565e89ef6cab5 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:22:14 +0200
Subject: [PATCH 0915/1208] i965/nir/vec4: Implement carry/borrow for
 addition/subtraction

Adds NIR ALU operations:
   * nir_op_uadd_carry
   * nir_op_usub_borrow

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index dff39709ab0..7576bbd455b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -796,6 +796,22 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_uadd_carry: {
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(ADDC(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
+   case nir_op_usub_borrow: {
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(SUBB(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 7553a51a68c0b2030265fe741f9c511b65047914 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:25:02 +0200
Subject: [PATCH 0916/1208] i965/nir/vec4: Implement various rounding functions

Adds NIR ALU operations:
   * nir_op_ftrunc
   * nir_op_fceil
   * nir_op_ffloor
   * nir_op_ffrac
   * nir_op_fround_even

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 7576bbd455b..ab6335a8546 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -812,6 +812,41 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_ftrunc:
+      inst = emit(RNDZ(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fceil: {
+      src_reg tmp = src_reg(this, glsl_type::float_type);
+      tmp.swizzle =
+         brw_swizzle_for_size(instr->src[0].src.is_ssa ?
+                              instr->src[0].src.ssa->num_components :
+                              instr->src[0].src.reg.reg->num_components);
+
+      op[0].negate = !op[0].negate;
+      emit(RNDD(dst_reg(tmp), op[0]));
+      tmp.negate = true;
+      inst = emit(MOV(dst, tmp));
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
+   case nir_op_ffloor:
+      inst = emit(RNDD(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_ffract:
+      inst = emit(FRC(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fround_even:
+      inst = emit(RNDE(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From d53098393e3929b0c8d82f56144c7497b184f5b7 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:32:58 +0200
Subject: [PATCH 0917/1208] i965/vec4: Return the emitted instruction in
 emit_minmax()

Needed in the NIR backend to set the "saturate" value of the
instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 4 ++--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 285e2ee8401..90e5753e34c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -291,8 +291,8 @@ public:
    void emit_bool_to_cond_code(ir_rvalue *ir, enum brw_predicate *predicate);
    void emit_if_gen6(ir_if *ir);
 
-   void emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
-                    src_reg src0, src_reg src1);
+   vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+                                 src_reg src0, src_reg src1);
 
    void emit_lrp(const dst_reg &dst,
                  const src_reg &x, const src_reg &y, const src_reg &a);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index fa979319374..6865df7e0df 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1266,7 +1266,7 @@ vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
    return true;
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                           src_reg src0, src_reg src1)
 {
@@ -1281,6 +1281,8 @@ vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
       inst->predicate = BRW_PREDICATE_NORMAL;
    }
+
+   return inst;
 }
 
 void

From 5e6f1c38a591fa39cff1c32a2cfdda927145756a Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:34:57 +0200
Subject: [PATCH 0918/1208] i965/nir/vec4: Implement min/max operations

Adds NIR ALU operations:
   * nir_op_fmin
   * nir_op_imin
   * nir_op_umin
   * nir_op_fmax
   * nir_op_imax
   * nir_op_umax

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index ab6335a8546..584ea5b2299 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -847,6 +847,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fmin:
+   case nir_op_imin:
+   case nir_op_umin:
+      inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmax:
+   case nir_op_imax:
+   case nir_op_umax:
+      inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From dae6025e8efdfb759458a3243c8cd1588f485135 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Tue, 14 Apr 2015 12:04:24 +0200
Subject: [PATCH 0919/1208] i965/nir/vec4: Derivatives are not allowed in VS

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 584ea5b2299..6ae387a8852 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -861,6 +861,14 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fddx:
+   case nir_op_fddx_coarse:
+   case nir_op_fddx_fine:
+   case nir_op_fddy:
+   case nir_op_fddy_coarse:
+   case nir_op_fddy_fine:
+      unreachable("derivatives are not valid in vertex shaders");
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From b9c41affcf67f30d7f6c74c17ea34bc42756d56d Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Fri, 17 Apr 2015 17:58:35 +0200
Subject: [PATCH 0920/1208] i965/nir: Add utility method for comparisons

This method returns the brw_conditional_mod value used when emitting
comparative ALU operations.

It could be moved to brw_nir in the future to reuse it in fs_nir backend.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 39 ++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 6ae387a8852..77df2842e6e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -652,6 +652,45 @@ brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
    return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
 }
 
+static enum brw_conditional_mod
+brw_conditional_for_nir_comparison(nir_op op)
+{
+   switch (op) {
+   case nir_op_flt:
+   case nir_op_ilt:
+   case nir_op_ult:
+      return BRW_CONDITIONAL_L;
+
+   case nir_op_fge:
+   case nir_op_ige:
+   case nir_op_uge:
+      return BRW_CONDITIONAL_GE;
+
+   case nir_op_feq:
+   case nir_op_ieq:
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4:
+      return BRW_CONDITIONAL_Z;
+
+   case nir_op_fne:
+   case nir_op_ine:
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4:
+      return BRW_CONDITIONAL_NZ;
+
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+}
+
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {

From 84d4a9dc2ca3d98f19cc9125a5ff1ac1225f360d Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:49:42 +0200
Subject: [PATCH 0921/1208] i965/nir/vec4: Implement non-vector comparison ops

Adds NIR ALU operations:
   * nir_op_flt
   * nir_op_ilt
   * nir_op_ult
   * nir_op_fge
   * nir_op_ige
   * nir_op_uge
   * nir_op_feq
   * nir_op_ieq
   * nir_op_fne
   * nir_op_ine

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 77df2842e6e..34a9f4fc46d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -908,6 +908,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_fddy_fine:
       unreachable("derivatives are not valid in vertex shaders");
 
+   case nir_op_flt:
+   case nir_op_ilt:
+   case nir_op_ult:
+   case nir_op_fge:
+   case nir_op_ige:
+   case nir_op_uge:
+   case nir_op_feq:
+   case nir_op_ieq:
+   case nir_op_fne:
+   case nir_op_ine:
+      emit(CMP(dst, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 8be4b876c90192c3a5e6fcc9b526f43a3f7bfc11 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 00:55:24 +0200
Subject: [PATCH 0922/1208] i965/nir/vec4: Implement equality ops on vectors

Adds NIR ALU operations:
   * nir_op_ball_fequal2
   * nir_op_ball_iequal2
   * nir_op_ball_fequal3
   * nir_op_ball_iequal3
   * nir_op_ball_fequal4
   * nir_op_ball_iequal4

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 34a9f4fc46d..4e85b94723c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -922,6 +922,39 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
                brw_conditional_for_nir_comparison(instr->op)));
       break;
 
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+
+      switch (instr->op) {
+      case nir_op_ball_fequal2:
+      case nir_op_ball_iequal2:
+         tmp.writemask = WRITEMASK_XY;
+         break;
+      case nir_op_ball_fequal3:
+      case nir_op_ball_iequal3:
+         tmp.writemask = WRITEMASK_XYZ;
+         break;
+      case nir_op_ball_fequal4:
+      case nir_op_ball_iequal4:
+         tmp.writemask = WRITEMASK_XYZW;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      emit(CMP(tmp, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   }
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 51aeafaf96b3b349e007ad05738bc1e05663fedf Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:01:28 +0200
Subject: [PATCH 0923/1208] i965/nir/vec4: Implement non-equality ops on
 vectors

Adds NIR ALU operations:
   * nir_op_bany_fnequal2
   * nir_op_bany_inequal2
   * nir_op_bany_fnequal3
   * nir_op_bany_inequal3
   * nir_op_bany_fnequal4
   * nir_op_bany_inequal4

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 4e85b94723c..3ee7ac29333 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -955,6 +955,40 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+
+      switch (instr->op) {
+      case nir_op_bany_fnequal2:
+      case nir_op_bany_inequal2:
+         tmp.writemask = WRITEMASK_XY;
+         break;
+      case nir_op_bany_fnequal3:
+      case nir_op_bany_inequal3:
+         tmp.writemask = WRITEMASK_XYZ;
+         break;
+      case nir_op_bany_fnequal4:
+      case nir_op_bany_inequal4:
+         tmp.writemask = WRITEMASK_XYZW;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      emit(CMP(tmp, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   }
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From f14199a8fb802f6672d559fa958a5ee84e3e13f1 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:07:20 +0200
Subject: [PATCH 0924/1208] i965/nir/vec4: Implement logical operators

Adds NIR ALU operations:
   * nir_op_inot
   * nir_op_ixor
   * nir_op_ior
   * nir_op_iand

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 3ee7ac29333..549ed1ba5dc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -989,6 +989,22 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_inot:
+      emit(NOT(dst, op[0]));
+      break;
+
+   case nir_op_ixor:
+      emit(XOR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ior:
+      emit(OR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_iand:
+      emit(AND(dst, op[0], op[1]));
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From fa4731f4a53aa21e53a62f42f3afdc19b0ce4c8e Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:21:30 +0200
Subject: [PATCH 0925/1208] i965/nir/vec4: Implement "bool<->int,float" format
 conversion

Used the same implementation than the vec4_visitor NIR.

Adds NIR ALU operations:
   * nir_op_b2i
   * nir_op_b2f
   * nir_op_f2b
   * nir_op_i2b

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 549ed1ba5dc..039309c5c0a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1005,6 +1005,25 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       emit(AND(dst, op[0], op[1]));
       break;
 
+   case nir_op_b2i:
+      emit(AND(dst, op[0], src_reg(1)));
+      break;
+
+   case nir_op_b2f:
+      op[0].type = BRW_REGISTER_TYPE_D;
+      dst.type = BRW_REGISTER_TYPE_D;
+      emit(AND(dst, op[0], src_reg(0x3f800000u)));
+      dst.type = BRW_REGISTER_TYPE_F;
+      break;
+
+   case nir_op_f2b:
+      emit(CMP(dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+      break;
+
+   case nir_op_i2b:
+      emit(CMP(dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 3f10c2f3d73ae41ff83afcdbe225121b8336f499 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:23:10 +0200
Subject: [PATCH 0926/1208] i965/nir/vec4: "noise" ops should already be
 lowered

Marked them as unreachable.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 039309c5c0a..f83653c96e5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1024,6 +1024,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       emit(CMP(dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
       break;
 
+   case nir_op_fnoise1_1:
+   case nir_op_fnoise1_2:
+   case nir_op_fnoise1_3:
+   case nir_op_fnoise1_4:
+   case nir_op_fnoise2_1:
+   case nir_op_fnoise2_2:
+   case nir_op_fnoise2_3:
+   case nir_op_fnoise2_4:
+   case nir_op_fnoise3_1:
+   case nir_op_fnoise3_2:
+   case nir_op_fnoise3_3:
+   case nir_op_fnoise3_4:
+   case nir_op_fnoise4_1:
+   case nir_op_fnoise4_2:
+   case nir_op_fnoise4_3:
+   case nir_op_fnoise4_4:
+      unreachable("not reached: should be handled by lower_noise");
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 0e874985ce50d902535e1eb766bd252c921b5d8f Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:29:04 +0200
Subject: [PATCH 0927/1208] i965/nir/vec4: Implement pack/unpack operations

* Lowered floating-point pack and unpack operations are not valid in VS.

* Pack and unpack 2x16 operations should be handled by lower_packing_builtins.

* Adds NIR ALU operations:
   * nir_op_pack_half_2x16
   * nir_op_unpack_half_2x16
   * nir_op_unpack_unorm_4x8
   * nir_op_unpack_snorm_4x8
   * nir_op_pack_unorm_4x8
   * nir_op_pack_snorm_4x8

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index f83653c96e5..87784db853a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1042,6 +1042,50 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_fnoise4_4:
       unreachable("not reached: should be handled by lower_noise");
 
+   case nir_op_unpack_half_2x16_split_x:
+   case nir_op_unpack_half_2x16_split_y:
+   case nir_op_pack_half_2x16_split:
+      unreachable("not reached: should not occur in vertex shader");
+
+   case nir_op_unpack_snorm_2x16:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_pack_snorm_2x16:
+   case nir_op_pack_unorm_2x16:
+      unreachable("not reached: should be handled by lower_packing_builtins");
+
+   case nir_op_unpack_half_2x16:
+      /* As NIR does not guarantee that we have a correct swizzle outside the
+       * boundaries of a vector, and the implementation of emit_unpack_half_2x16
+       * uses the source operand in an operation with WRITEMASK_Y while our
+       * source operand has only size 1, it accessed incorrect data producing
+       * regressions in Piglit. We repeat the swizzle of the first component on the
+       * rest of components to avoid regressions. In the vec4_visitor IR code path
+       * this is not needed because the operand has already the correct swizzle.
+       */
+      op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
+      emit_unpack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_pack_half_2x16:
+      emit_pack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_unpack_unorm_4x8:
+      emit_unpack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_unorm_4x8:
+      emit_pack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_unpack_snorm_4x8:
+      emit_unpack_snorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_snorm_4x8:
+      emit_pack_snorm_4x8(dst, op[0]);
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 8e1e6facbf828258a9a8ca09da846d1baa21d984 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:44:25 +0200
Subject: [PATCH 0928/1208] i965/nir/vec4: Implement bit operations

Same implementation than the IR case.

Adds NIR ALU operations:
   * nir_op_bitfield_reverse
   * nir_op_bit_count
   * nir_op_ufind_msb
   * nir_op_ifind_msb
   * nir_op_find_lsb
   * nir_op_ubitfield_extract
   * nir_op_ibitfield_extract
   * nir_op_bfm
   * nir_op_bfi
   * nir_op_bitfield_insert

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 62 ++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 87784db853a..e2357307ed3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1086,6 +1086,68 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       emit_pack_snorm_4x8(dst, op[0]);
       break;
 
+   case nir_op_bitfield_reverse:
+      emit(BFREV(dst, op[0]));
+      break;
+
+   case nir_op_bit_count:
+      emit(CBIT(dst, op[0]));
+      break;
+
+   case nir_op_ufind_msb:
+   case nir_op_ifind_msb: {
+      src_reg temp = src_reg(this, glsl_type::uint_type);
+
+      inst = emit(FBH(dst_reg(temp), op[0]));
+      inst->dst.writemask = WRITEMASK_XYZW;
+
+      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
+       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
+       * subtract the result from 31 to convert the MSB count into an LSB count.
+       */
+
+      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
+      temp.swizzle = BRW_SWIZZLE_NOOP;
+      emit(MOV(dst, temp));
+
+      src_reg src_tmp = src_reg(dst);
+      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
+
+      src_tmp.negate = true;
+      inst = emit(ADD(dst, src_tmp, src_reg(31)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
+   case nir_op_find_lsb:
+      emit(FBL(dst, op[0]));
+      break;
+
+   case nir_op_ubitfield_extract:
+   case nir_op_ibitfield_extract:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFE(dst, op[2], op[1], op[0]));
+      break;
+
+   case nir_op_bfm:
+      emit(BFI1(dst, op[0], op[1]));
+      break;
+
+   case nir_op_bfi:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFI2(dst, op[0], op[1], op[2]));
+      break;
+
+   case nir_op_bitfield_insert:
+      unreachable("not reached: should be handled by "
+                  "lower_instructions::bitfield_insert_to_bfm_bfi");
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 798cb33a256f703ecaf56d4443e12055484d4bcc Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:47:41 +0200
Subject: [PATCH 0929/1208] i965/nir/vec4: Implement the "sign" operation

Follows the vec4_visitor IR implementation but
sets the saturate value in addition.

Adds NIR ALU operations:
   * nir_op_fsign
   * nir_op_isign

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index e2357307ed3..560797068fb 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1148,6 +1148,39 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should be handled by "
                   "lower_instructions::bitfield_insert_to_bfm_bfi");
 
+   case nir_op_fsign:
+      /* AND(val, 0x80000000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+       * zero.
+       */
+      emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+
+      op[0].type = BRW_REGISTER_TYPE_UD;
+      dst.type = BRW_REGISTER_TYPE_UD;
+      emit(AND(dst, op[0], src_reg(0x80000000u)));
+
+      inst = emit(OR(dst, src_reg(dst), src_reg(0x3f800000u)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      dst.type = BRW_REGISTER_TYPE_F;
+
+      if (instr->dest.saturate) {
+         inst = emit(MOV(dst, src_reg(dst)));
+         inst->saturate = true;
+      }
+      break;
+
+   case nir_op_isign:
+      /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+       *               -> non-negative val generates 0x00000000.
+       *  Predicated OR sets 1 if val is positive.
+       */
+      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
+      emit(ASR(dst, op[0], src_reg(31)));
+      inst = emit(OR(dst, src_reg(dst), src_reg(1)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From d12e165dbb403c3cf86ab7f1b8f28ab6188b479f Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:49:31 +0200
Subject: [PATCH 0930/1208] i965/nir/vec4: Implement "shift" operations

Adds NIR ALU operations:
   * nir_op_ishl
   * nir_op_ishr
   * nir_op_ushr

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 560797068fb..b332fd315fb 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1181,6 +1181,18 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
+   case nir_op_ishl:
+      emit(SHL(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ishr:
+      emit(ASR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ushr:
+      emit(SHR(dst, op[0], op[1]));
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From b64bd1fdc37eed1bb62d2b32ad22f0f77501f7f2 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:51:10 +0200
Subject: [PATCH 0931/1208] i965/nir/vec4: Implement floating-point fused
 multiply-add

Adds NIR ALU operation:
   * nir_op_ffma

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index b332fd315fb..2cf157678c9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1193,6 +1193,15 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       emit(SHR(dst, op[0], op[1]));
       break;
 
+   case nir_op_ffma:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      inst = emit(MAD(dst, op[2], op[1], op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 314474872b77f291132a01f7c1df2788586fc943 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 10:01:07 +0200
Subject: [PATCH 0932/1208] i965/vec4: Return the emitted instruction in
 emit_lrp()

Needed in the NIR backend to set the "saturate" value of the
instruction.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 4 ++--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 90e5753e34c..54fe84ae6c6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -294,8 +294,8 @@ public:
    vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                                  src_reg src0, src_reg src1);
 
-   void emit_lrp(const dst_reg &dst,
-                 const src_reg &x, const src_reg &y, const src_reg &a);
+   vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
+                              const src_reg &y, const src_reg &a);
 
    /**
     * Copy any live channel from \p src to the first channel of the
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 6865df7e0df..5346fde950a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1285,7 +1285,7 @@ vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
    return inst;
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_lrp(const dst_reg &dst,
                        const src_reg &x, const src_reg &y, const src_reg &a)
 {
@@ -1293,8 +1293,8 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
       /* Note that the instruction's argument order is reversed from GLSL
        * and the IR.
        */
-      emit(LRP(dst,
-               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
+     return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
+                     fix_3src_operand(x)));
    } else {
       /* Earlier generations don't support three source operations, so we
        * need to emit x*(1-a) + y*a.
@@ -1309,7 +1309,7 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
       emit(MUL(y_times_a, y, a));
       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
-      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
+      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
    }
 }
 

From b38fcd0aea8d17919ecd9cc7afc518cfb2c01c27 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 09:52:43 +0200
Subject: [PATCH 0933/1208] i965/nir/vec4: Implement linear interpolation

Adds NIR ALU operation:
   * nir_op_flrp

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 2cf157678c9..1ee85976793 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1202,6 +1202,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_flrp:
+      inst = emit_lrp(dst, op[0], op[1], op[2]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 96106e2a9f214d98fc2e99c65398f95d41a3b879 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 10:05:29 +0200
Subject: [PATCH 0934/1208] i965/nir/vec4: Implement conditional select

Adds NIR ALU operations:
   * nir_op_bcsel

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 1ee85976793..529f7f73eae 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1207,6 +1207,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_bcsel:
+      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From fa4e3c3c9f6f3a72a032499fccaa6e222d6a7fa4 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 10:06:44 +0200
Subject: [PATCH 0935/1208] i965/nir/vec4: Implement the dot product operation

Adds NIR ALU operations:
   * nir_op_fdot2
   * nir_op_fdot3
   * nir_op_fdot4

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 529f7f73eae..8a87759fcb9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1213,6 +1213,21 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
+   case nir_op_fdot2:
+      inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot3:
+      inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot4:
+      inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 16072834babc487f78472f7e7b59d35249a3aac8 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 10:08:27 +0200
Subject: [PATCH 0936/1208] i965/nir/vec4: Implement vector "any" operation

Adds NIR ALU operations:
   * nir_op_bany2
   * nir_op_bany3
   * nir_op_bany4

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 8a87759fcb9..a586f902f67 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1228,6 +1228,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_bany2:
+   case nir_op_bany3:
+   case nir_op_bany4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+      tmp.writemask = brw_writemask_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+      emit(CMP(tmp, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   }
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 9b4a6fa4c09d36e0e5c00309e6ea37300ea38f78 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 17 Jun 2015 10:10:44 +0200
Subject: [PATCH 0937/1208] i965/nir/vec4: Mark as unreachable ops that should
 be already lowered

NIR ALU operations:
   * nir_op_fabs
   * nir_op_iabs
   * nir_op_fneg
   * nir_op_ineg
   * nir_op_fsat
        should be lowered by lower_source mods

   * nir_op_fdiv
        should be lowered in the compiler by DIV_TO_MUL_RCP.

   * nir_op_fmod
        should be lowered in the compiler by MOD_TO_FLOOR.

   * nir_op_fsub
   * nir_op_isub
        should be handled by ir_sub_to_add_neg.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index a586f902f67..049edbdf099 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1242,6 +1242,23 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_fabs:
+   case nir_op_iabs:
+   case nir_op_fneg:
+   case nir_op_ineg:
+   case nir_op_fsat:
+      unreachable("not reached: should be lowered by lower_source mods");
+
+   case nir_op_fdiv:
+      unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler");
+
+   case nir_op_fmod:
+      unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler");
+
+   case nir_op_fsub:
+   case nir_op_isub:
+      unreachable("not reached: should be handled by ir_sub_to_add_neg");
+
    default:
       unreachable("Unimplemented ALU operation");
    }

From 583c1c61703826002ba0f202e8ef7bc2c822ef1d Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 17 Jun 2015 10:20:19 +0200
Subject: [PATCH 0938/1208] i965/nir/vec4: Implement nir_emit_jump

This implementation is taken as-is from fs_nir.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 049edbdf099..544f4263f2a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1267,7 +1267,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 void
 vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
 {
-   /* @TODO: Not yet implemented */
+   switch (instr->type) {
+   case nir_jump_break:
+      emit(BRW_OPCODE_BREAK);
+      break;
+
+   case nir_jump_continue:
+      emit(BRW_OPCODE_CONTINUE);
+      break;
+
+   case nir_jump_return:
+      /* fall through */
+   default:
+      unreachable("unknown jump");
+   }
 }
 
 void

From db8a6de571bb72ef43209a415e5492001a87b1d8 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 17 Jun 2015 10:59:10 +0200
Subject: [PATCH 0939/1208] i965/nir: Add new utility method
 brw_glsl_base_type_for_nir_type()

This method returns the glsl_base_type corresponding to a nir_alu_type.
It will factorize code currently present in fs_nir, that can be reused
in vec4_nir on its upcoming emit_texture support.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 16 ++--------------
 src/mesa/drivers/dri/i965/brw_nir.c      | 21 +++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_nir.h      |  2 ++
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index b51fe0e5eba..e922a85573c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1699,20 +1699,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       }
    }
 
-   enum glsl_base_type dest_base_type;
-   switch (instr->dest_type) {
-   case nir_type_float:
-      dest_base_type = GLSL_TYPE_FLOAT;
-      break;
-   case nir_type_int:
-      dest_base_type = GLSL_TYPE_INT;
-      break;
-   case nir_type_unsigned:
-      dest_base_type = GLSL_TYPE_UINT;
-      break;
-   default:
-      unreachable("bad type");
-   }
+   enum glsl_base_type dest_base_type =
+     brw_glsl_base_type_for_nir_type (instr->dest_type);
 
    const glsl_type *dest_type =
       glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index beb7d6912c8..49d17422546 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -216,3 +216,24 @@ brw_type_for_nir_type(nir_alu_type type)
 
    return BRW_REGISTER_TYPE_F;
 }
+
+/* Returns the glsl_base_type corresponding to a nir_alu_type.
+ * This is used by both brw_vec4_nir and brw_fs_nir.
+ */
+enum glsl_base_type
+brw_glsl_base_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_float:
+      return GLSL_TYPE_FLOAT;
+
+   case nir_type_int:
+      return GLSL_TYPE_INT;
+
+   case nir_type_unsigned:
+      return GLSL_TYPE_UINT;
+
+   default:
+      unreachable("bad type");
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 6152321521b..ad712930536 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -83,6 +83,8 @@ nir_shader *brw_create_nir(struct brw_context *brw,
 
 enum brw_reg_type brw_type_for_nir_type(nir_alu_type type);
 
+enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
+
 #ifdef __cplusplus
 }
 #endif

From 434481f3155040217c3e5a8da98dab4248435f0e Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Thu, 18 Jun 2015 09:37:33 +0200
Subject: [PATCH 0940/1208] i965/vec4: Move is_high_sample() method to
 vec4_visitor class

The is_high_sample() method is currently accessible only in the implementation of
vec4_visitor. Since we need to reuse it in the upcoming NIR->vec4 pass, lets make
it a method of the class instead.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 1 +
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 54fe84ae6c6..aea21c5897d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -392,6 +392,7 @@ public:
    void visit_atomic_counter_intrinsic(ir_call *ir);
 
    int type_size(const struct glsl_type *type);
+   bool is_high_sampler(src_reg sampler);
 
    virtual void emit_nir_code();
    virtual void nir_setup_inputs(nir_shader *shader);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 5346fde950a..737c9fa6d27 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2509,8 +2509,8 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler
    return src_reg(inst->dst);
 }
 
-static bool
-is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
+bool
+vec4_visitor::is_high_sampler(src_reg sampler)
 {
    if (devinfo->gen < 8 && !devinfo->is_haswell)
       return false;
@@ -2686,7 +2686,7 @@ vec4_visitor::visit(ir_texture *ir)
    inst->header_size =
       (devinfo->gen < 5 || devinfo->gen >= 9 ||
        inst->offset != 0 || ir->op == ir_tg4 ||
-       is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
+       is_high_sampler(sampler_reg)) ? 1 : 0;
    inst->base_mrf = 2;
    inst->mlen = inst->header_size + 1; /* always at least one */
    inst->dst.writemask = WRITEMASK_XYZW;

From 72c8d7721feb966cf8530a3ee2642f0b842dc0f8 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Thu, 18 Jun 2015 11:31:54 +0200
Subject: [PATCH 0941/1208] i965/vec4: Change vec4_visitor::emit_mcs_fetch()
 method to allow reuse

This patch changes the signature of emit_mcs_fetch() to accept lower level
arguments. The purpose is to reuse it in the upcoming NIR->vec4 pass.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           |  3 ++-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 11 ++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index aea21c5897d..ace0bf4437b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -334,7 +334,8 @@ public:
    void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
 
    uint32_t gather_channel(ir_texture *ir, uint32_t sampler);
-   src_reg emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler);
+   src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
+                          src_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
    void swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 737c9fa6d27..3f157beff09 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2469,7 +2469,8 @@ vec4_visitor::visit(ir_call *ir)
 }
 
 src_reg
-vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
+vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
+                             src_reg coordinate, src_reg sampler)
 {
    vec4_instruction *inst =
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
@@ -2496,13 +2497,13 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler
    }
 
    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
-   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
+   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
    int zero_mask = 0xf & ~coord_mask;
 
-   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
             coordinate));
 
-   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
             src_reg(0)));
 
    emit(inst);
@@ -2625,7 +2626,7 @@ vec4_visitor::visit(ir_texture *ir)
       sample_index_type = ir->lod_info.sample_index->type;
 
       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
-         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
+         mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
       else
          mcs = src_reg(0u);
       break;

From 57182332b84b58fed6641314def67450893b7419 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Thu, 18 Jun 2015 12:12:21 +0200
Subject: [PATCH 0942/1208] i965/vec4: Change vec4_visitor::gather_channel()
 method to allow reuse

This patch changes the signature of gather_channel() to accept the gather
component directly instead of fetching it internally from ir_texture.
This will allow reuse in the upcoming NIR->vec4 pass.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index ace0bf4437b..170353a0c28 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -333,7 +333,7 @@ public:
    void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
    void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
 
-   uint32_t gather_channel(ir_texture *ir, uint32_t sampler);
+   uint32_t gather_channel(unsigned gather_component, uint32_t sampler);
    src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
                           src_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 3f157beff09..c72fae6ce0c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2675,7 +2675,9 @@ vec4_visitor::visit(ir_texture *ir)
 
    /* Stuff the channel select bits in the top of the texture offset */
    if (ir->op == ir_tg4)
-      inst->offset |= gather_channel(ir, sampler) << 16;
+      inst->offset |=
+        gather_channel( ir->lod_info.component->as_constant()->value.i[0],
+                        sampler) << 16;
 
    /* The message header is necessary for:
     * - Gen4 (always)
@@ -2847,10 +2849,9 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
  * Set up the gather channel based on the swizzle, for gather4.
  */
 uint32_t
-vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
+vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
 {
-   ir_constant *chan = ir->lod_info.component->as_constant();
-   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
+   int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
    switch (swiz) {
       case SWIZZLE_X: return 0;
       case SWIZZLE_Y:

From c15eea2afa7a295992cde949b8e2a5d4552f6290 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 6 Jul 2015 13:31:05 +0200
Subject: [PATCH 0943/1208] i965/vec4: Change vec4_visitor::swizzle_result()
 method to allow reuse

This patch changes the signature of swizzle_result() to accept lower
level arguments. The purpose is to reuse it in the upcoming NIR->vec4
pass.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h          |  4 +++-
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 170353a0c28..f1482bb60b1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -337,7 +337,9 @@ public:
    src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
                           src_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
-   void swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler);
+   void swizzle_result(ir_texture_opcode op, dst_reg dest,
+                       src_reg orig_val, uint32_t sampler,
+                       const glsl_type *dest_type);
 
    void emit_ndc_computation();
    void emit_psiz_and_flags(dst_reg reg);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index c72fae6ce0c..81a5cd12f47 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2815,7 +2815,11 @@ vec4_visitor::visit(ir_texture *ir)
       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
    }
 
-   swizzle_result(ir, src_reg(inst->dst), sampler);
+   this->result = src_reg(this, ir->type);
+   dst_reg dest = dst_reg(this->result);
+
+   swizzle_result(ir->op, dest, src_reg(inst->dst),
+                  sampler, ir->type);
 }
 
 /**
@@ -2869,22 +2873,23 @@ vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
 }
 
 void
-vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
+vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
+                             src_reg orig_val, uint32_t sampler,
+                             const glsl_type *dest_type)
 {
    int s = key->tex.swizzles[sampler];
 
-   this->result = src_reg(this, ir->type);
-   dst_reg swizzled_result(this->result);
+   dst_reg swizzled_result = dest;
 
-   if (ir->op == ir_query_levels) {
+   if (op == ir_query_levels) {
       /* # levels is in .w */
       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
       emit(MOV(swizzled_result, orig_val));
       return;
    }
 
-   if (ir->op == ir_txs || ir->type == glsl_type::float_type
-			|| s == SWIZZLE_NOOP || ir->op == ir_tg4) {
+   if (op == ir_txs || dest_type == glsl_type::float_type
+			|| s == SWIZZLE_NOOP || op == ir_tg4) {
       emit(MOV(swizzled_result, orig_val));
       return;
    }

From 0d43d27df742ad95a086580bae2ee08a0bc00e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Sat, 23 May 2015 23:42:58 +0200
Subject: [PATCH 0944/1208] i965/vec4: Add a new dst_reg constructor accepting
 a brw_reg_type

This is useful for the upcoming texture support in NIR->vec4 pass,
as we found several cases where the brw_type is available, but not
the glsl_type.

Without this new constructor, the alternative would be:
dst_reg reg(MRF, <reg>)
reg.type = <brw_type>
reg.writemask = <mask>

Adding a new constructor makes code easier to read.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_ir_vec4.h |  2 ++
 src/mesa/drivers/dri/i965/brw_vec4.cpp  | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index fceacae0e51..966a410a15d 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -113,6 +113,8 @@ public:
    dst_reg(register_file file, int reg);
    dst_reg(register_file file, int reg, const glsl_type *type,
            unsigned writemask);
+   dst_reg(register_file file, int reg, brw_reg_type type,
+           unsigned writemask);
    dst_reg(struct brw_reg reg);
    dst_reg(class vec4_visitor *v, const struct glsl_type *type);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 4e5518588c6..6f6e6271a91 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -171,6 +171,17 @@ dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
    this->writemask = writemask;
 }
 
+dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
+                 unsigned writemask)
+{
+   init();
+
+   this->file = file;
+   this->reg = reg;
+   this->type = type;
+   this->writemask = writemask;
+}
+
 dst_reg::dst_reg(struct brw_reg reg)
 {
    init();

From 1343f403b2d08a0877f17133abb6dccf0f51127b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 6 Jul 2015 14:33:21 +0200
Subject: [PATCH 0945/1208] i965/ir/vec4: Refactor visit(ir_texture *ir)

Splitted in two. The emission is moved to a new vec4_visitor
method, vec4_visitor::emit_texture, ir order to be reused
on the nir path.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h          |  14 +
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 368 ++++++++++--------
 2 files changed, 209 insertions(+), 173 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index f1482bb60b1..3ddfe88ad38 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -333,6 +333,20 @@ public:
    void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
    void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
 
+   void emit_texture(ir_texture_opcode op,
+                     dst_reg dest,
+                     const glsl_type *dest_type,
+                     src_reg coordinate,
+                     int coord_components,
+                     src_reg shadow_comparitor,
+                     src_reg lod, src_reg lod2,
+                     src_reg sample_index,
+                     uint32_t constant_offset,
+                     src_reg offset_value,
+                     src_reg mcs,
+                     bool is_cube_array,
+                     uint32_t sampler, src_reg sampler_reg);
+
    uint32_t gather_channel(unsigned gather_component, uint32_t sampler);
    src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
                           src_reg sampler);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 81a5cd12f47..f6155a3a3b5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2519,6 +2519,183 @@ vec4_visitor::is_high_sampler(src_reg sampler)
    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 }
 
+void
+vec4_visitor::emit_texture(ir_texture_opcode op,
+                           dst_reg dest,
+                           const glsl_type *dest_type,
+                           src_reg coordinate,
+                           int coord_components,
+                           src_reg shadow_comparitor,
+                           src_reg lod, src_reg lod2,
+                           src_reg sample_index,
+                           uint32_t constant_offset,
+                           src_reg offset_value,
+                           src_reg mcs,
+                           bool is_cube_array,
+                           uint32_t sampler,
+                           src_reg sampler_reg)
+{
+   enum opcode opcode;
+   switch (op) {
+   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
+   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
+   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
+   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
+   case ir_tg4: opcode = offset_value.file != BAD_FILE
+                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
+   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
+   case ir_txb:
+      unreachable("TXB is not valid for vertex shaders.");
+   case ir_lod:
+      unreachable("LOD is not valid for vertex shaders.");
+   default:
+      unreachable("Unrecognized tex op");
+   }
+
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
+      opcode, dst_reg(this, dest_type));
+
+   inst->offset = constant_offset;
+
+   /* The message header is necessary for:
+    * - Gen4 (always)
+    * - Gen9+ for selecting SIMD4x2
+    * - Texel offsets
+    * - Gather channel selection
+    * - Sampler indices too large to fit in a 4-bit value.
+    */
+   inst->header_size =
+      (devinfo->gen < 5 || devinfo->gen >= 9 ||
+       inst->offset != 0 || op == ir_tg4 ||
+       is_high_sampler(sampler_reg)) ? 1 : 0;
+   inst->base_mrf = 2;
+   inst->mlen = inst->header_size + 1; /* always at least one */
+   inst->dst.writemask = WRITEMASK_XYZW;
+   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
+
+   inst->src[1] = sampler_reg;
+
+   /* MRF for the first parameter */
+   int param_base = inst->base_mrf + inst->header_size;
+
+   if (op == ir_txs || op == ir_query_levels) {
+      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
+      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
+   } else {
+      /* Load the coordinate */
+      /* FINISHME: gl_clamp_mask and saturate */
+      int coord_mask = (1 << coord_components) - 1;
+      int zero_mask = 0xf & ~coord_mask;
+
+      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
+               coordinate));
+
+      if (zero_mask != 0) {
+         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
+                  src_reg(0)));
+      }
+      /* Load the shadow comparitor */
+      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
+	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
+			  WRITEMASK_X),
+		  shadow_comparitor));
+	 inst->mlen++;
+      }
+
+      /* Load the LOD info */
+      if (op == ir_tex || op == ir_txl) {
+	 int mrf, writemask;
+	 if (devinfo->gen >= 5) {
+	    mrf = param_base + 1;
+	    if (shadow_comparitor.file != BAD_FILE) {
+	       writemask = WRITEMASK_Y;
+	       /* mlen already incremented */
+	    } else {
+	       writemask = WRITEMASK_X;
+	       inst->mlen++;
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    mrf = param_base;
+	    writemask = WRITEMASK_W;
+	 }
+         lod.swizzle = BRW_SWIZZLE_XXXX;
+	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
+      } else if (op == ir_txf) {
+         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
+      } else if (op == ir_txf_ms) {
+         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
+                  sample_index));
+         if (devinfo->gen >= 7) {
+            /* MCS data is in the first channel of `mcs`, but we need to get it into
+             * the .y channel of the second vec4 of params, so replicate .x across
+             * the whole vec4 and then mask off everything except .y
+             */
+            mcs.swizzle = BRW_SWIZZLE_XXXX;
+            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
+                     mcs));
+         }
+         inst->mlen++;
+      } else if (op == ir_txd) {
+         const brw_reg_type type = lod.type;
+
+	 if (devinfo->gen >= 5) {
+	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
+	    inst->mlen++;
+
+	    if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
+	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
+	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
+	       inst->mlen++;
+
+               if (shadow_comparitor.file != BAD_FILE) {
+                  emit(MOV(dst_reg(MRF, param_base + 2,
+                                   shadow_comparitor.type, WRITEMASK_Z),
+                           shadow_comparitor));
+               }
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
+	    inst->mlen += 2;
+	 }
+      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
+         if (shadow_comparitor.file != BAD_FILE) {
+            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
+                     shadow_comparitor));
+         }
+
+         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
+                  offset_value));
+         inst->mlen++;
+      }
+   }
+
+   emit(inst);
+
+   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
+    * spec requires layers.
+    */
+   if (op == ir_txs && is_cube_array) {
+      emit_math(SHADER_OPCODE_INT_QUOTIENT,
+                writemask(inst->dst, WRITEMASK_Z),
+                src_reg(inst->dst), src_reg(6));
+   }
+
+   if (devinfo->gen == 6 && op == ir_tg4) {
+      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
+   }
+
+   swizzle_result(op, dest,
+                  src_reg(inst->dst), sampler, dest_type);
+}
+
 void
 vec4_visitor::visit(ir_texture *ir)
 {
@@ -2584,7 +2761,9 @@ vec4_visitor::visit(ir_texture *ir)
     * generating these values may involve SEND messages that need the MRFs.
     */
    src_reg coordinate;
+   int coord_components = 0;
    if (ir->coordinate) {
+      coord_components = ir->coordinate->type->vector_elements;
       ir->coordinate->accept(this);
       coordinate = this->result;
    }
@@ -2602,28 +2781,23 @@ vec4_visitor::visit(ir_texture *ir)
       offset_value = src_reg(this->result);
    }
 
-   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
-   src_reg lod, dPdx, dPdy, sample_index, mcs;
+   src_reg lod, lod2, sample_index, mcs;
    switch (ir->op) {
    case ir_tex:
       lod = src_reg(0.0f);
-      lod_type = glsl_type::float_type;
       break;
    case ir_txf:
    case ir_txl:
    case ir_txs:
       ir->lod_info.lod->accept(this);
       lod = this->result;
-      lod_type = ir->lod_info.lod->type;
       break;
    case ir_query_levels:
       lod = src_reg(0);
-      lod_type = glsl_type::int_type;
       break;
    case ir_txf_ms:
       ir->lod_info.sample_index->accept(this);
       sample_index = this->result;
-      sample_index_type = ir->lod_info.sample_index->type;
 
       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
          mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
@@ -2632,12 +2806,10 @@ vec4_visitor::visit(ir_texture *ir)
       break;
    case ir_txd:
       ir->lod_info.grad.dPdx->accept(this);
-      dPdx = this->result;
+      lod = this->result;
 
       ir->lod_info.grad.dPdy->accept(this);
-      dPdy = this->result;
-
-      lod_type = ir->lod_info.grad.dPdx->type;
+      lod2 = this->result;
       break;
    case ir_txb:
    case ir_lod:
@@ -2645,181 +2817,31 @@ vec4_visitor::visit(ir_texture *ir)
       break;
    }
 
-   enum opcode opcode;
-   switch (ir->op) {
-   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_tg4: opcode = has_nonconstant_offset
-                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
-   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txb:
-      unreachable("TXB is not valid for vertex shaders.");
-   case ir_lod:
-      unreachable("LOD is not valid for vertex shaders.");
-   default:
-      unreachable("Unrecognized tex op");
-   }
-
-   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
-      opcode, dst_reg(this, ir->type));
-
+   uint32_t constant_offset = 0;
    if (ir->offset != NULL && !has_nonconstant_offset) {
-      inst->offset =
+      constant_offset  =
          brw_texture_offset(ir->offset->as_constant()->value.i,
                             ir->offset->type->vector_elements);
    }
 
    /* Stuff the channel select bits in the top of the texture offset */
    if (ir->op == ir_tg4)
-      inst->offset |=
-        gather_channel( ir->lod_info.component->as_constant()->value.i[0],
-                        sampler) << 16;
+      constant_offset |=
+         gather_channel( ir->lod_info.component->as_constant()->value.i[0],
+                         sampler) << 16;
 
-   /* The message header is necessary for:
-    * - Gen4 (always)
-    * - Gen9+ for selecting SIMD4x2
-    * - Texel offsets
-    * - Gather channel selection
-    * - Sampler indices too large to fit in a 4-bit value.
-    */
-   inst->header_size =
-      (devinfo->gen < 5 || devinfo->gen >= 9 ||
-       inst->offset != 0 || ir->op == ir_tg4 ||
-       is_high_sampler(sampler_reg)) ? 1 : 0;
-   inst->base_mrf = 2;
-   inst->mlen = inst->header_size + 1; /* always at least one */
-   inst->dst.writemask = WRITEMASK_XYZW;
-   inst->shadow_compare = ir->shadow_comparitor != NULL;
-
-   inst->src[1] = sampler_reg;
-
-   /* MRF for the first parameter */
-   int param_base = inst->base_mrf + inst->header_size;
-
-   if (ir->op == ir_txs || ir->op == ir_query_levels) {
-      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
-      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
-   } else {
-      /* Load the coordinate */
-      /* FINISHME: gl_clamp_mask and saturate */
-      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
-      int zero_mask = 0xf & ~coord_mask;
-
-      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
-               coordinate));
-
-      if (zero_mask != 0) {
-         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
-                  src_reg(0)));
-      }
-      /* Load the shadow comparitor */
-      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
-	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
-			  WRITEMASK_X),
-		  shadow_comparitor));
-	 inst->mlen++;
-      }
-
-      /* Load the LOD info */
-      if (ir->op == ir_tex || ir->op == ir_txl) {
-	 int mrf, writemask;
-	 if (devinfo->gen >= 5) {
-	    mrf = param_base + 1;
-	    if (ir->shadow_comparitor) {
-	       writemask = WRITEMASK_Y;
-	       /* mlen already incremented */
-	    } else {
-	       writemask = WRITEMASK_X;
-	       inst->mlen++;
-	    }
-	 } else /* devinfo->gen == 4 */ {
-	    mrf = param_base;
-	    writemask = WRITEMASK_W;
-	 }
-	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
-      } else if (ir->op == ir_txf) {
-         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
-      } else if (ir->op == ir_txf_ms) {
-         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
-                  sample_index));
-         if (devinfo->gen >= 7) {
-            /* MCS data is in the first channel of `mcs`, but we need to get it into
-             * the .y channel of the second vec4 of params, so replicate .x across
-             * the whole vec4 and then mask off everything except .y
-             */
-            mcs.swizzle = BRW_SWIZZLE_XXXX;
-            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
-                     mcs));
-         }
-         inst->mlen++;
-      } else if (ir->op == ir_txd) {
-	 const glsl_type *type = lod_type;
-
-	 if (devinfo->gen >= 5) {
-	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
-	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
-	    inst->mlen++;
-
-	    if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
-	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
-	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
-	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
-	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
-	       inst->mlen++;
-
-               if (ir->shadow_comparitor) {
-                  emit(MOV(dst_reg(MRF, param_base + 2,
-                                   ir->shadow_comparitor->type, WRITEMASK_Z),
-                           shadow_comparitor));
-               }
-	    }
-	 } else /* devinfo->gen == 4 */ {
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
-	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
-	    inst->mlen += 2;
-	 }
-      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
-         if (ir->shadow_comparitor) {
-            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
-                     shadow_comparitor));
-         }
-
-         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
-                  offset_value));
-         inst->mlen++;
-      }
-   }
-
-   emit(inst);
-
-   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
-    * spec requires layers.
-    */
-   if (ir->op == ir_txs) {
-      glsl_type const *type = ir->sampler->type;
-      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-          type->sampler_array) {
-         emit_math(SHADER_OPCODE_INT_QUOTIENT,
-                   writemask(inst->dst, WRITEMASK_Z),
-                   src_reg(inst->dst), src_reg(6));
-      }
-   }
-
-   if (devinfo->gen == 6 && ir->op == ir_tg4) {
-      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
-   }
+   glsl_type const *type = ir->sampler->type;
+   bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+      type->sampler_array;
 
    this->result = src_reg(this, ir->type);
    dst_reg dest = dst_reg(this->result);
 
-   swizzle_result(ir->op, dest, src_reg(inst->dst),
-                  sampler, ir->type);
+   emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
+                shadow_comparitor,
+                lod, lod2, sample_index,
+                constant_offset, offset_value,
+                mcs, is_cube_array, sampler, sampler_reg);
 }
 
 /**

From 19cf934f7f18237e1a212b0a019026d5d36c6fac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 6 Jul 2015 15:08:15 +0200
Subject: [PATCH 0946/1208] i965/nir/vec4: Add implementation of
 nir_emit_texture()

Uses the nir structure to get all the info needed (sources,
dest reg, etc), and then it uses the common
vec4_visitor::emit_texture to emit the final code.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 195 ++++++++++++++++++++-
 1 file changed, 194 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 544f4263f2a..a5af33e6e10 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1283,10 +1283,203 @@ vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
    }
 }
 
+enum ir_texture_opcode
+ir_texture_opcode_for_nir_texop(nir_texop texop)
+{
+   enum ir_texture_opcode op;
+
+   switch (texop) {
+   case nir_texop_lod: op = ir_lod; break;
+   case nir_texop_query_levels: op = ir_query_levels; break;
+   case nir_texop_tex: op = ir_tex; break;
+   case nir_texop_tg4: op = ir_tg4; break;
+   case nir_texop_txb: op = ir_txb; break;
+   case nir_texop_txd: op = ir_txd; break;
+   case nir_texop_txf: op = ir_txf; break;
+   case nir_texop_txf_ms: op = ir_txf_ms; break;
+   case nir_texop_txl: op = ir_txl; break;
+   case nir_texop_txs: op = ir_txs; break;
+   default:
+      unreachable("unknown texture opcode");
+   }
+
+   return op;
+}
+const glsl_type *
+glsl_type_for_nir_alu_type(nir_alu_type alu_type,
+                           unsigned components)
+{
+   switch (alu_type) {
+   case nir_type_float:
+      return glsl_type::vec(components);
+   case nir_type_int:
+      return glsl_type::ivec(components);
+   case nir_type_unsigned:
+      return glsl_type::uvec(components);
+   case nir_type_bool:
+      return glsl_type::bvec(components);
+   default:
+      return glsl_type::error_type;
+   }
+
+   return glsl_type::error_type;
+}
+
 void
 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
 {
-   /* @TODO: Not yet implemented */
+   unsigned sampler = instr->sampler_index;
+   src_reg sampler_reg = src_reg(sampler);
+   src_reg coordinate;
+   const glsl_type *coord_type = NULL;
+   src_reg shadow_comparitor;
+   src_reg offset_value;
+   src_reg lod, lod2;
+   src_reg sample_index;
+   src_reg mcs;
+
+   const glsl_type *dest_type =
+      glsl_type_for_nir_alu_type(instr->dest_type,
+                                 nir_tex_instr_dest_size(instr));
+   dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
+
+   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
+    * emitting anything other than setting up the constant result.
+    */
+   if (instr->op == nir_texop_tg4) {
+      int swiz = GET_SWZ(key->tex.swizzles[sampler], instr->component);
+      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
+         emit(MOV(dest, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
+         return;
+      }
+   }
+
+   /* Load the texture operation sources */
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_comparitor:
+         shadow_comparitor = get_nir_src(instr->src[i].src,
+                                         BRW_REGISTER_TYPE_F, 1);
+         break;
+
+      case nir_tex_src_coord: {
+         unsigned src_size = nir_tex_instr_src_size(instr, i);
+
+         switch (instr->op) {
+         case nir_texop_txf:
+         case nir_texop_txf_ms:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
+                                     src_size);
+            coord_type = glsl_type::ivec(src_size);
+            break;
+
+         default:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                                     src_size);
+            coord_type = glsl_type::vec(src_size);
+            break;
+         }
+         break;
+      }
+
+      case nir_tex_src_ddx:
+         lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                           nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_ddy:
+         lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                           nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_lod:
+         switch (instr->op) {
+         case nir_texop_txs:
+         case nir_texop_txf:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+            break;
+
+         default:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
+            break;
+         }
+         break;
+
+      case nir_tex_src_ms_index: {
+         sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+         assert(coord_type != NULL);
+         if (devinfo->gen >= 7 &&
+             key->tex.compressed_multisample_layout_mask & (1<<sampler)) {
+            mcs = emit_mcs_fetch(coord_type, coordinate, sampler_reg);
+         } else {
+            mcs = src_reg(0u);
+         }
+         mcs = retype(mcs, BRW_REGISTER_TYPE_UD);
+         break;
+      }
+
+      case nir_tex_src_offset:
+         offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
+         break;
+
+      case nir_tex_src_sampler_offset: {
+         /* The highest sampler which may be used by this operation is
+          * the last element of the array. Mark it here, because the generator
+          * doesn't have enough information to determine the bound.
+          */
+         uint32_t array_size = instr->sampler_array_size;
+         uint32_t max_used = sampler + array_size - 1;
+         if (instr->op == nir_texop_tg4) {
+            max_used += prog_data->base.binding_table.gather_texture_start;
+         } else {
+            max_used += prog_data->base.binding_table.texture_start;
+         }
+
+         brw_mark_surface_used(&prog_data->base, max_used);
+
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, src_reg(sampler)));
+         sampler_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_projector:
+         unreachable("Should be lowered by do_lower_texture_projection");
+
+      case nir_tex_src_bias:
+         unreachable("LOD bias is not valid for vertex shaders.\n");
+
+      default:
+         unreachable("unknown texture source");
+      }
+   }
+
+   uint32_t constant_offset = 0;
+   for (unsigned i = 0; i < 3; i++) {
+      if (instr->const_offset[i] != 0) {
+         constant_offset = brw_texture_offset(instr->const_offset, 3);
+         break;
+      }
+   }
+
+   /* Stuff the channel select bits in the top of the texture offset */
+   if (instr->op == nir_texop_tg4)
+      constant_offset |= gather_channel(instr->component, sampler) << 16;
+
+   ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
+
+   bool is_cube_array =
+      instr->op == nir_texop_txs &&
+      instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
+      instr->is_array;
+
+   emit_texture(op, dest, dest_type, coordinate, instr->coord_components,
+                shadow_comparitor,
+                lod, lod2, sample_index,
+                constant_offset, offset_value,
+                mcs, is_cube_array, sampler, sampler_reg);
 }
 
 }

From 418c004f802e63ca4e9f3456a46498d2fc543854 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Thu, 11 Jun 2015 12:32:26 +0200
Subject: [PATCH 0947/1208] nir: Fix output swizzle in get_mul_for_src

Avoid copying an overwritten swizzle, use the original values.

Example:

   Former swizzle[] = xyzw
   src->swizzle[] = zyxx

The expected output swizzle = zyxx but if we reuse swizzle in the loop,
then output swizzle would be zyzz.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir_opt_peephole_ffma.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/glsl/nir/nir_opt_peephole_ffma.c b/src/glsl/nir/nir_opt_peephole_ffma.c
index 798506b7595..a823adbb465 100644
--- a/src/glsl/nir/nir_opt_peephole_ffma.c
+++ b/src/glsl/nir/nir_opt_peephole_ffma.c
@@ -76,6 +76,7 @@ static nir_alu_instr *
 get_mul_for_src(nir_alu_src *src, int num_components,
                 uint8_t swizzle[4], bool *negate, bool *abs)
 {
+   uint8_t swizzle_tmp[4];
    assert(src->src.is_ssa && !src->abs && !src->negate);
 
    nir_instr *instr = src->src.ssa->parent_instr;
@@ -116,8 +117,18 @@ get_mul_for_src(nir_alu_src *src, int num_components,
    if (!alu)
       return NULL;
 
+   /* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
+    *
+    * Example:
+    *   Former swizzle[] = xyzw
+    *   src->swizzle[] = zyxx
+    *
+    *   Expected output swizzle = zyxx
+    *   If we reuse swizzle in the loop, then output swizzle would be zyzz.
+    */
+   memcpy(swizzle_tmp, swizzle, 4*sizeof(uint8_t));
    for (unsigned i = 0; i < num_components; i++)
-      swizzle[i] = swizzle[src->swizzle[i]];
+      swizzle[i] = swizzle_tmp[src->swizzle[i]];
 
    return alu;
 }

From 38fc4a91cd5c04fdd5921b8776f8e203513ab517 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 1 Jul 2015 09:51:25 +0200
Subject: [PATCH 0948/1208] i965/nir: Enable NIR-vec4 pass on geometry shaders

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 4 ++++
 src/mesa/drivers/dri/i965/brw_vec4.cpp   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 7c5095ddce3..819e4f2d3e4 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -135,6 +135,10 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
    }
 
+   if (brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].NirOptions = nir_options;
+   }
+
    compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions = nir_options;
    compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions = nir_options;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 6f6e6271a91..8d83887c2ad 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1721,7 +1721,7 @@ bool
 vec4_visitor::run(gl_clip_plane *clip_planes)
 {
    bool use_vec4_nir =
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL;
+      compiler->glsl_compiler_options[stage].NirOptions != NULL;
 
    sanity_param_count = prog->Parameters->NumParameters;
 

From 7ade42755f8900aaf67073214c073419f734e7a8 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 29 Jun 2015 13:37:31 +0200
Subject: [PATCH 0949/1208] i965/gs: Refactor ir_emit_vertex and
 ir_end_primitive

So the implementation is independent of GLSL IR and the visit methods of the
vec4 visitor. This way we will be able to reuse that implementation directly
from the NIR vec4 backend.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h          |  2 ++
 .../drivers/dri/i965/brw_vec4_gs_visitor.cpp  | 20 +++++++++++++++----
 .../drivers/dri/i965/brw_vec4_gs_visitor.h    |  2 ++
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 13 ++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 3ddfe88ad38..985886d7024 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -463,6 +463,8 @@ protected:
    virtual void emit_urb_write_header(int mrf) = 0;
    virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
    virtual int compute_array_stride(ir_dereference_array *ir);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
 
 private:
    /**
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 704644e7429..019efecac66 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -467,7 +467,7 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
 }
 
 void
-vec4_gs_visitor::visit(ir_emit_vertex *ir)
+vec4_gs_visitor::gs_emit_vertex(int stream_id)
 {
    this->current_annotation = "emit vertex: safety check";
 
@@ -481,7 +481,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
-   if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0)
+   if (stream_id > 0 && shader_prog->TransformFeedback.NumVarying == 0)
       return;
 
    /* To ensure that we don't output more vertices than the shader specified
@@ -560,7 +560,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
           c->prog_data.control_data_format ==
              GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
           this->current_annotation = "emit vertex: Stream control data bits";
-          set_stream_control_data_bits(ir->stream_id());
+          set_stream_control_data_bits(stream_id);
       }
 
       this->current_annotation = "emit vertex: increment vertex count";
@@ -573,7 +573,13 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
 }
 
 void
-vec4_gs_visitor::visit(ir_end_primitive *)
+vec4_gs_visitor::visit(ir_emit_vertex *ir)
+{
+   gs_emit_vertex(ir->stream_id());
+}
+
+void
+vec4_gs_visitor::gs_end_primitive()
 {
    /* We can only do EndPrimitive() functionality when the control data
     * consists of cut bits.  Fortunately, the only time it isn't is when the
@@ -623,6 +629,12 @@ vec4_gs_visitor::visit(ir_end_primitive *)
    emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
 }
 
+void
+vec4_gs_visitor::visit(ir_end_primitive *)
+{
+   gs_end_primitive();
+}
+
 static const unsigned *
 generate_assembly(struct brw_context *brw,
                   struct gl_shader_program *shader_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index 54cd2a1ff34..8cad570b85d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -87,6 +87,8 @@ protected:
    virtual int compute_array_stride(ir_dereference_array *ir);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
 
 protected:
    int setup_varying_inputs(int payload_reg, int *attribute_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index f6155a3a3b5..c5c0d2c73e0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -2993,12 +2993,25 @@ vec4_visitor::visit(ir_if *ir)
    emit(BRW_OPCODE_ENDIF);
 }
 
+void
+vec4_visitor::gs_emit_vertex(int stream_id)
+{
+   unreachable("not reached");
+}
+
 void
 vec4_visitor::visit(ir_emit_vertex *)
 {
    unreachable("not reached");
 }
 
+void
+vec4_visitor::gs_end_primitive()
+{
+   unreachable("not reached");
+}
+
+
 void
 vec4_visitor::visit(ir_end_primitive *)
 {

From 551af29d2d8be33b66641fe47ee5156489c16132 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 29 Jun 2015 13:52:30 +0200
Subject: [PATCH 0950/1208] i965/nir/gs: Handle geometry shaders inputs

Outputs from the vertex shader become array inputs in the geomtry shader,
but the arrays are interleaved, so we need to map our inputs accordingly.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources    |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp | 68 +++++++++++++++++++
 .../drivers/dri/i965/brw_vec4_gs_visitor.h    |  2 +
 3 files changed, 71 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 3585126526a..dfdad75329d 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -126,6 +126,7 @@ i965_FILES = \
 	brw_vec4_live_variables.cpp \
 	brw_vec4_live_variables.h \
 	brw_vec4_nir.cpp \
+	brw_vec4_gs_nir.cpp \
 	brw_vec4_reg_allocate.cpp \
 	brw_vec4_visitor.cpp \
 	brw_vec4_vp.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
new file mode 100644
index 00000000000..de38bdaa836
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_gs_visitor.h"
+
+namespace brw {
+
+void
+vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
+{
+   nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs);
+
+   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
+      int offset = var->data.driver_location;
+      if (var->type->base_type == GLSL_TYPE_ARRAY) {
+         /* Geometry shader inputs are arrays, but they use an unusual array
+          * layout: instead of all array elements for a given geometry shader
+          * input being stored consecutively, all geometry shader inputs are
+          * interleaved into one giant array. At this stage of compilation, we
+          * assume that the stride of the array is BRW_VARYING_SLOT_COUNT.
+          * Later, setup_attributes() will remap our accesses to the actual
+          * input array.
+          */
+         assert(var->type->length > 0);
+         int length = var->type->length;
+         int size = type_size(var->type) / length;
+         for (int i = 0; i < length; i++) {
+            int location = var->data.location + i * BRW_VARYING_SLOT_COUNT;
+            for (int j = 0; j < size; j++) {
+               src_reg src = src_reg(ATTR, location + j, var->type);
+               src = retype(src, brw_type_for_base_type(var->type));
+               nir_inputs[offset] = src;
+               offset++;
+            }
+         }
+      } else {
+         int size = type_size(var->type);
+         for (int i = 0; i < size; i++) {
+            src_reg src = src_reg(ATTR, var->data.location + i, var->type);
+            src = retype(src, brw_type_for_base_type(var->type));
+            nir_inputs[offset] = src;
+            offset++;
+         }
+      }
+   }
+}
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index 8cad570b85d..6a52d580621 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -75,6 +75,8 @@ public:
                    bool no_spills,
                    int shader_time_index);
 
+   virtual void nir_setup_inputs(nir_shader *shader);
+
 protected:
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type);

From 1836201fde1826c82f579fb132455c8df4176ecd Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 29 Jun 2015 14:08:11 +0200
Subject: [PATCH 0951/1208] i965/nir/gs: Implement EmitVertex and EndPrimitive

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp | 22 +++++++++++++++++++
 .../drivers/dri/i965/brw_vec4_gs_visitor.h    |  1 +
 2 files changed, 23 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index de38bdaa836..f627a8dbdae 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -65,4 +65,26 @@ vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
       }
    }
 }
+
+void
+vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg dest;
+   src_reg src;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_emit_vertex: {
+      int stream_id = instr->const_index[0];
+      gs_emit_vertex(stream_id);
+      break;
+   }
+
+   case nir_intrinsic_end_primitive:
+      gs_end_primitive();
+      break;
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index 6a52d580621..517e99d7862 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -91,6 +91,7 @@ protected:
    virtual void visit(ir_end_primitive *);
    virtual void gs_emit_vertex(int stream_id);
    virtual void gs_end_primitive();
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
 
 protected:
    int setup_varying_inputs(int payload_reg, int *attribute_map,

From 7eced3aa863394c6e74ac3f037ed1cf9c481fe37 Mon Sep 17 00:00:00 2001
From: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Date: Mon, 13 Jul 2015 15:51:17 +0200
Subject: [PATCH 0952/1208] i965/gs/gen6: Refactor ir_emit_vertex and
 ir_end_primitive for gen6

So the implementation is independent of GLSL IR and the visit methods of the
gen6 GS visitor. This way we will be able to reuse that implementation directly
from the NIR vec4 backend.

Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp | 13 ++++++++++++-
 src/mesa/drivers/dri/i965/gen6_gs_visitor.h   |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 782687aac57..68e443d38a5 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -147,7 +147,12 @@ gen6_gs_visitor::emit_prolog()
 }
 
 void
-gen6_gs_visitor::visit(ir_emit_vertex *)
+gen6_gs_visitor::visit(ir_emit_vertex *ir)
+{
+   gs_emit_vertex(ir->stream_id());
+}
+void
+gen6_gs_visitor::gs_emit_vertex(int stream_id)
 {
    this->current_annotation = "gen6 emit vertex";
    /* Honor max_vertex layout indication in geometry shader by ignoring any
@@ -223,6 +228,12 @@ gen6_gs_visitor::visit(ir_emit_vertex *)
 
 void
 gen6_gs_visitor::visit(ir_end_primitive *)
+{
+   gs_end_primitive();
+}
+
+void
+gen6_gs_visitor::gs_end_primitive()
 {
    this->current_annotation = "gen6 end primitive";
    /* Calling EndPrimitive() is optional for point output. In this case we set
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index 862d0eaacf0..4cf94893261 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -51,6 +51,8 @@ protected:
    virtual void emit_thread_end();
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
    virtual void emit_urb_write_header(int mrf);
    virtual void emit_urb_write_opcode(bool complete,
                                       int base_mrf,

From 287b006a673dabe3e21cc207a1b4622ef91a877e Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 1 Jul 2015 10:12:10 +0200
Subject: [PATCH 0953/1208] i965/nir/gs: Implement support for gl_InvocationID
 system value

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp | 28 +++++++++++++++++++
 .../drivers/dri/i965/brw_vec4_gs_visitor.h    |  1 +
 2 files changed, 29 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index f627a8dbdae..d85fb6f31ec 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -66,6 +66,25 @@ vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
    }
 }
 
+void
+vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg *reg;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_invocation_id:
+      reg = &this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      if (reg->file == BAD_FILE)
+         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID,
+                                                 glsl_type::int_type);
+      break;
+
+   default:
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+   }
+
+}
+
 void
 vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
@@ -83,6 +102,15 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       gs_end_primitive();
       break;
 
+   case nir_intrinsic_load_invocation_id: {
+      src_reg invocation_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
+      assert(invocation_id.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, invocation_id.type);
+      emit(MOV(dest, invocation_id));
+      break;
+   }
+
    default:
       vec4_visitor::nir_emit_intrinsic(instr);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index 517e99d7862..0e8fefabecc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -76,6 +76,7 @@ public:
                    int shader_time_index);
 
    virtual void nir_setup_inputs(nir_shader *shader);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
 
 protected:
    virtual dst_reg *make_reg_for_system_value(int location,

From 90825e3ca977057c8f3d6ad2d1aa38277cc3ff11 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 8 Jul 2015 12:44:15 +0200
Subject: [PATCH 0954/1208] i965/vec4: Enable NIR-vec4 pass on
 ARB_vertex_programs

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 47 +++++++++++++-------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 8d83887c2ad..88b601a6cca 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1732,18 +1732,16 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
 
    emit_prolog();
 
-   if (shader) {
-      if (use_vec4_nir) {
-         assert(prog->nir != NULL);
-         emit_nir_code();
-         if (failed)
-            return false;
-      } else {
-         /* Generate VS IR for main().  (the visitor only descends into
-          * functions called "main").
-          */
-         visit_instructions(shader->base.ir);
-      }
+   if (use_vec4_nir) {
+      assert(prog->nir != NULL);
+      emit_nir_code();
+      if (failed)
+         return false;
+   } else if (shader) {
+      /* Generate VS IR for main().  (the visitor only descends into
+       * functions called "main").
+       */
+      visit_instructions(shader->base.ir);
    } else {
       emit_program_code();
    }
@@ -1921,18 +1919,21 @@ brw_vs_emit(struct brw_context *brw,
    if (unlikely(INTEL_DEBUG & DEBUG_VS))
       brw_dump_ir("vertex", prog, &shader->base, &vp->Base);
 
-   if (brw->intelScreen->compiler->scalar_vs) {
-      if (!vp->Base.nir) {
-         /* Normally we generate NIR in LinkShader() or
-          * ProgramStringNotify(), but Mesa's fixed-function vertex program
-          * handling doesn't notify the driver at all.  Just do it here, at
-          * the last minute, even though it's lame.
-          */
-         assert(vp->Base.Id == 0 && prog == NULL);
-         vp->Base.nir =
-            brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX, true);
-      }
+   if (!vp->Base.nir &&
+       (brw->intelScreen->compiler->scalar_vs ||
+        brw->intelScreen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL)) {
+      /* Normally we generate NIR in LinkShader() or
+       * ProgramStringNotify(), but Mesa's fixed-function vertex program
+       * handling doesn't notify the driver at all.  Just do it here, at
+       * the last minute, even though it's lame.
+       */
+      assert(vp->Base.Id == 0 && prog == NULL);
+      vp->Base.nir =
+         brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX,
+                        brw->intelScreen->compiler->scalar_vs);
+   }
 
+   if (brw->intelScreen->compiler->scalar_vs) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_visitor v(brw->intelScreen->compiler, brw,

From 82f2e706bfd646b91bc0b8beecdff4e54b1f7b04 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Mon, 29 Jun 2015 14:21:38 +0200
Subject: [PATCH 0955/1208] i965/nir/vec4: Handle uniforms on vertex programs

The implementation takes into account that on ARB_vertex_program
only a single nir variable is generated to support all the uniform data.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 34 ++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index a5af33e6e10..b13465bcb30 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -153,8 +153,38 @@ vec4_visitor::nir_setup_uniforms(nir_shader *shader)
             nir_setup_uniform(var);
       }
    } else {
-      /* ARB_vertex_program is not supported yet */
-      assert("Not implemented");
+      /* For ARB_vertex_program, only a single "parameters" variable is
+       * generated to support uniform data.
+       */
+      nir_variable *var = (nir_variable *) shader->uniforms.get_head();
+      assert(shader->uniforms.length() == 1 &&
+             strcmp(var->name, "parameters") == 0);
+
+      assert(uniforms < uniform_array_size);
+      this->uniform_size[uniforms] = type_size(var->type);
+
+      struct gl_program_parameter_list *plist = prog->Parameters;
+      for (unsigned p = 0; p < plist->NumParameters; p++) {
+         uniform_vector_size[uniforms] = plist->Parameters[p].Size;
+
+         /* Parameters should be either vec4 uniforms or single component
+          * constants; matrices and other larger types should have been broken
+          * down earlier.
+          */
+         assert(uniform_vector_size[uniforms] <= 4);
+
+         int i;
+         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
+            stage_prog_data->param[uniforms * 4 + i] = &plist->ParameterValues[p][i];
+         }
+         for (; i < 4; i++) {
+            static const gl_constant_value zero = { 0.0 };
+            stage_prog_data->param[uniforms * 4 + i] = &zero;
+         }
+
+         nir_uniform_driver_location[uniforms] = var->data.driver_location;
+         uniforms++;
+      }
    }
 }
 

From 34d162260f513a7eaec12611e3859bb34230cf33 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Wed, 8 Jul 2015 16:08:17 +0200
Subject: [PATCH 0956/1208] i965/vec4: Handle uniform and GRF array access on
 vertex programs (NIR)

When the NIR-vec4 pass is enabled, handles uniform and GRF array access
on ARB_vertex_program like it is done on vertex shaders.

When the old IR-vec4 pass is used, emit_program_code() emits pull constant
loads directly instead of using relative addressing, hence to call to
move_uniform_array_access_to_pull_constants() is not needed and it is enough
to call to split_uniform_registers().

The patch also calls to move_grf_array_access_to_scratch() like it is
done for shaders, however I suspect this is a no-op for vertex programs and
we could remove it.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 88b601a6cca..f18915a8e38 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1760,7 +1760,7 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
     * that we have reladdr computations available for CSE, since we'll
     * often do repeated subexpressions for those.
     */
-   if (shader) {
+   if (shader || use_vec4_nir) {
       move_grf_array_access_to_scratch();
       move_uniform_array_access_to_pull_constants();
    } else {

From da1b1bf85cdc691ec27f379de84dec495cdd51e0 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 15 Jul 2015 09:32:17 +0200
Subject: [PATCH 0957/1208] i965/nir: Do not scalarize phis in non-scalar
 setups

Significantly reduces register pressure in some piglit tests.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 49d17422546..b5788fa2e33 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -42,8 +42,12 @@ nir_optimize(nir_shader *nir, bool is_scalar)
 
       progress |= nir_copy_prop(nir);
       nir_validate_shader(nir);
-      nir_lower_phis_to_scalar(nir);
-      nir_validate_shader(nir);
+
+      if (is_scalar) {
+         nir_lower_phis_to_scalar(nir);
+         nir_validate_shader(nir);
+      }
+
       progress |= nir_copy_prop(nir);
       nir_validate_shader(nir);
       progress |= nir_opt_dce(nir);

From 2b81cefb3fec3c5c17e7ef9f95c9681abfad5386 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 10 Jun 2015 17:09:16 -0700
Subject: [PATCH 0958/1208] i965: Trivial formatting changes in brw_wm.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_wm.c | 84 +++++++++++++++---------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index b590b177ece..9b90a7c191f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -1,34 +1,28 @@
 /*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
+ * Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ * Intel funded Tungsten Graphics to
+ * develop this 3D driver.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 #include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_state.h"
@@ -350,13 +344,15 @@ static uint8_t
 gen6_gather_workaround(GLenum internalformat)
 {
    switch (internalformat) {
-      case GL_R8I: return WA_SIGN | WA_8BIT;
-      case GL_R8UI: return WA_8BIT;
-      case GL_R16I: return WA_SIGN | WA_16BIT;
-      case GL_R16UI: return WA_16BIT;
-      /* note that even though GL_R32I and GL_R32UI have format overrides
-       * in the surface state, there is no shader w/a required */
-      default: return 0;
+   case GL_R8I: return WA_SIGN | WA_8BIT;
+   case GL_R8UI: return WA_8BIT;
+   case GL_R16I: return WA_SIGN | WA_16BIT;
+   case GL_R16UI: return WA_16BIT;
+   default:
+      /* Note that even though GL_R32I and GL_R32UI have format overrides in
+       * the surface state, there is no shader w/a required.
+       */
+      return 0;
    }
 }
 
@@ -403,8 +399,9 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
 	       key->gl_clamp_mask[2] |= 1 << s;
 	 }
 
-         /* gather4's channel select for green from RG32F is broken;
-          * requires a shader w/a on IVB; fixable with just SCS on HSW. */
+         /* gather4's channel select for green from RG32F is broken; requires
+          * a shader w/a on IVB; fixable with just SCS on HSW.
+          */
          if (brw->gen == 7 && !brw->is_haswell && prog->UsesGather) {
             if (img->InternalFormat == GL_RG32F)
                key->gather_channel_quirk_mask |= 1 << s;
@@ -453,13 +450,13 @@ brw_wm_state_dirty (struct brw_context *brw)
                           BRW_NEW_VUE_MAP_GEOM_OUT);
 }
 
-static void brw_wm_populate_key( struct brw_context *brw,
-				 struct brw_wm_prog_key *key )
+static void
+brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
 {
    struct gl_context *ctx = &brw->ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp =
-      (struct brw_fragment_program *)brw->fragment_program;
+      (struct brw_fragment_program *) brw->fragment_program;
    const struct gl_program *prog = (struct gl_program *) brw->fragment_program;
    GLuint lookup = 0;
    GLuint line_aa;
@@ -605,7 +602,8 @@ static void brw_wm_populate_key( struct brw_context *brw,
     * like GL requires.  Fix that by building the alpha test into the
     * shader, and we'll skip enabling the fixed function alpha test.
     */
-   if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && ctx->Color.AlphaEnabled) {
+   if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
+       ctx->Color.AlphaEnabled) {
       key->alpha_test_func = ctx->Color.AlphaFunc;
       key->alpha_test_ref = ctx->Color.AlphaRef;
    }

From 5b6218395c303ff82a19294c05c63c7b92d24e3f Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 18 Jun 2015 18:45:44 -0700
Subject: [PATCH 0959/1208] i965: Trivial formatting changes in brw_draw.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_draw.c | 102 +++++++++++++--------------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index b5ef276edfc..9113d0f5a8e 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -104,8 +104,8 @@ get_hw_prim_for_gl_prim(int mode)
  * programs be immune to the active primitive (ie. cope with all
  * possibilities).  That may not be realistic however.
  */
-static void brw_set_prim(struct brw_context *brw,
-                         const struct _mesa_prim *prim)
+static void
+brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
@@ -138,15 +138,12 @@ static void brw_set_prim(struct brw_context *brw,
    }
 }
 
-static void gen6_set_prim(struct brw_context *brw,
-                          const struct _mesa_prim *prim)
+static void
+gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 {
-   uint32_t hw_prim;
-
    DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
-   hw_prim = get_hw_prim_for_gl_prim(prim->mode);
-
+   const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
    if (hw_prim != brw->primitive) {
       brw->primitive = hw_prim;
       brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;
@@ -162,7 +159,8 @@ static void gen6_set_prim(struct brw_context *brw,
  * quads so that those dangling vertices won't get drawn when we convert to
  * trifans/tristrips.
  */
-static GLuint trim(GLenum prim, GLuint length)
+static GLuint
+trim(GLenum prim, GLuint length)
 {
    if (prim == GL_QUAD_STRIP)
       return length > 3 ? (length - length % 2) : 0;
@@ -173,14 +171,14 @@ static GLuint trim(GLenum prim, GLuint length)
 }
 
 
-static void brw_emit_prim(struct brw_context *brw,
-			  const struct _mesa_prim *prim,
-			  uint32_t hw_prim)
+static void
+brw_emit_prim(struct brw_context *brw,
+              const struct _mesa_prim *prim,
+              uint32_t hw_prim)
 {
    int verts_per_instance;
    int vertex_access_type;
    int indirect_flag;
-   int predicate_enable;
 
    DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
        prim->start, prim->count);
@@ -216,9 +214,8 @@ static void brw_emit_prim(struct brw_context *brw,
     * and missed flushes of the render cache as it heads to other parts of
     * the besides the draw code.
     */
-   if (brw->always_flush_cache) {
+   if (brw->always_flush_cache)
       brw_emit_mi_flush(brw);
-   }
 
    /* If indirect, emit a bunch of loads from the indirect BO. */
    if (prim->is_indirect) {
@@ -256,18 +253,16 @@ static void brw_emit_prim(struct brw_context *brw,
          OUT_BATCH(0);
          ADVANCE_BATCH();
       }
-   }
-   else {
+   } else {
       indirect_flag = 0;
    }
 
    BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);
 
    if (brw->gen >= 7) {
-      if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
-         predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE;
-      else
-         predicate_enable = 0;
+      const int predicate_enable =
+         (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
+         ? GEN7_3DPRIM_PREDICATE_ENABLE : 0;
 
       OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
       OUT_BATCH(hw_prim | vertex_access_type);
@@ -283,14 +278,14 @@ static void brw_emit_prim(struct brw_context *brw,
    OUT_BATCH(base_vertex_location);
    ADVANCE_BATCH();
 
-   if (brw->always_flush_cache) {
+   if (brw->always_flush_cache)
       brw_emit_mi_flush(brw);
-   }
 }
 
 
-static void brw_merge_inputs( struct brw_context *brw,
-		       const struct gl_client_array *arrays[])
+static void
+brw_merge_inputs(struct brw_context *brw,
+                 const struct gl_client_array *arrays[])
 {
    const struct gl_context *ctx = &brw->ctx;
    GLuint i;
@@ -359,7 +354,8 @@ static void brw_merge_inputs( struct brw_context *brw,
  * Also mark any render targets which will be textured as needing a render
  * cache flush.
  */
-static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
+static void
+brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
@@ -399,21 +395,22 @@ static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
-static void brw_try_draw_prims( struct gl_context *ctx,
-				     const struct gl_client_array *arrays[],
-				     const struct _mesa_prim *prims,
-				     GLuint nr_prims,
-				     const struct _mesa_index_buffer *ib,
-				     GLuint min_index,
-				     GLuint max_index,
-				     struct gl_buffer_object *indirect)
+static void
+brw_try_draw_prims(struct gl_context *ctx,
+                   const struct gl_client_array *arrays[],
+                   const struct _mesa_prim *prims,
+                   GLuint nr_prims,
+                   const struct _mesa_index_buffer *ib,
+                   GLuint min_index,
+                   GLuint max_index,
+                   struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
    GLuint i;
    bool fail_next = false;
 
    if (ctx->NewState)
-      _mesa_update_state( ctx );
+      _mesa_update_state(ctx);
 
    /* Find the highest sampler unit used by each shader program.  A bit-count
     * won't work since ARB programs use the texture unit number as the sampler
@@ -433,7 +430,7 @@ static void brw_try_draw_prims( struct gl_context *ctx,
     * software fallback will segfault if it attempts to access any
     * texture level other than level 0.
     */
-   brw_validate_textures( brw );
+   brw_validate_textures(brw);
 
    intel_prepare_render(brw);
 
@@ -445,7 +442,7 @@ static void brw_try_draw_prims( struct gl_context *ctx,
 
    /* Bind all inputs, derive varying and size information:
     */
-   brw_merge_inputs( brw, arrays );
+   brw_merge_inputs(brw, arrays);
 
    brw->ib.ib = ib;
    brw->ctx.NewDriverState |= BRW_NEW_INDICES;
@@ -553,15 +550,16 @@ retry:
    return;
 }
 
-void brw_draw_prims( struct gl_context *ctx,
-		     const struct _mesa_prim *prims,
-		     GLuint nr_prims,
-		     const struct _mesa_index_buffer *ib,
-		     GLboolean index_bounds_valid,
-		     GLuint min_index,
-		     GLuint max_index,
-		     struct gl_transform_feedback_object *unused_tfb_object,
-		     struct gl_buffer_object *indirect )
+void
+brw_draw_prims(struct gl_context *ctx,
+               const struct _mesa_prim *prims,
+               GLuint nr_prims,
+               const struct _mesa_index_buffer *ib,
+               GLboolean index_bounds_valid,
+               GLuint min_index,
+               GLuint max_index,
+               struct gl_transform_feedback_object *unused_tfb_object,
+               struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gl_client_array **arrays = ctx->Array._DrawArrays;
@@ -604,26 +602,28 @@ void brw_draw_prims( struct gl_context *ctx,
     * manage it.  swrast doesn't support our featureset, so we can't fall back
     * to it.
     */
-   brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index, indirect);
+   brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index,
+                      indirect);
 }
 
-void brw_draw_init( struct brw_context *brw )
+void
+brw_draw_init(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    struct vbo_context *vbo = vbo_context(ctx);
-   int i;
 
    /* Register our drawing function:
     */
    vbo->draw_prims = brw_draw_prims;
 
-   for (i = 0; i < VERT_ATTRIB_MAX; i++)
+   for (int i = 0; i < VERT_ATTRIB_MAX; i++)
       brw->vb.inputs[i].buffer = -1;
    brw->vb.nr_buffers = 0;
    brw->vb.nr_enabled = 0;
 }
 
-void brw_draw_destroy( struct brw_context *brw )
+void
+brw_draw_destroy(struct brw_context *brw)
 {
    int i;
 

From 680d09b072af7ea1541cfd4fbc62c83e8bd02d0d Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 08:20:01 -0700
Subject: [PATCH 0960/1208] i965: Trivial formatting changes in
 brw_draw_upload.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_draw_upload.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index c6dd69bbd29..cbfd5855410 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -649,7 +649,8 @@ emit_vertex_buffer_state(struct brw_context *brw,
 }
 #define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
 
-static void brw_emit_vertices(struct brw_context *brw)
+static void
+brw_emit_vertices(struct brw_context *brw)
 {
    GLuint i;
 
@@ -859,7 +860,8 @@ const struct brw_tracked_state brw_vertices = {
    .emit = brw_emit_vertices,
 };
 
-static void brw_upload_indices(struct brw_context *brw)
+static void
+brw_upload_indices(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
@@ -939,7 +941,8 @@ const struct brw_tracked_state brw_indices = {
    .emit = brw_upload_indices,
 };
 
-static void brw_emit_index_buffer(struct brw_context *brw)
+static void
+brw_emit_index_buffer(struct brw_context *brw)
 {
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint cut_index_setting;

From 07433760e3311ff17c0f909514ececdae9f6e9c6 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 08:20:01 -0700
Subject: [PATCH 0961/1208] i965: Trivial formatting changes in
 brw_misc_state.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_misc_state.c | 49 ++++++++++------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 16b0ed28d0d..e9d9467d330 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -44,7 +44,8 @@
 #include "main/glformats.h"
 
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
-static void upload_drawing_rect(struct brw_context *brw)
+static void
+upload_drawing_rect(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct gl_framebuffer *fb = ctx->DrawBuffer;
@@ -73,7 +74,8 @@ const struct brw_tracked_state brw_drawing_rect = {
  * The state pointers in this packet are all relative to the general state
  * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
  */
-static void upload_pipelined_state_pointers(struct brw_context *brw )
+static void
+upload_pipelined_state_pointers(struct brw_context *brw)
 {
    if (brw->gen == 5) {
       /* Need to flush before changing clip max threads for errata. */
@@ -104,7 +106,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    brw->ctx.NewDriverState |= BRW_NEW_PSP;
 }
 
-static void upload_psp_urb_cbs(struct brw_context *brw )
+static void
+upload_psp_urb_cbs(struct brw_context *brw)
 {
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
@@ -700,13 +703,11 @@ const struct brw_tracked_state brw_depthbuffer = {
    .emit = brw_emit_depthbuffer,
 };
 
-
-
-/***********************************************************************
+/**
  * Polygon stipple packet
  */
-
-static void upload_polygon_stipple(struct brw_context *brw)
+static void
+upload_polygon_stipple(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    GLuint i;
@@ -728,8 +729,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       for (i = 0; i < 32; i++)
 	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
-   }
-   else {
+   } else {
       for (i = 0; i < 32; i++)
 	 OUT_BATCH(ctx->PolygonStipple[i]);
    }
@@ -745,12 +745,11 @@ const struct brw_tracked_state brw_polygon_stipple = {
    .emit = upload_polygon_stipple
 };
 
-
-/***********************************************************************
+/**
  * Polygon stipple offset packet
  */
-
-static void upload_polygon_stipple_offset(struct brw_context *brw)
+static void
+upload_polygon_stipple_offset(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -785,10 +784,11 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
    .emit = upload_polygon_stipple_offset
 };
 
-/**********************************************************************
+/**
  * AA Line parameters
  */
-static void upload_aa_line_parameters(struct brw_context *brw)
+static void
+upload_aa_line_parameters(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -815,11 +815,11 @@ const struct brw_tracked_state brw_aa_line_parameters = {
    .emit = upload_aa_line_parameters
 };
 
-/***********************************************************************
+/**
  * Line stipple packet
  */
-
-static void upload_line_stipple(struct brw_context *brw)
+static void
+upload_line_stipple(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    GLfloat tmp;
@@ -837,8 +837,7 @@ static void upload_line_stipple(struct brw_context *brw)
       tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<16);
       OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor);
-   }
-   else {
+   } else {
       /* in U1.13 */
       tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<13);
@@ -856,7 +855,6 @@ const struct brw_tracked_state brw_line_stipple = {
    .emit = upload_line_stipple
 };
 
-
 void
 brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
 {
@@ -872,11 +870,9 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
    ADVANCE_BATCH();
 }
 
-
-/***********************************************************************
+/**
  * Misc invariant state packets
  */
-
 void
 brw_upload_invariant_state(struct brw_context *brw)
 {
@@ -930,7 +926,8 @@ const struct brw_tracked_state brw_invariant_state = {
  * surface state objects, but not the surfaces that the surface state
  * objects point to.
  */
-static void upload_state_base_address( struct brw_context *brw )
+static void
+upload_state_base_address(struct brw_context *brw)
 {
    /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
     * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be

From f917a65b3eeaf0e201bd7e695a5d13403e7ad487 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 08:20:01 -0700
Subject: [PATCH 0962/1208] i965: Trivial formatting changes in
 gen6_multisample_state.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/gen6_multisample_state.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index cf1421e5e9f..8444c0c9bae 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -143,7 +143,6 @@ gen6_emit_3dstate_multisample(struct brw_context *brw,
    ADVANCE_BATCH();
 }
 
-
 unsigned
 gen6_determine_sample_mask(struct brw_context *brw)
 {
@@ -176,7 +175,6 @@ gen6_determine_sample_mask(struct brw_context *brw)
    }
 }
 
-
 /**
  * 3DSTATE_SAMPLE_MASK
  */
@@ -189,15 +187,14 @@ gen6_emit_3dstate_sample_mask(struct brw_context *brw, unsigned mask)
    ADVANCE_BATCH();
 }
 
-
-static void upload_multisample_state(struct brw_context *brw)
+static void
+upload_multisample_state(struct brw_context *brw)
 {
    /* BRW_NEW_NUM_SAMPLES */
    gen6_emit_3dstate_multisample(brw, brw->num_samples);
    gen6_emit_3dstate_sample_mask(brw, gen6_determine_sample_mask(brw));
 }
 
-
 const struct brw_tracked_state gen6_multisample_state = {
    .dirty = {
       .mesa = _NEW_MULTISAMPLE,

From d302f51a1ee949fae5dc53f3c872c2712021caf7 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 08:20:01 -0700
Subject: [PATCH 0963/1208] i965: Trivial formatting changes in gen7_vs_state.c

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/gen7_vs_state.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 4b17d06fa83..00bc6f24dbe 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -62,6 +62,7 @@ gen7_upload_constant_state(struct brw_context *brw,
       OUT_BATCH(active ? stage_state->push_const_size : 0);
       OUT_BATCH(0);
    }
+
    /* Pointer to the constant buffer.  Covered by the set of state flags
     * from gen6_prepare_wm_contants
     */
@@ -95,15 +96,14 @@ gen7_upload_constant_state(struct brw_context *brw,
 
    ADVANCE_BATCH();
 
-  /* On SKL+ the new constants don't take effect until the next corresponding
-   * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure
-   * that is sent
-   */
+   /* On SKL+ the new constants don't take effect until the next corresponding
+    * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure
+    * that is sent
+    */
    if (brw->gen >= 9)
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
 }
 
-
 static void
 upload_vs_state(struct brw_context *brw)
 {

From 7a12e646d3874f4ff755e05cfb27560d11d075a7 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 08:43:13 -0700
Subject: [PATCH 0964/1208] i965: Remove extern declaration for nonexistent
 state atom

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_state.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 2eff1b50e83..a99bbbd30ca 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -121,7 +121,6 @@ extern const struct brw_tracked_state gen6_wm_state;
 extern const struct brw_tracked_state gen7_depthbuffer;
 extern const struct brw_tracked_state gen7_clip_state;
 extern const struct brw_tracked_state gen7_disable_stages;
-extern const struct brw_tracked_state gen7_gs_push_constants;
 extern const struct brw_tracked_state gen7_gs_state;
 extern const struct brw_tracked_state gen7_ps_state;
 extern const struct brw_tracked_state gen7_push_constant_space;

From 93977d3a151675946c03ec28102c651691cdb0bd Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Jun 2015 10:45:34 -0700
Subject: [PATCH 0965/1208] i965: Make gen7_upload_ps_state static

It is only ever called from within the same file.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_state.h     | 9 ---------
 src/mesa/drivers/dri/i965/gen7_wm_state.c | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index a99bbbd30ca..cf3a96c7bee 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -266,15 +266,6 @@ void brw_update_renderbuffer_surfaces(struct brw_context *brw,
                                       uint32_t render_target_start,
                                       uint32_t *surf_offset);
 
-/* gen7_wm_state.c */
-void
-gen7_upload_ps_state(struct brw_context *brw,
-                     const struct gl_fragment_program *fp,
-                     const struct brw_stage_state *stage_state,
-                     const struct brw_wm_prog_data *prog_data,
-                     bool enable_dual_src_blend, unsigned sample_mask,
-                     unsigned fast_clear_op);
-
 /* gen7_wm_surface_state.c */
 uint32_t gen7_surface_tiling_mode(uint32_t tiling);
 uint32_t gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout l);
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index ea11ae845e3..d7be58dd0f7 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -127,7 +127,7 @@ const struct brw_tracked_state gen7_wm_state = {
    .emit = upload_wm_state,
 };
 
-void
+static void
 gen7_upload_ps_state(struct brw_context *brw,
                      const struct gl_fragment_program *fp,
                      const struct brw_stage_state *stage_state,

From 7ac946e546bba440f87ce95ef022745201744f9c Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 30 Jul 2015 06:00:56 -0700
Subject: [PATCH 0966/1208] glsl: Add constuctors for the common cases of
 glsl_struct_field

Fixes a giant pile of GCC warnings:

builtin_types.cpp:60:1: warning: missing initializer for member 'glsl_struct_field::stream' [-Wmissing-field-initializers]

I had to add a default constructor because a non-default constructor
was added.  Otherwise the only constructor would be the one with
parameters, and all the plases like

    glsl_struct_field foo;

would fail to compile.

I wanted to do this in two patches.  All of the initializers of
glsl_struct_field structures had to be converted to use the
constructor because C++ apparently forces you to do one or the other:

builtin_types.cpp:61:1: error: could not convert '{glsl_type::float_type, "near", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0, -1}' from '<brace-enclosed initializer list>' to 'glsl_struct_field'

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
---
 src/glsl/builtin_types.cpp         | 74 +++++++++++++++---------------
 src/glsl/glsl_types.h              | 13 ++++++
 src/glsl/tests/general_ir_test.cpp | 12 +----
 src/glsl/tests/varyings_test.cpp   | 10 +---
 4 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp
index d92e2eb3007..ffbc5e6fdbc 100644
--- a/src/glsl/builtin_types.cpp
+++ b/src/glsl/builtin_types.cpp
@@ -54,64 +54,64 @@
       &glsl_type::_struct_##NAME##_type;
 
 static const struct glsl_struct_field gl_DepthRangeParameters_fields[] = {
-   { glsl_type::float_type, "near", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "far",  -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "diff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::float_type, "near"),
+   glsl_struct_field(glsl_type::float_type, "far"),
+   glsl_struct_field(glsl_type::float_type, "diff"),
 };
 
 static const struct glsl_struct_field gl_PointParameters_fields[] = {
-   { glsl_type::float_type, "size", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "sizeMin", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "sizeMax", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "fadeThresholdSize", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceConstantAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceLinearAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceQuadraticAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::float_type, "size"),
+   glsl_struct_field(glsl_type::float_type, "sizeMin"),
+   glsl_struct_field(glsl_type::float_type, "sizeMax"),
+   glsl_struct_field(glsl_type::float_type, "fadeThresholdSize"),
+   glsl_struct_field(glsl_type::float_type, "distanceConstantAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "distanceLinearAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "distanceQuadraticAttenuation"),
 };
 
 static const struct glsl_struct_field gl_MaterialParameters_fields[] = {
-   { glsl_type::vec4_type, "emission", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "shininess", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "emission"),
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
+   glsl_struct_field(glsl_type::float_type, "shininess"),
 };
 
 static const struct glsl_struct_field gl_LightSourceParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "position", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "halfVector", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec3_type, "spotDirection", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotExponent", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotCutoff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotCosCutoff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "constantAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "linearAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "quadraticAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
+   glsl_struct_field(glsl_type::vec4_type, "position"),
+   glsl_struct_field(glsl_type::vec4_type, "halfVector"),
+   glsl_struct_field(glsl_type::vec3_type, "spotDirection"),
+   glsl_struct_field(glsl_type::float_type, "spotExponent"),
+   glsl_struct_field(glsl_type::float_type, "spotCutoff"),
+   glsl_struct_field(glsl_type::float_type, "spotCosCutoff"),
+   glsl_struct_field(glsl_type::float_type, "constantAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "linearAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "quadraticAttenuation"),
 };
 
 static const struct glsl_struct_field gl_LightModelParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
 };
 
 static const struct glsl_struct_field gl_LightModelProducts_fields[] = {
-   { glsl_type::vec4_type, "sceneColor", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "sceneColor"),
 };
 
 static const struct glsl_struct_field gl_LightProducts_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
 };
 
 static const struct glsl_struct_field gl_FogParameters_fields[] = {
-   { glsl_type::vec4_type, "color", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "density", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "start", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "end", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "scale", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "color"),
+   glsl_struct_field(glsl_type::float_type, "density"),
+   glsl_struct_field(glsl_type::float_type, "start"),
+   glsl_struct_field(glsl_type::float_type, "end"),
+   glsl_struct_field(glsl_type::float_type, "scale"),
 };
 
 #include "builtin_type_macros.h"
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index 52672b313c0..e7c73dac3c3 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -781,6 +781,19 @@ struct glsl_struct_field {
     * streams (as in ir_variable::stream). -1 otherwise.
     */
    int stream;
+
+   glsl_struct_field(const struct glsl_type *_type, const char *_name)
+      : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
+        sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
+        stream(-1)
+   {
+      /* empty */
+   }
+
+   glsl_struct_field()
+   {
+      /* empty */
+   }
 };
 
 static inline unsigned int
diff --git a/src/glsl/tests/general_ir_test.cpp b/src/glsl/tests/general_ir_test.cpp
index 882642d141b..217305bf847 100644
--- a/src/glsl/tests/general_ir_test.cpp
+++ b/src/glsl/tests/general_ir_test.cpp
@@ -31,11 +31,7 @@ TEST(ir_variable_constructor, interface)
    void *mem_ctx = ralloc_context(NULL);
 
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    const glsl_type *const interface =
@@ -60,11 +56,7 @@ TEST(ir_variable_constructor, interface_array)
    void *mem_ctx = ralloc_context(NULL);
 
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    const glsl_type *const interface =
diff --git a/src/glsl/tests/varyings_test.cpp b/src/glsl/tests/varyings_test.cpp
index 62f8c6bc5ad..0c4e0a471b8 100644
--- a/src/glsl/tests/varyings_test.cpp
+++ b/src/glsl/tests/varyings_test.cpp
@@ -76,15 +76,7 @@ public:
 link_varyings::link_varyings()
 {
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false,
-         0,
-         0,
-         0,
-         0
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    this->simple_interface =

From e23cbaadaac0c67a72b10e3dd14b75abc19ab3c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Segu=C3=AD?= <alesegdia@gmail.com>
Date: Mon, 3 Aug 2015 02:15:20 +0200
Subject: [PATCH 0967/1208] glsl: replace old hash table with new and faster
 one

The util/hash_table was intended to be a fast hash table
replacement for the program/hash_table see 35fd61bd99c1 and
72e55bb6888ff.

Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/ir_print_visitor.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 78475f39b78..8dbd938c58b 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -25,7 +25,7 @@
 #include "glsl_types.h"
 #include "glsl_parser_extras.h"
 #include "main/macros.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 static void print_type(FILE *f, const glsl_type *t);
 
@@ -89,14 +89,14 @@ ir_print_visitor::ir_print_visitor(FILE *f)
 {
    indentation = 0;
    printable_names =
-      hash_table_ctor(32, hash_table_pointer_hash, hash_table_pointer_compare);
+      _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
    symbols = _mesa_symbol_table_ctor();
    mem_ctx = ralloc_context(NULL);
 }
 
 ir_print_visitor::~ir_print_visitor()
 {
-   hash_table_dtor(printable_names);
+   _mesa_hash_table_destroy(printable_names, NULL);
    _mesa_symbol_table_dtor(symbols);
    ralloc_free(mem_ctx);
 }
@@ -121,18 +121,22 @@ ir_print_visitor::unique_name(ir_variable *var)
    }
 
    /* Do we already have a name for this variable? */
-   const char *name = (const char *) hash_table_find(this->printable_names, var);
-   if (name != NULL)
-      return name;
+   struct hash_entry * entry =
+      _mesa_hash_table_search(this->printable_names, var);
+
+   if (entry != NULL) {
+      return (const char *) entry->data;
+   }
 
    /* If there's no conflict, just use the original name */
+   const char* name = NULL;
    if (_mesa_symbol_table_find_symbol(this->symbols, -1, var->name) == NULL) {
       name = var->name;
    } else {
       static unsigned i = 1;
       name = ralloc_asprintf(this->mem_ctx, "%s@%u", var->name, ++i);
    }
-   hash_table_insert(this->printable_names, (void *) name, var);
+   _mesa_hash_table_insert(this->printable_names, var, (void *) name);
    _mesa_symbol_table_add_symbol(this->symbols, -1, name, var);
    return name;
 }

From 996349cb190154ebdc8cc9f23e5f8f9aabbd6b4d Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Wed, 29 Jul 2015 20:32:41 -0700
Subject: [PATCH 0968/1208] vl/mpeg12: Silence GCC unused-variable warning.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vl/vl_mpeg12_bitstream.c: In function 'decode_slice':
vl/vl_mpeg12_bitstream.c:928:19: warning: unused variable 'extra' [-Wunused-variable]
          unsigned extra = vl_vlc_get_uimsbf(&bs->vlc, 1);
                   ^

Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
index 539a9913e02..52ce6c416aa 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -929,6 +929,7 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_video_buffer *target)
          mb.PMV[1][0][0] = mb.PMV[0][0][0];
          mb.PMV[1][0][1] = mb.PMV[0][0][1];
          assert(extra);
+         (void) extra;
       } else if (mb.macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA ||
                 !(mb.macroblock_type & (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD |
                                         PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD))) {

From 3c050222b0d5b47c885ca72f3c7af22c0d28b5ad Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 25 Jun 2015 21:43:30 -0700
Subject: [PATCH 0969/1208] mesa: Use _mesa_lroundevenf() in some more places.

---
 src/glsl/ir_constant_expression.cpp | 14 ++++----------
 src/mesa/main/imports.c             |  4 ++--
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 2853c1643b3..309b6b72b5b 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -230,12 +230,9 @@ pack_snorm_1x8(float x)
      *    follows:
      *
      *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
-     *
-     * We must first cast the float to an int, because casting a negative
-     * float to a uint is undefined.
      */
-   return (uint8_t) (int)
-          _mesa_roundevenf(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+   return (uint8_t)
+          _mesa_lroundevenf(CLAMP(x, -1.0f, +1.0f) * 127.0f);
 }
 
 /**
@@ -252,12 +249,9 @@ pack_snorm_1x16(float x)
      *    follows:
      *
      *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
-     *
-     * We must first cast the float to an int, because casting a negative
-     * float to a uint is undefined.
      */
-   return (uint16_t) (int)
-          _mesa_roundevenf(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+   return (uint16_t)
+          _mesa_lroundevenf(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
 }
 
 /**
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index 68c7316575c..350e6752c8b 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -369,7 +369,7 @@ _mesa_float_to_half(float val)
           * or normal.
           */
          e = 0;
-         m = (int) _mesa_roundevenf((1 << 24) * fabsf(fi.f));
+         m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f));
       }
       else if (new_exp > 15) {
          /* map this value to infinity */
@@ -383,7 +383,7 @@ _mesa_float_to_half(float val)
           * either normal or infinite.
           */
          e = new_exp + 15;
-         m = (int) _mesa_roundevenf(flt_m / (float) (1 << 13));
+         m = _mesa_lroundevenf(flt_m / (float) (1 << 13));
       }
    }
 

From 680de24545d23d0c2b699020267ca484f81a04a9 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 29 Jun 2015 09:38:34 -0700
Subject: [PATCH 0970/1208] util: Use SSE intrinsics in _mesa_lroundeven{f,}.

gcc actually generates this for us now that we use -fno-math-errno
(which is weird, since lrintf()/lrint() don't set errno) but clang still
does not. Presumably helps MSVC as well.

Reduced .text size by 8.5k with gcc before -fno-math-errno.

   text     data      bss      dec      hex  filename
4935850   195136    26192  5157178   4eb13a  i965_dri.so before
4927225   195128    26192  5148545   4e8f81  i965_dri.so after

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/util/rounding.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/util/rounding.h b/src/util/rounding.h
index 088cf86dd08..b0c9918fd6e 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -25,6 +25,12 @@
 #define _ROUNDING_H
 
 #include <math.h>
+#include <limits.h>
+
+#ifdef __x86_64__
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#endif
 
 #ifdef __SSE4_1__
 #include <smmintrin.h>
@@ -87,7 +93,15 @@ _mesa_roundeven(double x)
 static inline long
 _mesa_lroundevenf(float x)
 {
+#ifdef __x86_64__
+#if LONG_BIT == 64
+   return _mm_cvtss_si64(_mm_load_ss(&x));
+#elif LONG_BIT == 32
+   return _mm_cvtss_si32(_mm_load_ss(&x));
+#endif
+#else
    return lrintf(x);
+#endif
 }
 
 /**
@@ -97,7 +111,15 @@ _mesa_lroundevenf(float x)
 static inline long
 _mesa_lroundeven(double x)
 {
+#ifdef __x86_64__
+#if LONG_BIT == 64
+   return _mm_cvtsd_si64(_mm_load_sd(&x));
+#elif LONG_BIT == 32
+   return _mm_cvtsd_si32(_mm_load_sd(&x));
+#endif
+#else
    return lrint(x);
+#endif
 }
 
 #endif

From 5f247a9656cb8a0eccdc98ef5911ed15c1248dfb Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 12 Jul 2015 18:05:58 -0700
Subject: [PATCH 0971/1208] glx: Use _mesa_lroundevenf() in glPixelStoref().

Functional change in which way half-way cases are rounded from towards
positive-infinity to even. The spec says "the passed value is rounded to
the nearest integer". Removes another case of bad half-up rounding.
---
 src/glx/pixelstore.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/glx/pixelstore.c b/src/glx/pixelstore.c
index 1d776b817e5..dc33ff300fc 100644
--- a/src/glx/pixelstore.c
+++ b/src/glx/pixelstore.c
@@ -30,6 +30,7 @@
 
 #include "glxclient.h"
 #include "indirect.h"
+#include "util/rounding.h"
 
 #if !defined(__GNUC__)
 #  define __builtin_expect(x, y) x
@@ -77,7 +78,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
 
    switch (pname) {
    case GL_PACK_ROW_LENGTH:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -85,7 +86,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.rowLength = a;
       break;
    case GL_PACK_IMAGE_HEIGHT:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -93,7 +94,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.imageHeight = a;
       break;
    case GL_PACK_SKIP_ROWS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -101,7 +102,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipRows = a;
       break;
    case GL_PACK_SKIP_PIXELS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -109,7 +110,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipPixels = a;
       break;
    case GL_PACK_SKIP_IMAGES:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -117,7 +118,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipImages = a;
       break;
    case GL_PACK_ALIGNMENT:
-      a = (GLint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       switch (a) {
       case 1:
       case 2:
@@ -138,7 +139,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       break;
 
    case GL_UNPACK_ROW_LENGTH:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -146,7 +147,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.rowLength = a;
       break;
    case GL_UNPACK_IMAGE_HEIGHT:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -154,7 +155,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.imageHeight = a;
       break;
    case GL_UNPACK_SKIP_ROWS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -162,7 +163,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipRows = a;
       break;
    case GL_UNPACK_SKIP_PIXELS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -170,7 +171,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipPixels = a;
       break;
    case GL_UNPACK_SKIP_IMAGES:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -178,7 +179,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipImages = a;
       break;
    case GL_UNPACK_ALIGNMENT:
-      a = (GLint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       switch (a) {
       case 1:
       case 2:

From a221f8d9ebb4ef43a83ef638458d1338dfe1e517 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 31 Jul 2015 09:36:31 -0400
Subject: [PATCH 0972/1208] freedreno: small bit of cleanup about max
 rendertargets

We hard-coded 4 or 8 as the max in various places.  Switch it all to a
define since the limit will go up with a4xx (and maybe even again in the
future?)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_blend.h    |  4 +++-
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c     |  2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_gmem.c     |  2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_program.c  |  4 +++-
 src/gallium/drivers/freedreno/a4xx/fd4_blend.h    |  4 +++-
 src/gallium/drivers/freedreno/a4xx/fd4_draw.c     |  2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c     |  2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_gmem.c     |  2 +-
 src/gallium/drivers/freedreno/freedreno_context.h |  2 +-
 src/gallium/drivers/freedreno/freedreno_gmem.c    | 15 ++++++++++-----
 src/gallium/drivers/freedreno/freedreno_gmem.h    |  6 ++++--
 src/gallium/drivers/freedreno/freedreno_program.c |  6 +++++-
 src/gallium/drivers/freedreno/freedreno_util.h    |  6 ++++++
 13 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
index 0267001b0b9..142df7c300f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
@@ -32,6 +32,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd3_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
@@ -42,7 +44,7 @@ struct fd3_blend_stateobj {
 		/* Blend control bits for alpha channel */
 		uint32_t blend_control_alpha;
 		uint32_t control;
-	} rb_mrt[4];
+	} rb_mrt[A3XX_MAX_RENDER_TARGETS];
 };
 
 static inline struct fd3_blend_stateobj *
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index fc30d4842ba..43550ae6a22 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -324,7 +324,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 				A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
 				A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS) |
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 7d3975761dd..4689085e516 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -57,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = LINEAR;
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		enum pipe_format pformat = 0;
 		enum a3xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 7cd4885ab4f..e98c6b5cff4 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -136,6 +136,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	vp = fd3_emit_get_vp(emit);
 
 	if (emit->key.binning_pass) {
@@ -207,7 +209,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 			unsigned idx = sem2idx(sem);
 			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
 				continue;
-			assert(idx < 4);
+			debug_assert(idx < ARRAY_SIZE(color_regid));
 			color_regid[idx] = fp->outputs[i].regid;
 		}
 	}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
index 821b3c82832..7620d00a625 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
@@ -32,13 +32,15 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd4_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
 		uint32_t control;
 		uint32_t buf_info;
 		uint32_t blend_control;
-	} rb_mrt[8];
+	} rb_mrt[A4XX_MAX_RENDER_TARGETS];
 	uint32_t rb_fs_output;
 };
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index ff1dfdc392f..154154190db 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -265,7 +265,7 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 		ce = 0x0;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 62a1e9ee955..df96601c747 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -487,7 +487,7 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend);
 		uint32_t i;
 
-		for (i = 0; i < 8; i++) {
+		for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 			OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 			OUT_RING(ring, blend->rb_mrt[i].control);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 9a905062071..976255f5879 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -63,7 +63,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = TILE4_LINEAR;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		enum a4xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
 		struct fd_resource *rsc = NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index cc585af1b3f..509a90fdf23 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -193,7 +193,7 @@ struct fd_context {
 	struct fd_program_stateobj solid_prog; // TODO move to screen?
 
 	/* shaders used by mem->gmem blits: */
-	struct fd_program_stateobj blit_prog[8]; // TODO move to screen?
+	struct fd_program_stateobj blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen?
 	struct fd_program_stateobj blit_z, blit_zs;
 
 	/* do we need to mem2gmem before rendering.  We don't, if for example,
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index c105378ec4e..648db9baee5 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -82,7 +82,7 @@ total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2],
 {
 	uint32_t total = 0, i;
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < MAX_RENDER_TARGETS; i++) {
 		if (cbuf_cpp[i]) {
 			gmem->cbuf_base[i] = align(total, 0x4000);
 			total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h;
@@ -113,7 +113,7 @@ calculate_tiles(struct fd_context *ctx)
 	uint32_t nbins_x = 1, nbins_y = 1;
 	uint32_t bin_w, bin_h;
 	uint32_t max_width = bin_width(ctx);
-	uint8_t cbuf_cpp[4] = {0}, zsbuf_cpp[2] = {0};
+	uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0};
 	uint32_t i, j, t, xoff, yoff;
 	uint32_t tpp_x, tpp_y;
 	bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
@@ -162,12 +162,17 @@ calculate_tiles(struct fd_context *ctx)
 		bin_w = align(width / nbins_x, 32);
 	}
 
+	if (fd_mesa_debug & FD_DBG_MSGS) {
+		debug_printf("binning input: cbuf cpp:");
+		for (i = 0; i < pfb->nr_cbufs; i++)
+			debug_printf(" %d", cbuf_cpp[i]);
+		debug_printf(", zsbuf cpp: %d; %dx%d\n",
+				zsbuf_cpp[0], width, height);
+	}
+
 	/* then find a bin width/height that satisfies the memory
 	 * constraints:
 	 */
-	DBG("binning input: cbuf cpp: %d %d %d %d, zsbuf cpp: %d; %dx%d",
-		cbuf_cpp[0], cbuf_cpp[1], cbuf_cpp[2], cbuf_cpp[3], zsbuf_cpp[0],
-		width, height);
 	while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) {
 		if (bin_w > bin_h) {
 			nbins_x++;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index 5867235db90..38b557eb077 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -31,6 +31,8 @@
 
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 /* per-pipe configuration for hw binning: */
 struct fd_vsc_pipe {
 	struct fd_bo *bo;
@@ -47,9 +49,9 @@ struct fd_tile {
 
 struct fd_gmem_stateobj {
 	struct pipe_scissor_state scissor;
-	uint32_t cbuf_base[4];
+	uint32_t cbuf_base[MAX_RENDER_TARGETS];
 	uint32_t zsbuf_base[2];
-	uint8_t cbuf_cpp[4];
+	uint8_t cbuf_cpp[MAX_RENDER_TARGETS];
 	uint8_t zsbuf_cpp[2];
 	uint16_t bin_h, nbins_y;
 	uint16_t bin_w, nbins_x;
diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c
index 5e344e69146..e6a647852a3 100644
--- a/src/gallium/drivers/freedreno/freedreno_program.c
+++ b/src/gallium/drivers/freedreno/freedreno_program.c
@@ -96,7 +96,11 @@ fd_prog_blit(struct pipe_context *pctx, int rts, bool depth)
 {
 	int i;
 	struct ureg_src tc;
-	struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+	struct ureg_program *ureg;
+
+	debug_assert(rts <= MAX_RENDER_TARGETS);
+
+	ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
 	if (!ureg)
 		return NULL;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 6aec2585ceb..ed568173497 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -54,6 +54,12 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 /* TBD if it is same on a2xx, but for now: */
 #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS
 
+#define A2XX_MAX_RENDER_TARGETS 1
+#define A3XX_MAX_RENDER_TARGETS 4
+#define A4XX_MAX_RENDER_TARGETS 8
+/* for now until a4xx MRT support: */
+#define MAX_RENDER_TARGETS A3XX_MAX_RENDER_TARGETS
+
 #define FD_DBG_MSGS     0x0001
 #define FD_DBG_DISASM   0x0002
 #define FD_DBG_DCLEAR   0x0004

From c7deea51d2b611564c91e146fbd1ed0b547f65c0 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 31 Jul 2015 10:54:23 -0400
Subject: [PATCH 0973/1208] freedreno: fix stream-out caps vec4->components

Should be in units of components, not vec4's

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/freedreno_screen.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 97e4161ede2..bab6131678b 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -236,11 +236,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
 		if (is_a3xx(screen) || is_a4xx(screen))
-			return 16;    /* should only be shader out limit? */
+			return 16 * 4;   /* should only be shader out limit? */
 		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
 		if (is_a3xx(screen) || is_a4xx(screen))
-			return 16;    /* should only be shader out limit? */
+			return 16 * 4;   /* should only be shader out limit? */
 		return 0;
 
 	/* Geometry shader output, unsupported. */

From 5ca032a9a8ece0a8a43151f988215484da3c1811 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 31 Jul 2015 12:07:24 -0400
Subject: [PATCH 0974/1208] freedreno: simplify/cleanup resource status
 tracking

Collapse dirty/reading bools into status bitmask (and drop writing which
should really be the same as dirty).  And use 'used_resources' list for
all tracking, including zsbuf/cbufs, rather than special casing the
color and depth/stencil buffers.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_context.c     | 18 +----
 .../drivers/freedreno/freedreno_draw.c        | 65 +++++++++++--------
 .../drivers/freedreno/freedreno_resource.c    | 19 ++++--
 .../drivers/freedreno/freedreno_resource.h    | 17 ++++-
 4 files changed, 71 insertions(+), 48 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 02613dcdbdb..8e6d43150ce 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -94,9 +94,7 @@ void
 fd_context_render(struct pipe_context *pctx)
 {
 	struct fd_context *ctx = fd_context(pctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd_resource *rsc, *rsc_tmp;
-	int i;
 
 	DBG("needs_flush: %d", ctx->needs_flush);
 
@@ -118,21 +116,11 @@ fd_context_render(struct pipe_context *pctx)
 	ctx->gmem_reason = 0;
 	ctx->num_draws = 0;
 
-	for (i = 0; i < pfb->nr_cbufs; i++)
-		if (pfb->cbufs[i])
-			fd_resource(pfb->cbufs[i]->texture)->dirty = false;
-	if (pfb->zsbuf) {
-		rsc = fd_resource(pfb->zsbuf->texture);
-		rsc->dirty = false;
-		if (rsc->stencil)
-			rsc->stencil->dirty = false;
-	}
-
 	/* go through all the used resources and clear their reading flag */
 	LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) {
-		assert(rsc->reading || rsc->writing);
-		rsc->reading = false;
-		rsc->writing = false;
+		debug_assert(rsc->status != 0);
+		rsc->status = 0;
+		rsc->pending_ctx = NULL;
 		list_delinit(&rsc->list);
 	}
 
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index f88654063fa..a04cc879630 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -40,7 +40,8 @@
 #include "freedreno_util.h"
 
 static void
-resource_used(struct fd_context *ctx, struct pipe_resource *prsc, boolean reading)
+resource_used(struct fd_context *ctx, struct pipe_resource *prsc,
+		enum fd_resource_status status)
 {
 	struct fd_resource *rsc;
 
@@ -48,12 +49,29 @@ resource_used(struct fd_context *ctx, struct pipe_resource *prsc, boolean readin
 		return;
 
 	rsc = fd_resource(prsc);
-	if (reading)
-		rsc->reading = true;
-	else
-		rsc->writing = true;
+	rsc->status |= status;
+	if (rsc->stencil)
+		rsc->stencil->status |= status;
+
+	/* TODO resources can actually be shared across contexts,
+	 * so I'm not sure a single list-head will do the trick?
+	 */
+	debug_assert((rsc->pending_ctx == ctx) || !rsc->pending_ctx);
 	list_delinit(&rsc->list);
 	list_addtail(&rsc->list, &ctx->used_resources);
+	rsc->pending_ctx = ctx;
+}
+
+static void
+resource_read(struct fd_context *ctx, struct pipe_resource *prsc)
+{
+	resource_used(ctx, prsc, FD_PENDING_READ);
+}
+
+static void
+resource_written(struct fd_context *ctx, struct pipe_resource *prsc)
+{
+	resource_used(ctx, prsc, FD_PENDING_WRITE);
 }
 
 static void
@@ -72,6 +90,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	/* emulate unsupported primitives: */
 	if (!fd_supported_prim(ctx, info->mode)) {
+		if (ctx->streamout.num_targets > 0)
+			debug_error("stream-out with emulated prims");
 		util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf);
 		util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer);
 		util_primconvert_draw_vbo(ctx->primconvert, info);
@@ -86,17 +106,13 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	if (fd_depth_enabled(ctx)) {
 		buffers |= FD_BUFFER_DEPTH;
-		fd_resource(pfb->zsbuf->texture)->dirty = true;
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED;
 	}
 
 	if (fd_stencil_enabled(ctx)) {
-		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
 		buffers |= FD_BUFFER_STENCIL;
-		if (rsc->stencil)
-			rsc->stencil->dirty = true;
-		else
-			rsc->dirty = true;
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED;
 	}
 
@@ -111,7 +127,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 		surf = pfb->cbufs[i]->texture;
 
-		fd_resource(surf)->dirty = true;
+		resource_written(ctx, surf);
 		buffers |= PIPE_CLEAR_COLOR0 << i;
 
 		if (surf->nr_samples > 1)
@@ -123,31 +139,31 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	/* Skip over buffer 0, that is sent along with the command stream */
 	for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-		resource_used(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer, true);
-		resource_used(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer, true);
+		resource_read(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer);
+		resource_read(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer);
 	}
 
 	/* Mark VBOs as being read */
 	for (i = 0; i < ctx->vtx.vertexbuf.count; i++) {
 		assert(!ctx->vtx.vertexbuf.vb[i].user_buffer);
-		resource_used(ctx, ctx->vtx.vertexbuf.vb[i].buffer, true);
+		resource_read(ctx, ctx->vtx.vertexbuf.vb[i].buffer);
 	}
 
 	/* Mark index buffer as being read */
-	resource_used(ctx, ctx->indexbuf.buffer, true);
+	resource_read(ctx, ctx->indexbuf.buffer);
 
 	/* Mark textures as being read */
 	for (i = 0; i < ctx->verttex.num_textures; i++)
 		if (ctx->verttex.textures[i])
-			resource_used(ctx, ctx->verttex.textures[i]->texture, true);
+			resource_read(ctx, ctx->verttex.textures[i]->texture);
 	for (i = 0; i < ctx->fragtex.num_textures; i++)
 		if (ctx->fragtex.textures[i])
-			resource_used(ctx, ctx->fragtex.textures[i]->texture, true);
+			resource_read(ctx, ctx->fragtex.textures[i]->texture);
 
-	/* Mark streamout buffers as being read.. actually they are written.. */
+	/* Mark streamout buffers as being written.. */
 	for (i = 0; i < ctx->streamout.num_targets; i++)
 		if (ctx->streamout.targets[i])
-			resource_used(ctx, ctx->streamout.targets[i]->buffer, false);
+			resource_written(ctx, ctx->streamout.targets[i]->buffer);
 
 	ctx->num_draws++;
 
@@ -228,15 +244,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR)
 		for (i = 0; i < pfb->nr_cbufs; i++)
 			if (buffers & (PIPE_CLEAR_COLOR0 << i))
-				fd_resource(pfb->cbufs[i]->texture)->dirty = true;
+				resource_written(ctx, pfb->cbufs[i]->texture);
 
 	if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
-		if (rsc->stencil && buffers & PIPE_CLEAR_STENCIL)
-			rsc->stencil->dirty = true;
-		if (!rsc->stencil || buffers & PIPE_CLEAR_DEPTH)
-			rsc->dirty = true;
-
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL;
 	}
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index d649925af48..709ad4eb55b 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -42,6 +42,14 @@
 
 #include <errno.h>
 
+
+static bool
+pending(struct fd_resource *rsc, enum fd_resource_status status)
+{
+	return (rsc->status & status) ||
+		(rsc->stencil && (rsc->stencil->status & status));
+}
+
 static void
 fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 {
@@ -97,7 +105,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size)
 
 	rsc->bo = fd_bo_new(screen->dev, size, flags);
 	rsc->timestamp = 0;
-	rsc->dirty = rsc->reading = rsc->writing = false;
+	rsc->status = 0;
+	rsc->pending_ctx = NULL;
 	list_delinit(&rsc->list);
 	util_range_set_empty(&rsc->valid_buffer_range);
 }
@@ -238,9 +247,9 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 		/* If the GPU is writing to the resource, or if it is reading from the
 		 * resource and we're trying to write to it, flush the renders.
 		 */
-		if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty) ||
-			((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading) ||
-			((ptrans->usage & PIPE_TRANSFER_READ) && rsc->writing))
+		if (((ptrans->usage & PIPE_TRANSFER_WRITE) &&
+					pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) ||
+				pending(rsc, FD_PENDING_WRITE))
 			fd_context_render(pctx);
 
 		/* The GPU keeps track of how the various bo's are being used, and
@@ -678,7 +687,7 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
 {
 	struct fd_resource *rsc = fd_resource(prsc);
 
-	if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty))
+	if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ))
 		fd_context_render(pctx);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index e7f127edca4..7549becaa1f 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -60,6 +60,15 @@ struct fd_resource_slice {
 	uint32_t size0;          /* size of first layer in slice */
 };
 
+/* status of queued up but not flushed reads and write operations.
+ * In _transfer_map() we need to know if queued up rendering needs
+ * to be flushed to preserve the order of cpu and gpu access.
+ */
+enum fd_resource_status {
+	FD_PENDING_WRITE = 0x01,
+	FD_PENDING_READ  = 0x02,
+};
+
 struct fd_resource {
 	struct u_resource base;
 	struct fd_bo *bo;
@@ -68,14 +77,20 @@ struct fd_resource {
 	uint32_t layer_size;
 	struct fd_resource_slice slices[MAX_MIP_LEVELS];
 	uint32_t timestamp;
-	bool dirty, reading, writing;
 	/* buffer range that has been initialized */
 	struct util_range valid_buffer_range;
 
 	/* reference to the resource holding stencil data for a z32_s8 texture */
+	/* TODO rename to secondary or auxiliary? */
 	struct fd_resource *stencil;
 
+	/* pending read/write state: */
+	enum fd_resource_status status;
+	/* resources accessed by queued but not flushed draws are tracked
+	 * in the used_resources list.
+	 */
 	struct list_head list;
+	struct fd_context *pending_ctx;
 };
 
 static inline struct fd_resource *

From b37a97c97d6477d5062a75a0313162ed324a36ed Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 31 Jul 2015 14:34:19 -0400
Subject: [PATCH 0975/1208] freedreno: move the half-precision logic into core

Both a3xx and a4xx need the same logic to decide if half-precision can
be used for blit shaders.  So move it to core and simplify things a bit
with a helper that considers all render targets.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c |  5 +--
 .../drivers/freedreno/a3xx/fd3_format.h       | 23 ------------
 src/gallium/drivers/freedreno/a3xx/fd3_gmem.c |  5 +--
 .../drivers/freedreno/freedreno_util.h        | 36 +++++++++++++++++++
 4 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 43550ae6a22..a9498835011 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -243,10 +243,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 		.vtx  = &fd3_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-							   fd3_half_precision(pfb->cbufs[1]) &&
-							   fd3_half_precision(pfb->cbufs[2]) &&
-							   fd3_half_precision(pfb->cbufs[3])),
+			.half_precision = fd_half_precision(pfb),
 		},
 	};
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 678343aaa11..05c5ea3d247 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -41,27 +41,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
 		unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-static inline bool
-fd3_half_precision(const struct pipe_surface *surface)
-{
-	enum pipe_format format;
-	if (!surface)
-		return true;
-
-	format = surface->format;
-
-	/* colors are provided in consts, which go through cov.f32f16, which will
-	 * break these values
-	 */
-	if (util_format_is_pure_integer(format))
-		return false;
-
-	/* avoid losing precision on 32-bit float formats */
-	if (util_format_is_float(format) &&
-		util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
-		return false;
-
-	return true;
-}
-
 #endif /* FD3_FORMAT_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 4689085e516..302452657b6 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -537,10 +537,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
 			.key = {
-				.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-								   fd3_half_precision(pfb->cbufs[1]) &&
-								   fd3_half_precision(pfb->cbufs[2]) &&
-								   fd3_half_precision(pfb->cbufs[3]))
+				.half_precision = fd_half_precision(pfb),
 			},
 	};
 	float x0, y0, x1, y1;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index ed568173497..2880e890a1c 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -116,6 +116,42 @@ pipe_surface_format(struct pipe_surface *psurf)
 	return psurf->format;
 }
 
+static inline bool
+fd_surface_half_precision(const struct pipe_surface *psurf)
+{
+	enum pipe_format format;
+
+	if (!psurf)
+		return true;
+
+	format = psurf->format;
+
+	/* colors are provided in consts, which go through cov.f32f16, which will
+	 * break these values
+	 */
+	if (util_format_is_pure_integer(format))
+		return false;
+
+	/* avoid losing precision on 32-bit float formats */
+	if (util_format_is_float(format) &&
+		util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
+		return false;
+
+	return true;
+}
+
+static inline bool
+fd_half_precision(struct pipe_framebuffer_state *pfb)
+{
+	unsigned i;
+
+	for (i = 0; i < pfb->nr_cbufs; i++)
+		if (!fd_surface_half_precision(pfb->cbufs[i]))
+			return false;
+
+	return true;
+}
+
 #define LOG_DWORDS 0
 
 static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx);

From 054526e49abb5e7fd49fed6f589cff6f1ab4c9f6 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 31 Jul 2015 15:32:58 -0400
Subject: [PATCH 0976/1208] freedreno/a4xx: MRT support

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a3xx/fd3_program.c      |   2 +-
 .../drivers/freedreno/a3xx/fd3_screen.c       |   2 +-
 .../drivers/freedreno/a4xx/fd4_blend.c        |   6 +-
 src/gallium/drivers/freedreno/a4xx/fd4_draw.c |  36 +++--
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 125 +++++++++++++-----
 src/gallium/drivers/freedreno/a4xx/fd4_emit.h |   4 +-
 src/gallium/drivers/freedreno/a4xx/fd4_gmem.c |  92 +++++++------
 .../drivers/freedreno/a4xx/fd4_program.c      |  67 ++++++----
 .../drivers/freedreno/a4xx/fd4_program.h      |   3 +-
 .../drivers/freedreno/a4xx/fd4_screen.c       |   2 +-
 .../drivers/freedreno/freedreno_draw.c        |   3 +-
 .../drivers/freedreno/freedreno_util.h        |   2 +-
 12 files changed, 212 insertions(+), 132 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index e98c6b5cff4..b5360797745 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -204,7 +204,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
 			ir3_find_output_regid(fp, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
 	} else {
-		for (int i = 0; i < fp->outputs_count; i++) {
+		for (i = 0; i < fp->outputs_count; i++) {
 			ir3_semantic sem = fp->outputs[i].semantic;
 			unsigned idx = sem2idx(sem);
 			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 094dcf376e5..722fe360202 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -105,7 +105,7 @@ void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 4;
+	screen->max_rts = A3XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd3_context_create;
 	pscreen->is_format_supported = fd3_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
index 396caa532fc..9d5ae4242f9 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -61,7 +61,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 	struct fd4_blend_stateobj *so;
 //	enum a3xx_rop_code rop = ROP_COPY;
 	bool reads_dest = false;
-	int i;
+	unsigned i, mrt_blend = 0;
 
 	if (cso->logicop_enable) {
 //		rop = cso->logicop_func;  /* maps 1:1 */
@@ -115,7 +115,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 					A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE |
 					A4XX_RB_MRT_CONTROL_BLEND |
 					A4XX_RB_MRT_CONTROL_BLEND2;
-			so->rb_fs_output |= A4XX_RB_FS_OUTPUT_ENABLE_BLEND(1);
+			mrt_blend |= (1 << i);
 		}
 
 		if (reads_dest)
@@ -125,5 +125,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 			so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS);
 	}
 
+	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend);
+
 	return so;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 154154190db..0927b0d7682 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -111,7 +111,6 @@ static void
 fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 		.vtx  = &ctx->vtx,
 		.prog = &ctx->prog,
@@ -132,8 +131,6 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
-		.pformat = pipe_surface_format(pfb->cbufs[0]),
 	};
 	unsigned dirty;
 
@@ -173,20 +170,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	unsigned dirty = ctx->dirty;
-	unsigned ce, i;
+	unsigned i;
 	struct fd4_emit emit = {
 		.vtx  = &fd4_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = true,
+			.half_precision = fd_half_precision(pfb),
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
 	};
-	uint32_t colr = 0;
-
-	if ((buffers & PIPE_CLEAR_COLOR) && pfb->nr_cbufs)
-		colr  = pack_rgba(pfb->cbufs[0]->format, color->f);
 
 	dirty &= FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
 	dirty |= FD_DIRTY_PROG;
@@ -260,16 +253,15 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
 		OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
-		ce = 0xf;
-	} else {
-		ce = 0x0;
 	}
 
 	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (buffers & (PIPE_CLEAR_COLOR0 << i)) ? 0xf : 0x0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
-				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(ce));
+				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) |
@@ -280,6 +272,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
@@ -288,12 +290,6 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_RB_CLEAR_COLOR_DW0, 4);
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW0 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW1 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW2 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW3 */
-
 	/* until fastclear works: */
 	fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index df96601c747..e7c210d3437 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -197,51 +197,90 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
  * special cases..
  */
 void
-fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf)
+fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
+		struct pipe_surface **bufs)
 {
-	struct fd_resource *rsc = fd_resource(psurf->texture);
-	unsigned lvl = psurf->u.tex.level;
-	struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
-	uint32_t offset = fd_resource_offset(rsc, lvl, psurf->u.tex.first_layer);
-	enum pipe_format format = fd4_gmem_restore_format(psurf->format);
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
+	int i;
 
-	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (i < nr_bufs) ? 0xf : 0;
+	}
+
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
 
 	/* output sampler state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 4);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(1));
+			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) |
-			A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) |
-			A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) |
-			A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) |
-			A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT));
-	OUT_RING(ring, 0x00000000);
+	for (i = 0; i < nr_bufs; i++) {
+		OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) |
+				A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) |
+				A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) |
+				A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) |
+				A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT));
+		OUT_RING(ring, 0x00000000);
+	}
 
 	/* emit texture state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 10);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(1));
+			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) |
-			A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
-			fd4_tex_swiz(format,  PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN,
-					PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
-	OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(psurf->width) |
-			A4XX_TEX_CONST_1_HEIGHT(psurf->height));
-	OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp));
-	OUT_RING(ring, 0x00000000);
-	OUT_RELOC(ring, rsc->bo, offset, 0, 0);
-	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, 0x00000000);
+	for (i = 0; i < nr_bufs; i++) {
+		if (bufs[i]) {
+			struct fd_resource *rsc = fd_resource(bufs[i]->texture);
+			unsigned lvl = bufs[i]->u.tex.level;
+			struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
+			uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer);
+			enum pipe_format format = fd4_gmem_restore_format(bufs[i]->format);
+
+			debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer);
+
+			OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) |
+					A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
+					fd4_tex_swiz(format,  PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN,
+							PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
+			OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) |
+					A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height));
+			OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp));
+			OUT_RING(ring, 0x00000000);
+			OUT_RELOC(ring, rsc->bo, offset, 0, 0);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		} else {
+			OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) |
+					A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
+					A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE));
+			OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) |
+					A4XX_TEX_CONST_1_HEIGHT(0));
+			OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0));
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
+	}
 }
 
 void
@@ -348,6 +387,25 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	emit_marker(ring, 5);
 
+	if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->key.binning_pass) {
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
+
+		for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+			mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0;
+		}
+
+		OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+		OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+				A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+				A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+				A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+				A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+				A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+				A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+				A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+	}
+
 	if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) {
 		uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control;
 
@@ -472,8 +530,10 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));
 	}
 
-	if (dirty & FD_DIRTY_PROG)
-		fd4_program_emit(ring, emit);
+	if (dirty & FD_DIRTY_PROG) {
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs);
+	}
 
 	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
 		ir3_emit_consts(vp, ring, emit->info, dirty);
@@ -690,9 +750,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1);
 	OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
 
-	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
-	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(0xf));
-
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 7debee59471..99c7596fb82 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -43,7 +43,7 @@ void fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
 void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
-		struct pipe_surface *psurf);
+		unsigned nr_bufs, struct pipe_surface **bufs);
 
 /* grouped together emit-state for prog/vertex/state emit: */
 struct fd4_emit {
@@ -51,8 +51,6 @@ struct fd4_emit {
 	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	struct ir3_shader_key key;
-	enum a4xx_color_fmt format;
-	enum pipe_format pformat;
 	uint32_t dirty;
 
 	/* cached to avoid repeated lookups of same variants: */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 976255f5879..fce5d7a3930 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -44,12 +44,6 @@
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
-static const struct ir3_shader_key key = {
-		// XXX should set this based on render target format!  We don't
-		// want half_precision if float32 render target!!!
-		.half_precision = true,
-};
-
 static void
 emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
@@ -94,6 +88,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			} else {
 				stride = slice->pitch * rsc->cpp;
 			}
+		} else if ((i < nr_bufs) && bases) {
+			base = bases[i];
 		}
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3);
@@ -101,7 +97,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 				A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
-		if (bin_w || (i >= nr_bufs)) {
+		if (bin_w || (i >= nr_bufs) || !bufs[i]) {
 			OUT_RING(ring, base);
 			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
 		} else {
@@ -115,20 +111,6 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 	}
 }
 
-static uint32_t
-depth_base(struct fd_context *ctx)
-{
-	struct fd_gmem_stateobj *gmem = &ctx->gmem;
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
-	uint32_t cpp = 4;
-	if (pfb->cbufs[0]) {
-		struct fd_resource *rsc =
-				fd_resource(pfb->cbufs[0]->texture);
-		cpp = rsc->cpp;
-	}
-	return align(gmem->bin_w * gmem->bin_h * cpp, 0x4000);
-}
-
 /* transfer from gmem to system memory (ie. normal RAM) */
 
 static void
@@ -163,13 +145,15 @@ static void
 fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
+	struct fd_gmem_stateobj *gmem = &ctx->gmem;
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 			.vtx = &fd4_ctx->solid_vbuf_state,
 			.prog = &ctx->solid_prog,
-			.key = key,
-			.format = fd4_emit_format(pfb->cbufs[0]),
+			.key = {
+				.half_precision = true,
+			},
 	};
 
 	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
@@ -238,16 +222,22 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */
 
-	fd4_program_emit(ring, &emit);
+	fd4_program_emit(ring, &emit, 0, NULL);
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
-		uint32_t base = depth_base(ctx);
-		emit_gmem2mem_surf(ctx, base, pfb->zsbuf);
+		emit_gmem2mem_surf(ctx, gmem->zsbuf_base[0], pfb->zsbuf);
 	}
 
 	if (ctx->resolve & FD_BUFFER_COLOR) {
-		emit_gmem2mem_surf(ctx, 0, pfb->cbufs[0]);
+		unsigned i;
+		for (i = 0; i < pfb->nr_cbufs; i++) {
+			if (!pfb->cbufs[i])
+				continue;
+			if (!(ctx->resolve & (PIPE_CLEAR_COLOR0 << i)))
+				continue;
+			emit_gmem2mem_surf(ctx, gmem->cbuf_base[i], pfb->cbufs[i]);
+		}
 	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
@@ -260,14 +250,14 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 /* transfer from system memory to gmem */
 
 static void
-emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base,
-		struct pipe_surface *psurf, uint32_t bin_w)
+emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
+		struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w)
 {
 	struct fd_ringbuffer *ring = ctx->ring;
 
-	emit_mrt(ring, 1, &psurf, &base, bin_w);
+	emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
 
-	fd4_emit_gmem_restore_tex(ring, psurf);
+	fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs);
 
 	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL);
@@ -282,10 +272,13 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 			.vtx = &fd4_ctx->blit_vbuf_state,
+			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
-			.key = key,
-			.format = fd4_emit_format(pfb->cbufs[0]),
+			.key = {
+				.half_precision = fd_half_precision(pfb),
+			},
 	};
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	float x0, y0, x1, y1;
 	unsigned bin_w = tile->bin_w;
 	unsigned bin_h = tile->bin_h;
@@ -304,7 +297,9 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, fui(x1));
 	OUT_RING(ring, fui(y1));
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
@@ -319,6 +314,16 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1);
 	OUT_RING(ring, 0x8);          /* XXX RB_RENDER_CONTROL */
 
@@ -381,7 +386,6 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */
 
-	fd4_program_emit(ring, &emit);
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	/* for gmem pitch/base calculations, we need to use the non-
@@ -390,11 +394,17 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	bin_w = gmem->bin_w;
 	bin_h = gmem->bin_h;
 
-	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
-		emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w);
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
+		emit.prog = &ctx->blit_prog[0];
+		fd4_program_emit(ring, &emit, 1, &pfb->zsbuf);
+		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
+	}
 
-	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR))
-		emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w);
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
+		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
+		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
+	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
 	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
@@ -537,7 +547,7 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
 	uint32_t reg;
 
 	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
-	reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(ctx));
+	reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]);
 	if (pfb->zsbuf) {
 		reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format));
 	}
@@ -586,7 +596,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
 	OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, gmem->bin_w);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index b3c06e3aae1..c7a6dffea7b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -31,8 +31,6 @@
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_parse.h"
 
 #include "freedreno_program.h"
 
@@ -213,14 +211,17 @@ setup_stages(struct fd4_emit *emit, struct stage *s)
 }
 
 void
-fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
+fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
+		int nr, struct pipe_surface **bufs)
 {
 	struct stage s[MAX_STAGES];
-	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
+	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
 	uint32_t face_regid, coord_regid, zwcoord_regid;
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	setup_stages(emit, s);
 
 	/* blob seems to always use constmode currently: */
@@ -232,11 +233,30 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
 	psize_regid = ir3_find_output_regid(s[VS].v,
 		ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
-	color_regid = ir3_find_output_regid(s[FS].v,
-		ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
+	if (s[FS].v->color0_mrt) {
+		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
+		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
+			ir3_find_output_regid(s[FS].v, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
+	} else {
+		const struct ir3_shader_variant *fp = s[FS].v;
+		memset(color_regid, 0, sizeof(color_regid));
+		for (i = 0; i < fp->outputs_count; i++) {
+			ir3_semantic sem = fp->outputs[i].semantic;
+			unsigned idx = sem2idx(sem);
+			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
+				continue;
+			debug_assert(idx < ARRAY_SIZE(color_regid));
+			color_regid[idx] = fp->outputs[i].regid;
+		}
+	}
+
+	/* adjust regids for alpha output formats. there is no alpha render
+	 * format, so it's just treated like red
+	 */
+	for (i = 0; i < nr; i++)
+		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
+			color_regid[i] += 3;
 
-	if (util_format_is_alpha(emit->pformat))
-		color_regid += 3;
 
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
@@ -419,29 +439,24 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 					A4XX_RB_RENDER_CONTROL2_WCOORD));
 
 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
-	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(1) |
+	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
 			COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));
 
 	OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
-	if (s[FS].v->writes_pos) {
-		OUT_RING(ring, 0x00000001 |
-				A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE |
-				A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
-	} else {
-		OUT_RING(ring, 0x00000001);
-	}
+	OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
+			COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
+			A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
 
 	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid) |
-			A4XX_SP_FS_MRT_REG_MRTFORMAT(emit->format) |
-			COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
+	for (i = 0; i < 8; i++) {
+		enum a4xx_color_fmt format = 0;
+		if (i < nr)
+			format = fd4_emit_format(bufs[i]);
+		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
+				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
+				COND(emit->key.half_precision,
+					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
+	}
 
 	if (emit->key.binning_pass) {
 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
index 52306a4c60d..8dfccaf9d74 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
@@ -39,7 +39,8 @@ struct fd4_shader_stateobj {
 
 struct fd4_emit;
 
-void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit);
+void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
+		int nr, struct pipe_surface **bufs);
 
 void fd4_prog_init(struct pipe_context *pctx);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index e8cbb2d201a..d8ea414f300 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -102,7 +102,7 @@ void
 fd4_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 1;
+	screen->max_rts = A4XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd4_context_create;
 	pscreen->is_format_supported = fd4_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index a04cc879630..6831a58749c 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -265,7 +265,8 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 			FD_DIRTY_SAMPLE_MASK |
 			FD_DIRTY_PROG |
 			FD_DIRTY_CONSTBUF |
-			FD_DIRTY_BLEND;
+			FD_DIRTY_BLEND |
+			FD_DIRTY_FRAMEBUFFER;
 
 	if (fd_mesa_debug & FD_DBG_DCLEAR)
 		ctx->dirty = 0xffffffff;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 2880e890a1c..b5c5db91788 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -58,7 +58,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define A3XX_MAX_RENDER_TARGETS 4
 #define A4XX_MAX_RENDER_TARGETS 8
 /* for now until a4xx MRT support: */
-#define MAX_RENDER_TARGETS A3XX_MAX_RENDER_TARGETS
+#define MAX_RENDER_TARGETS A4XX_MAX_RENDER_TARGETS
 
 #define FD_DBG_MSGS     0x0001
 #define FD_DBG_DISASM   0x0002

From d6d7515bec2e7421dcbc17f31f94613643599e33 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 1 Aug 2015 16:17:49 -0400
Subject: [PATCH 0977/1208] freedreno/a4xx: add independent blend function
 support

needed for MRT

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_blend.c   | 12 ++++++------
 src/gallium/drivers/freedreno/freedreno_screen.c |  6 ++++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
index 9d5ae4242f9..d5e823ef69d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -84,11 +84,6 @@ fd4_blend_state_create(struct pipe_context *pctx,
 		}
 	}
 
-	if (cso->independent_blend_enable) {
-		DBG("Unsupported! independent blend state");
-		return NULL;
-	}
-
 	so = CALLOC_STRUCT(fd4_blend_stateobj);
 	if (!so)
 		return NULL;
@@ -96,7 +91,12 @@ fd4_blend_state_create(struct pipe_context *pctx,
 	so->base = *cso;
 
 	for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) {
-		const struct pipe_rt_blend_state *rt = &cso->rt[i];
+		const struct pipe_rt_blend_state *rt;
+
+		if (cso->independent_blend_enable)
+			rt = &cso->rt[i];
+		else
+			rt = &cso->rt[0];
 
 		so->rb_mrt[i].blend_control =
 				A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) |
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index bab6131678b..417d7c65ad4 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -178,11 +178,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
 		return is_a3xx(screen) || is_a4xx(screen);
 
-	case PIPE_CAP_INDEP_BLEND_ENABLE:
-	case PIPE_CAP_INDEP_BLEND_FUNC:
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 		return is_a3xx(screen);
 
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+		return is_a3xx(screen) || is_a4xx(screen);
+
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
 		return 256;
 

From eae9c3286e2990879c6a01df3c9042b1e4031d5c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 4 Aug 2015 17:18:43 -0700
Subject: [PATCH 0978/1208] Revert "nir: Use a single bit for the dual-source
 blend index"

This reverts commit ab5b7a0fe659ff6f9c1885d5cb047b6531959506.  We use more
than one bit of value in tgsi_to_nir.
---
 src/glsl/nir/nir.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index d9daaa2e7e2..0603b3eec67 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -292,13 +292,9 @@ typedef struct {
       unsigned int driver_location;
 
       /**
-       * Output index for dual source blending.
-       *
-       * \note
-       * The GLSL spec only allows the values 0 or 1 for the index in \b dual
-       * source blending.
+       * output index for dual source blending.
        */
-      unsigned index:1;
+      int index;
 
       /**
        * Initial binding point for a sampler or UBO.

From 9b403c0756ecf806a8ff768bd73a4cbf42986bdb Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 10:02:45 -0700
Subject: [PATCH 0979/1208] vc4: Drop a dead prototype.

---
 src/gallium/drivers/vc4/vc4_qir.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 80a19713a10..e2d2574f1b1 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -586,14 +586,6 @@ qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i)
         return t;
 }
 
-static inline struct qreg
-qir_SEL_X_0_COND(struct vc4_compile *c, int i)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef));
-        return t;
-}
-
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {

From cc8fb2904673588d31b660dbfaf692615b5202dd Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 11:46:56 -0700
Subject: [PATCH 0980/1208] vc4: Make r4-writes implicitly move to a temp, and
 allocate temps to r4.

Previously, SFU values always moved to a temporary, and TLB color reads
and texture reads always lived in r4.  Instead, we can have these results
just be normal temporaries, and the register allocator can leave the
values in r4 when they don't interfere with anything else using r4.

shader-db results:
total instructions in shared programs: 100809 -> 100040 (-0.76%)
instructions in affected programs:     42383 -> 41614 (-1.81%)
---
 src/gallium/drivers/vc4/vc4_context.h         |  1 +
 .../drivers/vc4/vc4_opt_copy_propagation.c    |  5 +-
 src/gallium/drivers/vc4/vc4_opt_cse.c         | 16 +---
 src/gallium/drivers/vc4/vc4_program.c         | 19 ++---
 src/gallium/drivers/vc4/vc4_qir.c             | 18 ----
 src/gallium/drivers/vc4/vc4_qir.h             | 13 ---
 src/gallium/drivers/vc4/vc4_qpu_emit.c        | 56 ++++++-------
 .../drivers/vc4/vc4_register_allocate.c       | 83 ++++++++++++++-----
 8 files changed, 105 insertions(+), 106 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 30fb285eefe..654c46f3c0d 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
 
         struct ra_regs *regs;
         unsigned int reg_class_any;
+        unsigned int reg_class_r4_or_a;
         unsigned int reg_class_a;
 
         uint8_t prim_mode;
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index d6d2fbf257f..a755de9aa41 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c)
 
                 if (inst->op == QOP_MOV &&
                     inst->dst.file == QFILE_TEMP &&
-                    inst->src[0].file != QFILE_VPM &&
-                    !(inst->src[0].file == QFILE_TEMP &&
-                      (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT ||
-                       c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) {
+                    inst->src[0].file != QFILE_VPM) {
                         movs[inst->dst.index] = inst->src[0];
                 }
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 51a56504e5e..0e5480ea781 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -46,8 +46,7 @@ struct inst_key {
         struct qreg src[4];
         /**
          * If the instruction depends on the flags, how many SFs have been
-         * seen before this instruction, or if it depends on r4, how many r4
-         * writes have been seen.
+         * seen before this instruction.
          */
         uint32_t implicit_arg_update_count;
 };
@@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b)
 
 static struct qinst *
 vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
-             struct qinst *inst, uint32_t sf_count,
-             uint32_t r4_count)
+             struct qinst *inst, uint32_t sf_count)
 {
         if (inst->dst.file != QFILE_TEMP ||
             inst->op == QOP_MOV ||
@@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
                qir_get_op_nsrc(inst->op) * sizeof(key.src[0]));
         if (qir_depends_on_flags(inst))
                 key.implicit_arg_update_count = sf_count;
-        if (qir_reads_r4(inst))
-                key.implicit_arg_update_count = r4_count;
 
         uint32_t hash = _mesa_hash_data(&key, sizeof(key));
         struct hash_entry *entry =
@@ -121,7 +117,7 @@ bool
 qir_opt_cse(struct vc4_compile *c)
 {
         bool progress = false;
-        uint32_t sf_count = 0, r4_count = 0;
+        uint32_t sf_count = 0;
 
         struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
                                                         inst_key_equals);
@@ -138,8 +134,7 @@ qir_opt_cse(struct vc4_compile *c)
                 if (inst->sf) {
                         sf_count++;
                 } else {
-                        struct qinst *cse = vc4_find_cse(c, ht, inst,
-                                                         sf_count, r4_count);
+                        struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count);
                         if (cse) {
                                 inst->src[0] = cse->dst;
                                 for (int i = 1; i < qir_get_op_nsrc(inst->op);
@@ -155,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c)
                                 }
                         }
                 }
-
-                if (qir_writes_r4(inst))
-                        r4_count++;
         }
 
         ralloc_free(ht);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f2742986beb..5e2a3f448a0 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -105,9 +105,8 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                                                      range->size - 4)));
 
         qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
-        struct qreg r4 = qir_TEX_RESULT(c);
         c->num_texture_samples++;
-        return qir_MOV(c, r4);
+        return qir_TEX_RESULT(c);
 }
 
 static struct qreg *
@@ -360,13 +359,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
         qir_TEX_S(c, s, texture_u[next_texture_u++]);
 
         c->num_texture_samples++;
-        struct qreg r4 = qir_TEX_RESULT(c);
+        struct qreg tex = qir_TEX_RESULT(c);
 
         enum pipe_format format = c->key->tex[unit].format;
 
         struct qreg unpacked[4];
         if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4,
+                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
                                                          qir_uniform_ui(c, 8)));
                 struct qreg normalized = qir_FMUL(c, depthf,
                                                   qir_uniform_f(c, 1.0f/0xffffff));
@@ -418,7 +417,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                         unpacked[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_R4_UNPACK(c, r4, i);
+                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
         const uint8_t *format_swiz = vc4_get_format_swizzle(format);
@@ -1305,9 +1304,10 @@ blend_pipeline(struct vc4_compile *c)
         if (c->fs_key->blend.blend_enable ||
             c->fs_key->blend.colormask != 0xf ||
             c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                struct qreg r4 = qir_TLB_COLOR_READ(c);
+                packed_dst_color = qir_TLB_COLOR_READ(c);
                 for (int i = 0; i < 4; i++)
-                        tlb_read_color[i] = qir_R4_UNPACK(c, r4, i);
+                        tlb_read_color[i] = qir_UNPACK_8_F(c,
+                                                           packed_dst_color, i);
                 for (int i = 0; i < 4; i++) {
                         dst_color[i] = get_swizzled_channel(c,
                                                             tlb_read_color,
@@ -1319,11 +1319,6 @@ blend_pipeline(struct vc4_compile *c)
                                 linear_dst_color[i] = dst_color[i];
                         }
                 }
-
-                /* Save the packed value for logic ops.  Can't reuse r4
-                 * because other things might smash it (like sRGB)
-                 */
-                packed_dst_color = qir_MOV(c, r4);
         }
 
         struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 1c96ef4795f..254140a72f5 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TEX_B] = { "tex_b", 0, 2 },
         [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
-        [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
-        [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
-        [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 },
-        [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 },
         [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
         [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
         [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
@@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst)
         }
 }
 
-bool
-qir_reads_r4(struct qinst *inst)
-{
-        switch (inst->op) {
-        case QOP_R4_UNPACK_A:
-        case QOP_R4_UNPACK_B:
-        case QOP_R4_UNPACK_C:
-        case QOP_R4_UNPACK_D:
-                return true;
-        default:
-                return false;
-        }
-}
-
 static void
 qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index e2d2574f1b1..7a74018d9af 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -158,10 +158,6 @@ enum qop {
          * the destination
          */
         QOP_TEX_RESULT,
-        QOP_R4_UNPACK_A,
-        QOP_R4_UNPACK_B,
-        QOP_R4_UNPACK_C,
-        QOP_R4_UNPACK_D
 };
 
 struct queued_qpu_inst {
@@ -442,7 +438,6 @@ bool qir_is_multi_instruction(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
-bool qir_reads_r4(struct qinst *inst);
 bool qir_src_needs_a_file(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
 
@@ -578,14 +573,6 @@ QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
 
-static inline struct qreg
-qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef));
-        return t;
-}
-
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index e1b3f3ce99a..f324056258c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -320,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 abort();
                         }
 
-                        queue(c, qpu_a_MOV(dst, qpu_r4()));
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
 
                         break;
 
@@ -403,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_COLOR_LOAD);
 
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_TLB_COLOR_WRITE:
@@ -452,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_NOP());
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_LOAD_TMU0);
-
-                        break;
-
-                case QOP_R4_UNPACK_A:
-                case QOP_R4_UNPACK_B:
-                case QOP_R4_UNPACK_C:
-                case QOP_R4_UNPACK_D:
-                        assert(src[0].mux == QPU_MUX_R4);
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                       (qinst->op -
-                                                        QOP_R4_UNPACK_A),
-                                                       QPU_UNPACK);
-
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_UNPACK_8A_F:
@@ -475,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_UNPACK_8D_F:
                 case QOP_UNPACK_16A_F:
                 case QOP_UNPACK_16B_F: {
-                        assert(src[0].mux == QPU_MUX_A);
+                        if (src[0].mux == QPU_MUX_R4) {
+                                queue(c, qpu_a_MOV(dst, src[0]));
+                                *last_inst(c) |= QPU_PM;
+                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
+                                                               (qinst->op -
+                                                                QOP_UNPACK_8A_F),
+                                                               QPU_UNPACK);
+                        } else {
+                                assert(src[0].mux == QPU_MUX_A);
 
-                        /* Since we're setting the pack bits, if the
-                         * destination is in A it would get re-packed.
-                         */
-                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
-                                             qpu_rb(31) : dst),
-                                            src[0], src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                  QOP_UNPACK_8A_F],
-                                                       QPU_UNPACK);
+                                /* Since we're setting the pack bits, if the
+                                 * destination is in A it would get re-packed.
+                                 */
+                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
+                                                     qpu_rb(31) : dst),
+                                                    src[0], src[0]));
+                                *last_inst(c) |=
+                                        QPU_SET_FIELD(unpack_map[qinst->op -
+                                                                 QOP_UNPACK_8A_F],
+                                                      QPU_UNPACK);
 
-                        if (dst.mux == QPU_MUX_A) {
-                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                if (dst.mux == QPU_MUX_A) {
+                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                }
                         }
                 }
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 73964b48dca..a29db1f3abe 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
                 /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
                  * vc4_qpu_emit.c
@@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                 /* R4 can't be written as a general purpose register. (it's
                  * TMU_NOSWAP as a write address).
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R4)
+                if (vc4_regs[i].mux == QPU_MUX_R4) {
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
                         continue;
+                }
 
                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
         }
 
-        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
-        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
+        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
                 ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+        }
 
         ra_set_finalize(vc4->regs, NULL);
 }
@@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b)
         return a->priority - b->priority;
 }
 
+#define CLASS_BIT_A			(1 << 0)
+#define CLASS_BIT_B_OR_ACC		(1 << 1)
+#define CLASS_BIT_R4			(1 << 2)
+
 /**
  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
  *
@@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t temp_to_node[c->num_temps];
         uint32_t def[c->num_temps];
         uint32_t use[c->num_temps];
+        uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         memset(def, 0, sizeof(def));
@@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                          c->num_temps);
 
-        for (uint32_t i = 0; i < c->num_temps; i++) {
-                ra_set_node_class(g, i, vc4->reg_class_any);
-        }
-
         /* Compute the live ranges so we can figure out interference.
          */
         uint32_t ip = 0;
@@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 temp_to_node[map[i].temp] = i;
         }
 
-        /* Figure out our register classes and preallocated registers*/
+        /* Figure out our register classes and preallocated registers.  We
+         * start with any temp being able to be in any file, then instructions
+         * incrementally remove bits that the temp definitely can't be in.
+         */
+        memset(class_bits,
+               CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+               sizeof(class_bits));
+
+        ip = 0;
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+                if (qir_writes_r4(inst)) {
+                        /* This instruction writes r4 (and optionally moves
+                         * its result to a temp), so nothing else can be
+                         * stored in r4 across it.
+                         */
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (def[i] < ip && use[i] > ip)
+                                        class_bits[i] &= ~CLASS_BIT_R4;
+                        }
+                } else {
+                        /* R4 can't be written as a general purpose
+                         * register. (it's TMU_NOSWAP as a write address).
+                         */
+                        if (inst->dst.file == QFILE_TEMP)
+                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
+                }
+
                 switch (inst->op) {
                 case QOP_FRAG_Z:
                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
@@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                         break;
 
-                case QOP_TEX_RESULT:
-                case QOP_TLB_COLOR_READ:
-                        assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
-                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
-                                        ACC_INDEX + 4);
-                        break;
-
                 case QOP_PACK_SCALED:
                         /* The pack flags require an A-file dst register. */
-                        ra_set_node_class(g, temp_to_node[inst->dst.index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                         break;
 
                 default:
@@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 }
 
                 if (qir_src_needs_a_file(inst)) {
-                        ra_set_node_class(g, temp_to_node[inst->src[0].index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                }
+                ip++;
+        }
+
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                int node = temp_to_node[i];
+
+                switch (class_bits[i]) {
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+                        ra_set_node_class(g, node, vc4->reg_class_any);
+                        break;
+                case CLASS_BIT_A | CLASS_BIT_R4:
+                        ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+                        break;
+                case CLASS_BIT_A:
+                        ra_set_node_class(g, node, vc4->reg_class_a);
+                        break;
+                default:
+                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
+                                i, class_bits[i]);
+                        abort();
+                        break;
                 }
         }
 

From 63eac5de8fd0e091d07f866a42584c057ca4bfa9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 17:08:46 -0700
Subject: [PATCH 0981/1208] vc4: Don't bother saturating the dst color for
 blending.

Since we just pulled it out of the destination as 8-bit unorm, we know
it's in [0, 1] already.

shader-db:
total instructions in shared programs: 100040 -> 98208 (-1.83%)
instructions in affected programs:     14084 -> 12252 (-13.01%)
---
 src/gallium/drivers/vc4/vc4_program.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 5e2a3f448a0..d6c60739e93 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1130,14 +1130,8 @@ vc4_blend(struct vc4_compile *c, struct qreg *result,
                 return;
         }
 
-        struct qreg clamped_src[4];
-        struct qreg clamped_dst[4];
-        for (int i = 0; i < 4; i++) {
-                clamped_src[i] = qir_SAT(c, src_color[i]);
-                clamped_dst[i] = qir_SAT(c, dst_color[i]);
-        }
-        src_color = clamped_src;
-        dst_color = clamped_dst;
+        for (int i = 0; i < 4; i++)
+                src_color[i] = qir_SAT(c, src_color[i]);
 
         struct qreg src_blend[4], dst_blend[4];
         for (int i = 0; i < 3; i++) {

From a70f63ab20d8bf922a307a92020237b1dec36314 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 09:12:48 -0700
Subject: [PATCH 0982/1208] nir: Add algebraic opt for no-op iand.

I lazily generated some of these in VC4 NIR lowering.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/nir/nir_opt_algebraic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 3068445cbfe..d7c17403f9f 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -132,6 +132,7 @@ optimizations = [
    # Logical and bit operations
    (('fand', a, 0.0), 0.0),
    (('iand', a, a), a),
+   (('iand', a, ~0), a),
    (('iand', a, 0), 0),
    (('ior', a, a), a),
    (('ior', a, 0), a),

From 45248d3640f5a0356085e26c44548bf3af5dec0f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 3 Aug 2015 19:08:37 -0700
Subject: [PATCH 0983/1208] vc4: Don't bother de-SSAing values that aren't part
 of phi webs.

We can just support them the same way we do load_const's SSA values.
---
 src/gallium/drivers/vc4/vc4_program.c | 59 ++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index d6c60739e93..307667e23c8 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -110,18 +110,32 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
 }
 
 static struct qreg *
-ntq_get_dest(struct vc4_compile *c, nir_dest dest)
+ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
-        assert(!dest.is_ssa);
-        nir_register *reg = dest.reg.reg;
-        struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg);
-        assert(reg->num_array_elems == 0);
-        assert(dest.reg.base_offset == 0);
-
-        struct qreg *qregs = entry->data;
+        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
+                                          def->num_components);
+        _mesa_hash_table_insert(c->def_ht, def, qregs);
         return qregs;
 }
 
+static struct qreg *
+ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
+{
+        if (dest->is_ssa) {
+                struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);
+                for (int i = 0; i < dest->ssa.num_components; i++)
+                        qregs[i] = c->undef;
+                return qregs;
+        } else {
+                nir_register *reg = dest->reg.reg;
+                assert(dest->reg.base_offset == 0);
+                assert(reg->num_array_elems == 0);
+                struct hash_entry *entry =
+                        _mesa_hash_table_search(c->def_ht, reg);
+                return entry->data;
+        }
+}
+
 static struct qreg
 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
 {
@@ -433,7 +447,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                                             texture_output[i]);
         }
 
-        struct qreg *dest = ntq_get_dest(c, instr->dest);
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
                 dest[i] = get_swizzled_channel(c, texture_output,
                                                c->key->tex[unit].swizzle[i]);
@@ -800,7 +814,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                         srcs[i] = ntq_get_src(c, instr->src[i].src,
                                               instr->src[i].swizzle[0]);
-                struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                         dest[i] = srcs[i];
                 return;
@@ -814,7 +828,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 
         /* Pick the channel to store the output in. */
         assert(!instr->dest.saturate);
-        struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
         assert(util_is_power_of_two(instr->dest.write_mask));
         dest += ffs(instr->dest.write_mask) - 1;
 
@@ -1761,14 +1775,25 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
 {
-        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
-                                          instr->def.num_components);
+        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = qir_uniform_ui(c, instr->value.u[i]);
 
         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
 }
 
+static void
+ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
+{
+        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
+
+        /* QIR needs there to be *some* value, so pick 0 (same as for
+         * ntq_setup_registers().
+         */
+        for (int i = 0; i < instr->def.num_components; i++)
+                qregs[i] = qir_uniform_ui(c, 0);
+}
+
 static void
 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 {
@@ -1776,7 +1801,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
         struct qreg *dest = NULL;
 
         if (info->has_dest) {
-                dest = ntq_get_dest(c, instr->dest);
+                dest = ntq_get_dest(c, &instr->dest);
         }
 
         switch (instr->intrinsic) {
@@ -1844,6 +1869,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                 break;
 
+        case nir_instr_type_ssa_undef:
+                ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
+                break;
+
         case nir_instr_type_tex:
                 ntq_emit_tex(c, nir_instr_as_tex(instr));
                 break;
@@ -2008,7 +2037,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
 
         nir_remove_dead_variables(c->s);
 
-        nir_convert_from_ssa(c->s, false);
+        nir_convert_from_ssa(c->s, true);
 
         if (vc4_debug & VC4_DEBUG_SHADERDB) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",

From 6c28ee20410afe97dd441b0c9c680b26eb4072fc Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 3 Aug 2015 17:20:33 -0700
Subject: [PATCH 0984/1208] nir: Add a nir_lower_load_const_to_scalar() pass.

This is useful to increase the CSE opportunities for a scalar backend.  It
avoids regressions when dropping vc4's custom CSE implementation.

v2: Cleanups by Matt (decl in the for loop, and unreachable()).
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/Makefile.sources                     |   1 +
 src/glsl/nir/nir.h                            |   1 +
 src/glsl/nir/nir_lower_load_const_to_scalar.c | 103 ++++++++++++++++++
 3 files changed, 105 insertions(+)
 create mode 100644 src/glsl/nir/nir_lower_load_const_to_scalar.c

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 93f4e48d615..a0e85ed9e4a 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -33,6 +33,7 @@ NIR_FILES = \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
 	nir/nir_lower_global_vars_to_local.c \
+	nir/nir_lower_load_const_to_scalar.c \
 	nir/nir_lower_locals_to_regs.c \
 	nir/nir_lower_idiv.c \
 	nir/nir_lower_io.c \
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 0603b3eec67..9aae6d70354 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1654,6 +1654,7 @@ void nir_remove_dead_variables(nir_shader *shader);
 
 void nir_lower_vec_to_movs(nir_shader *shader);
 void nir_lower_alu_to_scalar(nir_shader *shader);
+void nir_lower_load_const_to_scalar(nir_shader *shader);
 
 void nir_lower_phis_to_scalar(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_lower_load_const_to_scalar.c b/src/glsl/nir/nir_lower_load_const_to_scalar.c
new file mode 100644
index 00000000000..a90e5245898
--- /dev/null
+++ b/src/glsl/nir/nir_lower_load_const_to_scalar.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/macros.h"
+#include "nir.h"
+#include "nir_builder.h"
+
+/** @file nir_lower_load_const_to_scalar.c
+ *
+ * Replaces vector nir_load_const instructions with a series of loads and a
+ * vec[234] to reconstruct the original vector (on the assumption that
+ * nir_lower_alu_to_scalar() will then be used to split it up).
+ *
+ * This gives NIR a chance to CSE more operations on a scalar shader, when the
+ * same value was used in different vector contant loads.
+ */
+
+static void
+lower_load_const_instr_scalar(nir_load_const_instr *lower)
+{
+   if (lower->def.num_components == 1)
+      return;
+
+   nir_builder b;
+   nir_builder_init(&b, nir_cf_node_get_function(&lower->instr.block->cf_node));
+   nir_builder_insert_before_instr(&b, &lower->instr);
+
+   /* Emit the individual loads. */
+   nir_ssa_def *loads[4];
+   for (unsigned i = 0; i < lower->def.num_components; i++) {
+      nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
+      load_comp->value.u[0] = lower->value.u[i];
+      nir_builder_instr_insert(&b, &load_comp->instr);
+      loads[i] = &load_comp->def;
+   }
+
+   /* Batch things back together into a vector. */
+   nir_ssa_def *vec;
+   switch (lower->def.num_components) {
+   case 2:
+      vec = nir_vec2(&b, loads[0], loads[1]);
+      break;
+   case 3:
+      vec = nir_vec3(&b, loads[0], loads[1], loads[2]);
+      break;
+   case 4:
+      vec = nir_vec4(&b, loads[0], loads[1], loads[2], loads[3]);
+      break;
+   default:
+      unreachable("Unknown load_const component count.");
+   }
+
+   /* Replace the old load with a reference to our reconstructed vector. */
+   nir_ssa_def_rewrite_uses(&lower->def, nir_src_for_ssa(vec),
+                            ralloc_parent(b.impl));
+   nir_instr_remove(&lower->instr);
+}
+
+static bool
+lower_load_const_to_scalar_block(nir_block *block, void *data)
+{
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type == nir_instr_type_load_const)
+         lower_load_const_instr_scalar(nir_instr_as_load_const(instr));
+   }
+
+   return true;
+}
+
+static void
+nir_lower_load_const_to_scalar_impl(nir_function_impl *impl)
+{
+   nir_foreach_block(impl, lower_load_const_to_scalar_block, NULL);
+}
+
+void
+nir_lower_load_const_to_scalar(nir_shader *shader)
+{
+   nir_foreach_overload(shader, overload) {
+      if (overload->impl)
+         nir_lower_load_const_to_scalar_impl(overload->impl);
+   }
+}

From ee47d13abbc6770b4e6513c894ede56b1e846785 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 3 Aug 2015 17:36:47 -0700
Subject: [PATCH 0985/1208] vc4: Use nir_lower_load_const_to_scalar().

---
 src/gallium/drivers/vc4/vc4_program.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 307667e23c8..e9120b7a1ff 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2032,6 +2032,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         nir_convert_to_ssa(c->s);
         vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
+        nir_lower_load_const_to_scalar(c->s);
 
         vc4_optimize_nir(c->s);
 

From cfc3200a35647026a0b5cf188f378ce33802044b Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Fri, 31 Jul 2015 09:11:46 +0100
Subject: [PATCH 0986/1208] egl/dri: Add error info needed for
 EGL_EXT_image_dma_buf_import extension

Update the DRI image interface error codes to reflect the needs of the
EGL_EXT_image_dma_buf_import extension. This means updating the existing error
code documentation and adding a new __DRI_IMAGE_ERROR_BAD_ACCESS error code
so that drivers can correctly reject unsupported pitches and offsets. Hook
the new error code up in EGL to return EGL_BAD_ACCESS.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 include/GL/internal/dri_interface.h | 8 ++++++--
 src/egl/drivers/dri2/egl_dri2.c     | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index e7cf50df83f..a0f155a1f42 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1178,7 +1178,8 @@ enum __DRIChromaSiting {
 };
 
 /**
- * \name Reasons that __DRIimageExtensionRec::createImageFromTexture might fail
+ * \name Reasons that __DRIimageExtensionRec::createImageFromTexture or
+ * __DRIimageExtensionRec::createImageFromDmaBufs might fail
  */
 /*@{*/
 /** Success! */
@@ -1187,11 +1188,14 @@ enum __DRIChromaSiting {
 /** Memory allocation failure */
 #define __DRI_IMAGE_ERROR_BAD_ALLOC     1
 
-/** Client requested an invalid attribute for a texture object  */
+/** Client requested an invalid attribute */
 #define __DRI_IMAGE_ERROR_BAD_MATCH     2
 
 /** Client requested an invalid texture object */
 #define __DRI_IMAGE_ERROR_BAD_PARAMETER 3
+
+/** Client requested an invalid pitch and/or offset */
+#define __DRI_IMAGE_ERROR_BAD_ACCESS    4
 /*@}*/
 
 /**
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 47f52a5e6d8..0290c077ef6 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -1545,6 +1545,10 @@ dri2_create_image_khr_texture_error(int dri_error)
       egl_error = EGL_BAD_PARAMETER;
       break;
 
+   case __DRI_IMAGE_ERROR_BAD_ACCESS:
+      egl_error = EGL_BAD_ACCESS;
+      break;
+
    default:
       assert(0);
       egl_error = EGL_BAD_MATCH;

From b2c5986ea1c8e66c4e0a05bcacbcf28c27f5b183 Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Fri, 31 Jul 2015 09:11:45 +0100
Subject: [PATCH 0987/1208] egl: Add eglQuerySurface surface type check for
 EGL_LARGEST_PBUFFER attrib

Calling eglQuerySurface on a window or pixmap with the EGL_LARGEST_PBUFFER
attribute resulted in the contents of the 'value' parameter being modified.
This is the wrong behaviour according to the EGL spec, which states:

    "Querying EGL_LARGEST_PBUFFER for a pbuffer surface returns the
     same attribute value specified when the surface was created with
     eglCreatePbufferSurface. For a window or pixmap surface, the
     contents of value are not modified."

Avoid this from happening by checking that the surface type is EGL_PBUFFER_BIT
before modifying the contents of the parameter.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/main/eglsurface.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 541353f9e0a..4fa43f3e2b1 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -326,7 +326,8 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
       *value = surface->Config->ConfigID;
       break;
    case EGL_LARGEST_PBUFFER:
-      *value = surface->LargestPbuffer;
+      if (surface->Type == EGL_PBUFFER_BIT)
+         *value = surface->LargestPbuffer;
       break;
    case EGL_TEXTURE_FORMAT:
       /* texture attributes: only for pbuffers, no error otherwise */

From 7d88413ade2c41054f79b20338253aacf1ac341d Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Fri, 31 Jul 2015 09:11:47 +0100
Subject: [PATCH 0988/1208] dri: set the __DRI_API_OPENGL bit based on max gl
 compat version

This matches similar behaviour for the __DRI_API_OPENGL_CORE bit.

Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/mesa/drivers/dri/common/dri_util.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 884a7e0f650..d35ac263a45 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -163,7 +163,9 @@ driCreateNewScreen2(int scrn, int fd,
        }
     }
 
-    psp->api_mask = (1 << __DRI_API_OPENGL);
+    psp->api_mask = 0;
+    if (psp->max_gl_compat_version > 0)
+       psp->api_mask |= (1 << __DRI_API_OPENGL);
     if (psp->max_gl_core_version > 0)
        psp->api_mask |= (1 << __DRI_API_OPENGL_CORE);
     if (psp->max_gl_es1_version > 0)

From 18c5cdb9433b472d9aad13175295a848bce03185 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 3 Aug 2015 08:48:32 +0300
Subject: [PATCH 0989/1208] glsl: add variable mode check to build_stageref
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently stage reference mask is built using the variable name
only. However it can happen that input of one stage has same name
as output from another stage. Adding check of variable mode makes
sure we do not pick wrong variable.

Fixes some subcases from
   ES31-CTS.program_interface_query.no-locations

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/linker.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a16dab4ddcd..e2da0af2521 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3110,7 +3110,8 @@ add_program_resource(struct gl_shader_program *prog, GLenum type,
  * Function builds a stage reference bitmask from variable name.
  */
 static uint8_t
-build_stageref(struct gl_shader_program *shProg, const char *name)
+build_stageref(struct gl_shader_program *shProg, const char *name,
+               unsigned mode)
 {
    uint8_t stages = 0;
 
@@ -3131,6 +3132,13 @@ build_stageref(struct gl_shader_program *shProg, const char *name)
          ir_variable *var = node->as_variable();
          if (var) {
             unsigned baselen = strlen(var->name);
+
+            /* Type needs to match if specified, otherwise we might
+             * pick a variable with same name but different interface.
+             */
+            if (mode != 0 && var->data.mode != mode)
+               continue;
+
             if (strncmp(var->name, name, baselen) == 0) {
                /* Check for exact name matches but also check for arrays and
                 * structs.
@@ -3188,7 +3196,8 @@ add_interface_variables(struct gl_shader_program *shProg,
       };
 
       if (!add_program_resource(shProg, programInterface, var,
-                                build_stageref(shProg, var->name) | mask))
+                                build_stageref(shProg, var->name,
+                                               var->data.mode) | mask))
          return false;
    }
    return true;
@@ -3241,7 +3250,7 @@ build_program_resource_list(struct gl_context *ctx,
       for (int i = 0; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
          uint8_t stageref =
             build_stageref(shProg,
-                           shProg->LinkedTransformFeedback.Varyings[i].Name);
+                           shProg->LinkedTransformFeedback.Varyings[i].Name, 0);
          if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_VARYING,
                                    &shProg->LinkedTransformFeedback.Varyings[i],
                                    stageref))
@@ -3256,7 +3265,7 @@ build_program_resource_list(struct gl_context *ctx,
          continue;
 
       uint8_t stageref =
-         build_stageref(shProg, shProg->UniformStorage[i].name);
+         build_stageref(shProg, shProg->UniformStorage[i].name, 0);
 
       /* Add stagereferences for uniforms in a uniform block. */
       int block_index = shProg->UniformStorage[i].block_index;

From 784bea5a38c219a5ab587ff1ddce8879d4f7dce1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 4 Aug 2015 11:09:35 +0300
Subject: [PATCH 0990/1208] mesa: do not modify args when errors with
 GetProgramResourceName
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Original purpose of these lines was to be more friendly against
GUI tools using the extension. However conformance suite explicitly
checks that buffers are not modified in error conditions.

Fixes:
   ES31-CTS.program_interface_query.buff-length

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/mesa/main/program_resource.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 3864775818c..23d2b4d2da0 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -274,12 +274,6 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
       _mesa_lookup_shader_program_err(ctx, program,
                                       "glGetProgramResourceName");
 
-   /* Set user friendly return values in case of errors. */
-   if (name)
-      *name = '\0';
-   if (length)
-      *length = 0;
-
    if (!shProg || !name)
       return;
 

From b38a50f1e3edae6079c91f73a8d9c63a2dbf512a Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 29 Jul 2015 16:01:23 +0200
Subject: [PATCH 0991/1208] mesa: Fix errors values returned by
 glShaderBinary()

Page 68, section 7.2 'Shader Binaries" of the of the OpenGL ES 3.1,
and page 88 of the OpenGL 4.5 specs state:

    "An INVALID_VALUE error is generated if count or length is negative.
     An INVALID_ENUM error is generated if binaryformat is not a supported
     format returned in SHADER_BINARY_FORMATS."

Currently, an INVALID_OPERATION error is returned for all cases.

Fixes 1 dEQP test:
* dEQP-GLES3.functional.negative_api.shader.shader_binary

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: 10.6 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/shaderapi.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index b702dcde47a..5b28a2caf3c 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1793,12 +1793,23 @@ _mesa_ShaderBinary(GLint n, const GLuint* shaders, GLenum binaryformat,
                    const void* binary, GLint length)
 {
    GET_CURRENT_CONTEXT(ctx);
-   (void) n;
    (void) shaders;
    (void) binaryformat;
    (void) binary;
-   (void) length;
-   _mesa_error(ctx, GL_INVALID_OPERATION, "glShaderBinary");
+
+   /* Page 68, section 7.2 'Shader Binaries" of the of the OpenGL ES 3.1, and
+    * page 88 of the OpenGL 4.5 specs state:
+    *
+    *     "An INVALID_VALUE error is generated if count or length is negative.
+    *      An INVALID_ENUM error is generated if binaryformat is not a supported
+    *      format returned in SHADER_BINARY_FORMATS."
+    */
+   if (n < 0 || length < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glShaderBinary(count or length < 0)");
+      return;
+   }
+
+   _mesa_error(ctx, GL_INVALID_ENUM, "glShaderBinary(format)");
 }
 
 

From 5d64cae8427b090c42d6d38da7fb474b3ddd4eb0 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 29 Jul 2015 16:01:26 +0200
Subject: [PATCH 0992/1208] mesa: Validate target before resolving tex obj in
 glTex(ture)SubImageXD

Currently, glTexSubImageXD attempt to resolve the texture object
(by calling _mesa_get_current_tex_object()) before validating the given
target. However, that method explicitly states that target must have been
validated before calling it, so it never returns a user error.

The target validation occurs later when texsubimage_error_check() is called.

This patch reorganizes target validation, taking it out from the error check
function and into a point before the texture object is resolved.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: 10.6 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/teximage.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 841ec364020..115dee644b4 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2487,13 +2487,6 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   /* check target (proxies not allowed) */
-   if (!legal_texsubimage_target(ctx, dimensions, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)",
-                  callerName, _mesa_enum_to_string(target));
-      return GL_TRUE;
-   }
-
    /* level check */
    if (level < 0 || level >= _mesa_max_texture_levels(ctx, target)) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(level=%d)", callerName, level);
@@ -3522,14 +3515,6 @@ _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims,
 {
    FLUSH_VERTICES(ctx, 0);
 
-   /* check target (proxies not allowed) */
-   if (!legal_texsubimage_target(ctx, dims, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sSubImage%uD(target=%s)",
-                  dsa ? "ture" : "",
-                  dims, _mesa_enum_to_string(target));
-      return;
-   }
-
    if (ctx->NewState & _NEW_PIXEL)
       _mesa_update_state(ctx);
 
@@ -3579,6 +3564,13 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
 
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dims, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glTexSubImage%uD(target=%s)",
+                  dims, _mesa_enum_to_string(target));
+      return;
+   }
+
    texObj = _mesa_get_current_tex_object(ctx, target);
    if (!texObj)
       return;
@@ -3639,6 +3631,13 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
       return;
    }
 
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dims, texObj->Target, true)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)",
+                  callerName, _mesa_enum_to_string(texObj->Target));
+      return;
+   }
+
    if (texsubimage_error_check(ctx, dims, texObj, texObj->Target, level,
                                xoffset, yoffset, zoffset,
                                width, height, depth, format, type,

From 4b07e9a033ddb6733eba206b5bd47a2373756f7d Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 29 Jul 2015 16:01:28 +0200
Subject: [PATCH 0993/1208] mesa: Fix error returned by glCopyTexImage2D() upon
 an invalid internal format

Page 161 of the OpenGL-ES 3.1 (PDF) spec, and page 207 of the OpenGL 4.5 (PDF),
both on section '8.6. ALTERNATE TEXTURE IMAGE SPECIFICATION COMMANDS', states:

    "An INVALID_ENUM error is generated if an invalid value is specified for
     internalformat".

It is currently returning INVALID_OPERATION error because
_mesa_get_read_renderbuffer_for_format() is called before the internalformat
argument has been validated. To fix this, we move this call down the validation
process, after _mesa_base_tex_format() has been called. _mesa_base_tex_format()
effectively serves as a validator for the internal format.

Fixes 1 dEQP test:
* dEQP-GLES3.functional.negative_api.texture.copyteximage2d_invalid_format

Fixes 1 piglit test:
* spec@oes_compressed_etc1_rgb8_texture@basic

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: 10.6 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/teximage.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 115dee644b4..d19ad641a5f 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2630,13 +2630,6 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
-   if (rb == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glCopyTexImage%dD(read buffer)", dimensions);
-      return GL_TRUE;
-   }
-
    /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
     * internalFormat.
     */
@@ -2649,7 +2642,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       case GL_LUMINANCE_ALPHA:
          break;
       default:
-         _mesa_error(ctx, GL_INVALID_VALUE,
+         _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
                      _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
@@ -2658,12 +2651,19 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
 
    baseFormat = _mesa_base_tex_format(ctx, internalFormat);
    if (baseFormat < 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
+      _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCopyTexImage%dD(internalFormat=%s)", dimensions,
                   _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
+   rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
+   if (rb == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glCopyTexImage%dD(read buffer)", dimensions);
+      return GL_TRUE;
+   }
+
    rb_internal_format = rb->InternalFormat;
    rb_base_format = _mesa_base_tex_format(ctx, rb->InternalFormat);
    if (_mesa_is_color_format(internalFormat)) {

From 03b7221dbb93e2439f30b2e0918f6215eb741979 Mon Sep 17 00:00:00 2001
From: Eduardo Lima Mitev <elima@igalia.com>
Date: Wed, 29 Jul 2015 16:01:27 +0200
Subject: [PATCH 0994/1208] mesa: Add missing check of format and type in
 glTexSubImageXD on GLES 3.0

Argument validation for glTexSubImageXD is missing a check of format and type
against texture object's internal format when profile is OpenGL-ES 3.0+.

This patch also groups together all format and type checks on GLES into a
new function texture_format_error_check_gles(), to factorize similar
code in texture_format_error_check().

Fixes 2 dEQP tests:
* dEQP-GLES3.functional.negative_api.texture.texsubimage2d
* dEQP-GLES3.functional.negative_api.texture.texsubimage3d

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/teximage.c | 116 +++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 47 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index d19ad641a5f..fc69387204f 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2088,6 +2088,53 @@ texture_formats_agree(GLenum internalFormat,
    return true;
 }
 
+/**
+ * Test the combination of format, type and internal format arguments of
+ * different texture operations on GLES.
+ *
+ * \param ctx GL context.
+ * \param format pixel data format given by the user.
+ * \param type pixel data type given by the user.
+ * \param internalFormat internal format given by the user.
+ * \param dimensions texture image dimensions (must be 1, 2 or 3).
+ * \param callerName name of the caller function to print in the error message
+ *
+ * \return true if a error is found, false otherwise
+ *
+ * Currently, it is used by texture_error_check() and texsubimage_error_check().
+ */
+static bool
+texture_format_error_check_gles(struct gl_context *ctx, GLenum format,
+                                GLenum type, GLenum internalFormat,
+                                GLuint dimensions, const char *callerName)
+{
+   GLenum err;
+
+   if (_mesa_is_gles3(ctx)) {
+      err = _mesa_es3_error_check_format_and_type(ctx, format, type,
+                                                  internalFormat);
+      if (err != GL_NO_ERROR) {
+         _mesa_error(ctx, err,
+                     "%s(format = %s, type = %s, internalformat = %s)",
+                     callerName, _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type),
+                     _mesa_enum_to_string(internalFormat));
+         return true;
+      }
+   }
+   else {
+      err = _mesa_es_error_check_format_and_type(format, type, dimensions);
+      if (err != GL_NO_ERROR) {
+         _mesa_error(ctx, err, "%s(format = %s, type = %s)",
+                     callerName, _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type));
+         return true;
+      }
+   }
+
+   return false;
+}
+
 /**
  * Test the glTexImage[123]D() parameters for errors.
  *
@@ -2159,32 +2206,10 @@ texture_error_check( struct gl_context *ctx,
     * Formats and types that require additional extensions (e.g., GL_FLOAT
     * requires GL_OES_texture_float) are filtered elsewhere.
     */
-
-   if (_mesa_is_gles(ctx)) {
-      if (_mesa_is_gles3(ctx)) {
-         err = _mesa_es3_error_check_format_and_type(ctx, format, type,
-                                                     internalFormat);
-      } else {
-         if (format != internalFormat) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glTexImage%dD(format = %s, internalFormat = %s)",
-                        dimensions,
-                        _mesa_enum_to_string(format),
-                        _mesa_enum_to_string(internalFormat));
-            return GL_TRUE;
-         }
-
-         err = _mesa_es_error_check_format_and_type(format, type, dimensions);
-      }
-      if (err != GL_NO_ERROR) {
-         _mesa_error(ctx, err,
-                     "glTexImage%dD(format = %s, type = %s, internalFormat = %s)",
-                     dimensions,
-                     _mesa_enum_to_string(format),
-                     _mesa_enum_to_string(type),
-                     _mesa_enum_to_string(internalFormat));
-         return GL_TRUE;
-      }
+   if (_mesa_is_gles(ctx) &&
+       texture_format_error_check_gles(ctx, format, type, internalFormat,
+                                       dimensions, "glTexImage%dD")) {
+     return GL_TRUE;
    }
 
    /* Check internalFormat */
@@ -2493,19 +2518,12 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
-    * combinations of format and type that can be used.  Formats and types
-    * that require additional extensions (e.g., GL_FLOAT requires
-    * GL_OES_texture_float) are filtered elsewhere.
-    */
-   if (_mesa_is_gles(ctx) && !_mesa_is_gles3(ctx)) {
-      err = _mesa_es_error_check_format_and_type(format, type, dimensions);
-      if (err != GL_NO_ERROR) {
-         _mesa_error(ctx, err, "%s(format = %s, type = %s)",
-                     callerName, _mesa_enum_to_string(format),
-                     _mesa_enum_to_string(type));
-         return GL_TRUE;
-      }
+   texImage = _mesa_select_tex_image(texObj, target, level);
+   if (!texImage) {
+      /* non-existant texture level */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture image)",
+                  callerName);
+      return GL_TRUE;
    }
 
    err = _mesa_error_check_format_and_type(ctx, format, type);
@@ -2517,6 +2535,18 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
+   /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
+    * combinations of format, internalFormat, and type that can be used.
+    * Formats and types that require additional extensions (e.g., GL_FLOAT
+    * requires GL_OES_texture_float) are filtered elsewhere.
+    */
+   if (_mesa_is_gles(ctx) &&
+       texture_format_error_check_gles(ctx, format, type,
+                                       texImage->InternalFormat,
+                                       dimensions, callerName)) {
+      return GL_TRUE;
+   }
+
    /* validate the bound PBO, if any */
    if (!_mesa_validate_pbo_source(ctx, dimensions, &ctx->Unpack,
                                   width, height, depth, format, type,
@@ -2524,14 +2554,6 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   if (!texImage) {
-      /* non-existant texture level */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture image)",
-                  callerName);
-      return GL_TRUE;
-   }
-
    if (error_check_subtexture_dimensions(ctx, dimensions,
                                          texImage, xoffset, yoffset, zoffset,
                                          width, height, depth, callerName)) {

From 2c61d583f8c931fc9834dd852b1c960c95acefb5 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 5 Aug 2015 20:27:24 +1000
Subject: [PATCH 0995/1208] nir: add missing type to type_size_vec4()

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/nir/nir_lower_io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 3c179299bd9..71bfd347c1a 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -62,6 +62,8 @@ type_size_vec4(const struct glsl_type *type)
          size += type_size_vec4(glsl_get_struct_field(type, i));
       }
       return size;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
    case GLSL_TYPE_SAMPLER:
       return 0;
    case GLSL_TYPE_ATOMIC_UINT:

From f7ac4ef4eeea737115d0b574fed7ecae46426072 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Wed, 5 Aug 2015 18:17:14 +0900
Subject: [PATCH 0996/1208] glsl: Initialize patch member of glsl_struct_field

There is apparently a subtle difference in C++ between

    F f;

and

    F f();

The former will use the default constructor.  If there is no default
constructor specified, the compiler provides one that simply invokes the
default constructor for each field.  For built-in basic types, the
default constructor does nothing.  The later will, according to
http://stackoverflow.com/questions/2417065/does-the-default-constructor-initialize-built-in-types)
perform value-initialization of the type.  For built-in types this means
initializing to zero.

The per_vertex_accumulator constructor is:

    per_vertex_accumulator::per_vertex_accumulator()
       : fields(),
         num_fields(0)
    {
    }

This is the second form of constructor, so the glsl_struct_field
objects were previously zero initialized.  With the addition of an empty
default constructor in commit 7ac946e5, per_vertex_accumulator::fields
receive no initialization.

Fixes a bunch of random (mostly tessellation related) piglit failures
since commit 7ac946e5 ("glsl: Add constuctors for the common cases of
glsl_struct_field").

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91544
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/glsl/builtin_variables.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index 0ff3a3f7062..53d3500b1f4 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -322,6 +322,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].interpolation = INTERP_QUALIFIER_NONE;
    this->fields[this->num_fields].centroid = 0;
    this->fields[this->num_fields].sample = 0;
+   this->fields[this->num_fields].patch = 0;
    this->num_fields++;
 }
 
@@ -1159,6 +1160,7 @@ builtin_variable_generator::generate_varyings()
          var->data.interpolation = fields[i].interpolation;
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
+         var->data.patch = fields[i].patch;
          var->init_interface_type(per_vertex_out_type);
       }
    }

From 8f5d0988ea2ccaba7f049f113b652f331524d2a6 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 4 Aug 2015 19:04:55 +0300
Subject: [PATCH 0997/1208] i965: Define virtual instruction to calculate the
 high 32 bits of a multiply.

This instruction will translate to the MUL/MACH sequence that computes
the high 32-bits of the result of a 64-bit multiply.  Before Gen8
integer operations that used the accumulator were limited to 8-wide,
but the SIMD lowering pass can easily be hooked up to sidestep this
limitation, we just need a virtual opcode to represent the MUL/MACH
sequence in the IR.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h                 | 5 +++++
 src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp   | 1 +
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp                | 1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp                | 4 ++++
 src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp | 1 +
 src/mesa/drivers/dri/i965/brw_vec4_cse.cpp              | 1 +
 6 files changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index f595366eeac..9c232c46ff3 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1214,6 +1214,11 @@ enum opcode {
     * GLSL barrier()
     */
    SHADER_OPCODE_BARRIER,
+
+   /**
+    * Calculate the high 32-bits of a 32x32 multiply.
+    */
+   SHADER_OPCODE_MULH,
 };
 
 enum brw_urb_write_flags {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 269bdb5b272..5445ad55670 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -534,6 +534,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
 
       case BRW_OPCODE_MACH:
       case BRW_OPCODE_MUL:
+      case SHADER_OPCODE_MULH:
       case BRW_OPCODE_ADD:
       case BRW_OPCODE_OR:
       case BRW_OPCODE_AND:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index a123ff2d89b..c7628dcc2f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -61,6 +61,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case BRW_OPCODE_CMPN:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_FRC:
    case BRW_OPCODE_RNDU:
    case BRW_OPCODE_RNDD:
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 819e4f2d3e4..bccf8d6d8d2 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -778,6 +778,8 @@ brw_instruction_name(enum opcode op)
       return "cs_terminate";
    case SHADER_OPCODE_BARRIER:
       return "barrier";
+   case SHADER_OPCODE_MULH:
+      return "mulh";
    }
 
    unreachable("not reached");
@@ -996,6 +998,7 @@ backend_instruction::is_commutative() const
    case BRW_OPCODE_XOR:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
       return true;
    case BRW_OPCODE_SEL:
       /* MIN and MAX are commutative. */
@@ -1103,6 +1106,7 @@ backend_instruction::can_do_saturate() const
    case BRW_OPCODE_MATH:
    case BRW_OPCODE_MOV:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_PLN:
    case BRW_OPCODE_RNDD:
    case BRW_OPCODE_RNDE:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index 2d9afa8145f..5a15eb89766 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -179,6 +179,7 @@ try_constant_propagate(const struct brw_device_info *devinfo,
 
    case BRW_OPCODE_MACH:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_OR:
    case BRW_OPCODE_AND:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index c9fe0cebf27..5a277f74c44 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -62,6 +62,7 @@ is_expression(const vec4_instruction *const inst)
    case BRW_OPCODE_CMPN:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_FRC:
    case BRW_OPCODE_RNDU:
    case BRW_OPCODE_RNDD:

From f5b37fb1acad9cf044b7b6d4fa5f2582bd8bc7f4 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 5 Aug 2015 16:43:37 +0300
Subject: [PATCH 0998/1208] i965/fs: Lower 32x32 bit multiplication on BXT.

AFAIK BXT has the same annoying alignment limitation as CHV on the
source register regions of 32x32 bit MULs, give it the same treatment.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 15fe3648af8..0278237101e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3130,9 +3130,9 @@ fs_visitor::lower_integer_multiplication()
    bool progress = false;
 
    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but Cherryview cannot.
+    * directly, but CHV/BXT cannot.
     */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
       return false;
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {

From 2e731264382954beb1192cd7cc62e16e0b8e7978 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 5 Aug 2015 16:47:18 +0300
Subject: [PATCH 0999/1208] i965/fs: Indent the implementation of 32x32-bit MUL
 lowering by one level.

In order to make room for the code that will lower the MULH virtual
instruction.  Also move the hardware generation and execution type
checks into the same branch, they are going to have to be different
for MULH.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 290 ++++++++++++++-------------
 1 file changed, 147 insertions(+), 143 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 0278237101e..17f0eaba311 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3129,155 +3129,159 @@ fs_visitor::lower_integer_multiplication()
 {
    bool progress = false;
 
-   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but CHV/BXT cannot.
-    */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
-      return false;
-
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (inst->opcode != BRW_OPCODE_MUL ||
-          inst->dst.is_accumulator() ||
-          (inst->dst.type != BRW_REGISTER_TYPE_D &&
-           inst->dst.type != BRW_REGISTER_TYPE_UD))
-         continue;
-
       const fs_builder ibld(this, block, inst);
 
-      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
-       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
-       * src1 are used.
-       *
-       * If multiplying by an immediate value that fits in 16-bits, do a
-       * single MUL instruction with that value in the proper location.
-       */
-      if (inst->src[1].file == IMM &&
-          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
-         if (devinfo->gen < 7) {
-            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
-                       inst->dst.type);
-            ibld.MOV(imm, inst->src[1]);
-            ibld.MUL(inst->dst, imm, inst->src[0]);
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+          * operation directly, but CHV/BXT cannot.
+          */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
+
+         if (inst->src[1].file == IMM &&
+             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+             * src1 are used.
+             *
+             * If multiplying by an immediate value that fits in 16-bits, do a
+             * single MUL instruction with that value in the proper location.
+             */
+            if (devinfo->gen < 7) {
+               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                          inst->dst.type);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, imm, inst->src[0]);
+            } else {
+               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            }
          } else {
-            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+             * do 32-bit integer multiplication in one instruction, but instead
+             * must do a sequence (which actually calculates a 64-bit result):
+             *
+             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+             *    mov(8)  g2<1>D     acc0<8,8,1>D
+             *
+             * But on Gen > 6, the ability to use second accumulator register
+             * (acc1) for non-float data types was removed, preventing a simple
+             * implementation in SIMD16. A 16-channel result can be calculated by
+             * executing the three instructions twice in SIMD8, once with quarter
+             * control of 1Q for the first eight channels and again with 2Q for
+             * the second eight channels.
+             *
+             * Which accumulator register is implicitly accessed (by AccWrEnable
+             * for instance) is determined by the quarter control. Unfortunately
+             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+             * implicit accumulator access by an instruction with 2Q will access
+             * acc1 regardless of whether the data type is usable in acc1.
+             *
+             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+             * integer data types.
+             *
+             * Since we only want the low 32-bits of the result, we can do two
+             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+             * adjust the high result and add them (like the mach is doing):
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
+             *
+             * We avoid the shl instruction by realizing that we only want to add
+             * the low 16-bits of the "high" result to the high 16-bits of the
+             * "low" result and using proper regioning on the add:
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+             *
+             * Since it does not use the (single) accumulator register, we can
+             * schedule multi-component multiplications much better.
+             */
+
+            if (inst->conditional_mod && inst->dst.is_null()) {
+               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+            }
+            fs_reg low = inst->dst;
+            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+                        inst->dst.type);
+
+            if (devinfo->gen >= 7) {
+               fs_reg src1_0_w = inst->src[1];
+               fs_reg src1_1_w = inst->src[1];
+
+               if (inst->src[1].file == IMM) {
+                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+               } else {
+                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_0_w.stride != 0) {
+                     assert(src1_0_w.stride == 1);
+                     src1_0_w.stride = 2;
+                  }
+
+                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_1_w.stride != 0) {
+                     assert(src1_1_w.stride == 1);
+                     src1_1_w.stride = 2;
+                  }
+                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+               }
+               ibld.MUL(low, inst->src[0], src1_0_w);
+               ibld.MUL(high, inst->src[0], src1_1_w);
+            } else {
+               fs_reg src0_0_w = inst->src[0];
+               fs_reg src0_1_w = inst->src[0];
+
+               src0_0_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_0_w.stride != 0) {
+                  assert(src0_0_w.stride == 1);
+                  src0_0_w.stride = 2;
+               }
+
+               src0_1_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_1_w.stride != 0) {
+                  assert(src0_1_w.stride == 1);
+                  src0_1_w.stride = 2;
+               }
+               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+
+               ibld.MUL(low, src0_0_w, inst->src[1]);
+               ibld.MUL(high, src0_1_w, inst->src[1]);
+            }
+
+            fs_reg dst = inst->dst;
+            dst.type = BRW_REGISTER_TYPE_UW;
+            dst.subreg_offset = 2;
+            dst.stride = 2;
+
+            high.type = BRW_REGISTER_TYPE_UW;
+            high.stride = 2;
+
+            low.type = BRW_REGISTER_TYPE_UW;
+            low.subreg_offset = 2;
+            low.stride = 2;
+
+            ibld.ADD(dst, low, high);
+
+            if (inst->conditional_mod) {
+               fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+               set_condmod(inst->conditional_mod,
+                           ibld.MOV(null, inst->dst));
+            }
          }
       } else {
-         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
-          * do 32-bit integer multiplication in one instruction, but instead
-          * must do a sequence (which actually calculates a 64-bit result):
-          *
-          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
-          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
-          *    mov(8)  g2<1>D     acc0<8,8,1>D
-          *
-          * But on Gen > 6, the ability to use second accumulator register
-          * (acc1) for non-float data types was removed, preventing a simple
-          * implementation in SIMD16. A 16-channel result can be calculated by
-          * executing the three instructions twice in SIMD8, once with quarter
-          * control of 1Q for the first eight channels and again with 2Q for
-          * the second eight channels.
-          *
-          * Which accumulator register is implicitly accessed (by AccWrEnable
-          * for instance) is determined by the quarter control. Unfortunately
-          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
-          * implicit accumulator access by an instruction with 2Q will access
-          * acc1 regardless of whether the data type is usable in acc1.
-          *
-          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
-          * integer data types.
-          *
-          * Since we only want the low 32-bits of the result, we can do two
-          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
-          * adjust the high result and add them (like the mach is doing):
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
-          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
-          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
-          *
-          * We avoid the shl instruction by realizing that we only want to add
-          * the low 16-bits of the "high" result to the high 16-bits of the
-          * "low" result and using proper regioning on the add:
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
-          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
-          *
-          * Since it does not use the (single) accumulator register, we can
-          * schedule multi-component multiplications much better.
-          */
-
-         if (inst->conditional_mod && inst->dst.is_null()) {
-            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
-                               inst->dst.type);
-         }
-         fs_reg low = inst->dst;
-         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
-                     inst->dst.type);
-
-         if (devinfo->gen >= 7) {
-            fs_reg src1_0_w = inst->src[1];
-            fs_reg src1_1_w = inst->src[1];
-
-            if (inst->src[1].file == IMM) {
-               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
-               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
-            } else {
-               src1_0_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_0_w.stride != 0) {
-                  assert(src1_0_w.stride == 1);
-                  src1_0_w.stride = 2;
-               }
-
-               src1_1_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_1_w.stride != 0) {
-                  assert(src1_1_w.stride == 1);
-                  src1_1_w.stride = 2;
-               }
-               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
-            }
-            ibld.MUL(low, inst->src[0], src1_0_w);
-            ibld.MUL(high, inst->src[0], src1_1_w);
-         } else {
-            fs_reg src0_0_w = inst->src[0];
-            fs_reg src0_1_w = inst->src[0];
-
-            src0_0_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_0_w.stride != 0) {
-               assert(src0_0_w.stride == 1);
-               src0_0_w.stride = 2;
-            }
-
-            src0_1_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_1_w.stride != 0) {
-               assert(src0_1_w.stride == 1);
-               src0_1_w.stride = 2;
-            }
-            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
-
-            ibld.MUL(low, src0_0_w, inst->src[1]);
-            ibld.MUL(high, src0_1_w, inst->src[1]);
-         }
-
-         fs_reg dst = inst->dst;
-         dst.type = BRW_REGISTER_TYPE_UW;
-         dst.subreg_offset = 2;
-         dst.stride = 2;
-
-         high.type = BRW_REGISTER_TYPE_UW;
-         high.stride = 2;
-
-         low.type = BRW_REGISTER_TYPE_UW;
-         low.subreg_offset = 2;
-         low.stride = 2;
-
-         ibld.ADD(dst, low, high);
-
-         if (inst->conditional_mod) {
-            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
-            set_condmod(inst->conditional_mod,
-                        ibld.MOV(null, inst->dst));
-         }
+         continue;
       }
 
       inst->remove(block);

From 3b48a0eeda20f5cf2dbc8de5e36f8fe3461f41bf Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 6 Aug 2015 14:04:00 +0300
Subject: [PATCH 1000/1208] i965/fs: Lower the MULH virtual instruction.

Translate MULH into the MUL/MACH sequence.  This does roughly the same
thing that nir_emit_alu() used to do but we can now handle 16-wide by
taking advantage of the SIMD lowering pass.  The force_sechalf
workaround near the bottom is required because the SIMD lowering pass
will emit instructions with non-zero quarter control and we need to
make sure we avoid that on integer arithmetic instructions with
implicit accumulator access due to a known hardware bug on IVB.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 55 ++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 17f0eaba311..284528d1f84 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3280,6 +3280,55 @@ fs_visitor::lower_integer_multiplication()
                            ibld.MOV(null, inst->dst));
             }
          }
+
+      } else if (inst->opcode == SHADER_OPCODE_MULH) {
+         /* Should have been lowered to 8-wide. */
+         assert(inst->exec_size <= 8);
+         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+                                   inst->dst.type);
+         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+         if (devinfo->gen >= 8) {
+            /* Until Gen8, integer multiplies read 32-bits from one source,
+             * and 16-bits from the other, and relying on the MACH instruction
+             * to generate the high bits of the result.
+             *
+             * On Gen8, the multiply instruction does a full 32x32-bit
+             * multiply, but in order to do a 64-bit multiply we can simulate
+             * the previous behavior and then use a MACH instruction.
+             *
+             * FINISHME: Don't use source modifiers on src1.
+             */
+            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
+            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
+                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].stride *= 2;
+
+         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+                    inst->force_sechalf) {
+            /* Among other things the quarter control bits influence which
+             * accumulator register is used by the hardware for instructions
+             * that access the accumulator implicitly (e.g. MACH).  A
+             * second-half instruction would normally map to acc1, which
+             * doesn't exist on Gen7 and up (the hardware does emulate it for
+             * floating-point instructions *only* by taking advantage of the
+             * extra precision of acc0 not normally used for floating point
+             * arithmetic).
+             *
+             * HSW and up are careful enough not to try to access an
+             * accumulator register that doesn't exist, but on earlier Gen7
+             * hardware we need to make sure that the quarter control bits are
+             * zero to avoid non-deterministic behaviour and emit an extra MOV
+             * to get the result masked correctly according to the current
+             * channel enables.
+             */
+            mach->force_sechalf = false;
+            mach->force_writemask_all = true;
+            mach->dst = ibld.vgrf(inst->dst.type);
+            ibld.MOV(inst->dst, mach->dst);
+         }
       } else {
          continue;
       }
@@ -4083,6 +4132,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
                        const fs_inst *inst)
 {
    switch (inst->opcode) {
+   case SHADER_OPCODE_MULH:
+      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+       * is 8-wide on Gen7+.
+       */
+      return (devinfo->gen >= 7 ? 8 : inst->exec_size);
+
    case FS_OPCODE_FB_WRITE_LOGICAL:
       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
        * here.

From e77a4a9b1f66de383043df95aada40fd5a004913 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 4 Aug 2015 19:08:45 +0300
Subject: [PATCH 1001/1208] i965/fs: Implement nir_op_imul/umul_high in terms
 of MULH.

And get rid of another no16() call.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 33 ++----------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index e922a85573c..ee964a0f45d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -768,38 +768,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_imul_high:
-   case nir_op_umul_high: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
-      fs_inst *mul = bld.MUL(acc, op[0], op[1]);
-      bld.MACH(result, op[0], op[1]);
-
-      /* Until Gen8, integer multiplies read 32-bits from one source, and
-       * 16-bits from the other, and relying on the MACH instruction to
-       * generate the high bits of the result.
-       *
-       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
-       * but in order to do a 64x64-bit multiply we have to simulate the
-       * previous behavior and then use a MACH instruction.
-       *
-       * FINISHME: Don't use source modifiers on src1.
-       */
-      if (devinfo->gen >= 8) {
-         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-                mul->src[1].type == BRW_REGISTER_TYPE_UD);
-         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
-            mul->src[1].type = BRW_REGISTER_TYPE_W;
-            mul->src[1].stride = 2;
-         } else {
-            mul->src[1].type = BRW_REGISTER_TYPE_UW;
-            mul->src[1].stride = 2;
-         }
-      }
+   case nir_op_umul_high:
+      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
       break;
-   }
 
    case nir_op_idiv:
    case nir_op_udiv:

From 42a18ca76057621ae7d8812b29ea2245d6ff282d Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 5 Aug 2015 16:29:30 +0300
Subject: [PATCH 1002/1208] i965/fs: Fix fs_inst::regs_read() for sources in
 the ATTR file.

Otherwise it would crash on Gen8 with scalar VS.  The issue can easily
be reproduced with the following patch, but I don't see any reason why
it wouldn't be possible to end up with an ATTR argument here even
without it.

CC: mesa-stable@lists.freedesktop.org
Reviewed-by: Connor Abbott <connor.w.abbott@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 284528d1f84..1d3bdb42284 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -808,6 +808,7 @@ fs_inst::regs_read(int arg) const
    case IMM:
       return 1;
    case GRF:
+   case ATTR:
    case HW_REG:
       return DIV_ROUND_UP(components_read(arg) *
                           src[arg].component_size(exec_size),

From ee977183dcb543c919d0d70dde610cb191d5a3ea Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 4 Aug 2015 19:07:19 +0300
Subject: [PATCH 1003/1208] i965/fs: Lower arithmetic instructions with
 register regions of unsupported width.

This extends the SIMD lowering pass to enforce the hardware limitation
that no directly-addressed source may read more than 2 physical GRFs.
One can easily go over this limit when doing 64-bit arithmetic
(e.g. FP64 or extended-precision integer MULs) or SIMD32, so it's nice
to be able to just emit an instruction of the intended execution size
from the visitor and let the lowering pass deal with this restriction
transparently.

Some hardware arithmetic instructions are not handled here, including
all instructions that use the accumulator implicitly (which the SIMD
lowering pass deliberately doesn't handle), instructions with
non-per-channel sources (e.g. LINE or PLANE) and SEND-like
instructions, which need special handling most likely as virtual
opcodes.

Reviewed-by: Connor Abbott <connor.w.abbott@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 62 ++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 1d3bdb42284..ce1edc3f889 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4133,6 +4133,68 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
                        const fs_inst *inst)
 {
    switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_CSEL:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_BFREV:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_LZD:
+   case BRW_OPCODE_FBH:
+   case BRW_OPCODE_FBL:
+   case BRW_OPCODE_CBIT:
+   case BRW_OPCODE_SAD2:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS: {
+      /* According to the PRMs:
+       *  "A. In Direct Addressing mode, a source cannot span more than 2
+       *      adjacent GRF registers.
+       *   B. A destination cannot span more than 2 adjacent GRF registers."
+       *
+       * Look for the source or destination with the largest register region
+       * which is the one that is going to limit the overal execution size of
+       * the instruction due to this rule.
+       */
+      unsigned reg_count = inst->regs_written;
+
+      for (unsigned i = 0; i < inst->sources; i++)
+         reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
+
+      /* Calculate the maximum execution size of the instruction based on the
+       * factor by which it goes over the hardware limit of 2 GRFs.
+       */
+      return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
+   }
    case SHADER_OPCODE_MULH:
       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
        * is 8-wide on Gen7+.

From 115964052b25a958b2ad4ec42ae07133b2768cf9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 3 Aug 2015 15:06:42 -0600
Subject: [PATCH 1004/1208] mesa: handle no-op cases sooner in
 _mesa_[Client]ActiveTexture()

If the new texture unit is the current texture unit, we can return
before error checking.

Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/mesa/main/texstate.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 1e75e0a2ff1..9b5928c4306 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -289,23 +289,23 @@ _mesa_ActiveTexture(GLenum texture)
    GLuint k;
    GET_CURRENT_CONTEXT(ctx);
 
-   k = _mesa_max_tex_unit(ctx);
-
-   assert(k <= ARRAY_SIZE(ctx->Texture.Unit));
-
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glActiveTexture %s\n",
                   _mesa_enum_to_string(texture));
 
+   if (ctx->Texture.CurrentUnit == texUnit)
+      return;
+
+   k = _mesa_max_tex_unit(ctx);
+
+   assert(k <= ARRAY_SIZE(ctx->Texture.Unit));
+
    if (texUnit >= k) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)",
                   _mesa_enum_to_string(texture));
       return;
    }
 
-   if (ctx->Texture.CurrentUnit == texUnit)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_TEXTURE);
 
    ctx->Texture.CurrentUnit = texUnit;
@@ -327,14 +327,14 @@ _mesa_ClientActiveTexture(GLenum texture)
       _mesa_debug(ctx, "glClientActiveTexture %s\n",
                   _mesa_enum_to_string(texture));
 
+   if (ctx->Array.ActiveTexture == texUnit)
+      return;
+
    if (texUnit >= ctx->Const.MaxTextureCoordUnits) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)");
       return;
    }
 
-   if (ctx->Array.ActiveTexture == texUnit)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_ARRAY);
    ctx->Array.ActiveTexture = texUnit;
 }

From c63e8b1193fd380e999b8ef258a20e57884820f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Jul 2015 15:43:09 +0200
Subject: [PATCH 1005/1208] vbo: pass the stream from
 DrawTransformFeedbackStream to drivers

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/drivers/dri/i965/brw_draw.c          |  3 ++-
 src/mesa/drivers/dri/i965/brw_draw.h          |  3 ++-
 .../drivers/dri/i965/brw_meta_fast_clear.c    |  2 +-
 .../drivers/dri/i965/brw_primitive_restart.c  |  3 ++-
 src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c  |  7 +++++--
 src/mesa/state_tracker/st_cb_rasterpos.c      |  2 +-
 src/mesa/state_tracker/st_draw.c              |  1 +
 src/mesa/state_tracker/st_draw.h              |  2 ++
 src/mesa/state_tracker/st_draw_feedback.c     |  1 +
 src/mesa/tnl/t_draw.c                         |  1 +
 src/mesa/tnl/tnl.h                            |  3 ++-
 src/mesa/vbo/vbo.h                            |  3 ++-
 src/mesa/vbo/vbo_exec_array.c                 | 20 +++++++++----------
 src/mesa/vbo/vbo_exec_draw.c                  |  2 +-
 src/mesa/vbo/vbo_primitive_restart.c          |  4 ++--
 src/mesa/vbo/vbo_rebase.c                     |  2 +-
 src/mesa/vbo/vbo_save_draw.c                  |  2 +-
 src/mesa/vbo/vbo_split_copy.c                 |  2 +-
 src/mesa/vbo/vbo_split_inplace.c              |  2 +-
 19 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 9113d0f5a8e..e092ef4a7c6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -559,6 +559,7 @@ brw_draw_prims(struct gl_context *ctx,
                GLuint min_index,
                GLuint max_index,
                struct gl_transform_feedback_object *unused_tfb_object,
+               unsigned stream,
                struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
@@ -584,7 +585,7 @@ brw_draw_prims(struct gl_context *ctx,
       _swsetup_Wakeup(ctx);
       _tnl_wakeup(ctx);
       _tnl_draw_prims(ctx, prims, nr_prims, ib,
-                      index_bounds_valid, min_index, max_index, NULL, NULL);
+                      index_bounds_valid, min_index, max_index, NULL, 0, NULL);
       return;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index fc83dcdd0bb..f994726f5b6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -34,7 +34,7 @@
 struct brw_context;
 
 
-void brw_draw_prims( struct gl_context *ctx,
+void brw_draw_prims(struct gl_context *ctx,
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -42,6 +42,7 @@ void brw_draw_prims( struct gl_context *ctx,
 		     GLuint min_index,
 		     GLuint max_index,
 		     struct gl_transform_feedback_object *unused_tfb_object,
+                     unsigned stream,
 		     struct gl_buffer_object *indirect );
 
 void brw_draw_init( struct brw_context *brw );
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index e7e8df5bded..f5ecbb54989 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -200,7 +200,7 @@ brw_draw_rectlist(struct gl_context *ctx, struct rect *rect, int num_instances)
 
    brw_draw_prims(ctx, &prim, 1, NULL,
                   GL_TRUE, start, start + count - 1,
-                  NULL, NULL);
+                  NULL, 0, NULL);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
index 2c7a7e8b8dd..6ed79d7cb75 100644
--- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
+++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
@@ -161,7 +161,8 @@ brw_handle_primitive_restart(struct gl_context *ctx,
       /* Cut index should work for primitive restart, so use it
        */
       brw->prim_restart.enable_cut_index = true;
-      brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, indirect);
+      brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, 0,
+                     indirect);
       brw->prim_restart.enable_cut_index = false;
    } else {
       /* Not all the primitive draw modes are supported by the cut index,
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index c85acec1268..a3fbad07e66 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -223,6 +223,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 		      GLboolean index_bounds_valid,
 		      GLuint min_index, GLuint max_index,
 		      struct gl_transform_feedback_object *tfb_vertcount,
+                      unsigned stream,
 		      struct gl_buffer_object *indirect);
 
 static GLboolean
@@ -455,6 +456,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 		      GLboolean index_bounds_valid,
 		      GLuint min_index, GLuint max_index,
 		      struct gl_transform_feedback_object *tfb_vertcount,
+                      unsigned stream,
 		      struct gl_buffer_object *indirect)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
@@ -492,6 +494,7 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx,
 			    GLboolean index_bounds_valid,
 			    GLuint min_index, GLuint max_index,
 			    struct gl_transform_feedback_object *tfb_vertcount,
+                            unsigned stream,
 			    struct gl_buffer_object *indirect)
 {
 	struct nouveau_context *nctx = to_nouveau_context(ctx);
@@ -501,12 +504,12 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx,
 	if (nctx->fallback == HWTNL)
 		TAG(vbo_render_prims)(ctx, prims, nr_prims, ib,
 				      index_bounds_valid, min_index, max_index,
-				      tfb_vertcount, indirect);
+				      tfb_vertcount, stream, indirect);
 
 	if (nctx->fallback == SWTNL)
 		_tnl_draw_prims(ctx, prims, nr_prims, ib,
 				index_bounds_valid, min_index, max_index,
-				tfb_vertcount, indirect);
+				tfb_vertcount, stream, indirect);
 }
 
 void
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 272cbb91d52..b9997dacfd2 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -254,7 +254,7 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4])
     * st_feedback_draw_vbo doesn't check for that flag. */
    ctx->Array._DrawArrays = rs->arrays;
    st_feedback_draw_vbo(ctx, &rs->prim, 1, NULL, GL_TRUE, 0, 1,
-                        NULL, NULL);
+                        NULL, 0, NULL);
    ctx->Array._DrawArrays = saved_arrays;
 
    /* restore draw's rasterization stage depending on rendermode */
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index bae8096e080..a29971c484a 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -184,6 +184,7 @@ st_draw_vbo(struct gl_context *ctx,
             GLuint min_index,
             GLuint max_index,
             struct gl_transform_feedback_object *tfb_vertcount,
+            unsigned stream,
             struct gl_buffer_object *indirect)
 {
    struct st_context *st = st_context(ctx);
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 780d4bde713..a973c8a4a5d 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -56,6 +56,7 @@ st_draw_vbo(struct gl_context *ctx,
             GLuint min_index,
             GLuint max_index,
             struct gl_transform_feedback_object *tfb_vertcount,
+            unsigned stream,
             struct gl_buffer_object *indirect);
 
 extern void
@@ -67,6 +68,7 @@ st_feedback_draw_vbo(struct gl_context *ctx,
                      GLuint min_index,
                      GLuint max_index,
                      struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
                      struct gl_buffer_object *indirect);
 
 /**
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index 177f6b5aefa..88c10a8f150 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -117,6 +117,7 @@ st_feedback_draw_vbo(struct gl_context *ctx,
                      GLuint min_index,
                      GLuint max_index,
                      struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
                      struct gl_buffer_object *indirect)
 {
    struct st_context *st = st_context(ctx);
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index 632acb55f8b..c130ab3f93d 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -425,6 +425,7 @@ void _tnl_draw_prims(struct gl_context *ctx,
 			 GLuint min_index,
 			 GLuint max_index,
 			 struct gl_transform_feedback_object *tfb_vertcount,
+                         unsigned stream,
 			 struct gl_buffer_object *indirect)
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h
index 8c59ff9e58f..5a9938e7afb 100644
--- a/src/mesa/tnl/tnl.h
+++ b/src/mesa/tnl/tnl.h
@@ -76,7 +76,7 @@ struct _mesa_prim;
 struct _mesa_index_buffer;
 
 void
-_tnl_draw_prims( struct gl_context *ctx,
+_tnl_draw_prims(struct gl_context *ctx,
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -84,6 +84,7 @@ _tnl_draw_prims( struct gl_context *ctx,
 		     GLuint min_index,
 		     GLuint max_index,
 		     struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
 		     struct gl_buffer_object *indirect );
 
 extern void
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 54dee6c464f..2aaff5df019 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -97,7 +97,8 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx,
 			       GLuint min_index,
 			       GLuint max_index,
 			       struct gl_transform_feedback_object *tfb_vertcount,
-			       struct gl_buffer_object *indirect );
+                               unsigned stream,
+			       struct gl_buffer_object *indirect);
 
 
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index ec13449c751..34d2c1d3d6b 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -633,7 +633,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
          /* draw one or two prims */
          check_buffers_are_unmapped(exec->array.inputs);
          vbo->draw_prims(ctx, prim, primCount, NULL,
-                         GL_TRUE, start, start + count - 1, NULL, NULL);
+                         GL_TRUE, start, start + count - 1, NULL, 0, NULL);
       }
    }
    else {
@@ -644,7 +644,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
       check_buffers_are_unmapped(exec->array.inputs);
       vbo->draw_prims(ctx, prim, 1, NULL,
                       GL_TRUE, start, start + count - 1,
-                      NULL, NULL);
+                      NULL, 0, NULL);
    }
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
@@ -990,7 +990,7 @@ vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode,
 
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1, &ib,
-                   index_bounds_valid, start, end, NULL, NULL);
+                   index_bounds_valid, start, end, NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
       _mesa_flush(ctx);
@@ -1350,7 +1350,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
 
       check_buffers_are_unmapped(exec->array.inputs);
       vbo->draw_prims(ctx, prim, primcount, &ib,
-                      false, ~0, ~0, NULL, NULL);
+                      false, ~0, ~0, NULL, 0, NULL);
    } else {
       /* render one prim at a time */
       for (i = 0; i < primcount; i++) {
@@ -1379,7 +1379,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
 
          check_buffers_are_unmapped(exec->array.inputs);
          vbo->draw_prims(ctx, prim, 1, &ib,
-                         false, ~0, ~0, NULL, NULL);
+                         false, ~0, ~0, NULL, 0, NULL);
       }
    }
 
@@ -1464,7 +1464,7 @@ vbo_draw_transform_feedback(struct gl_context *ctx, GLenum mode,
 
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1, NULL,
-                   GL_TRUE, 0, 0, obj, NULL);
+                   GL_TRUE, 0, 0, obj, stream, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
       _mesa_flush(ctx);
@@ -1563,7 +1563,7 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1,
                    NULL, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
@@ -1603,7 +1603,7 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, primcount,
                    NULL, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    free(prim);
@@ -1640,7 +1640,7 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1,
                    &ib, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
@@ -1689,7 +1689,7 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, primcount,
                    &ib, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    free(prim);
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 37b53a8309d..2bfb0c32b73 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -412,7 +412,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
 				       GL_TRUE,
 				       0,
 				       exec->vtx.vert_count - 1,
-				       NULL, NULL);
+				       NULL, 0, NULL);
 
 	 /* If using a real VBO, get new storage -- unless asked not to.
           */
diff --git a/src/mesa/vbo/vbo_primitive_restart.c b/src/mesa/vbo/vbo_primitive_restart.c
index dafc4fd2a9a..0662c5cd4ef 100644
--- a/src/mesa/vbo/vbo_primitive_restart.c
+++ b/src/mesa/vbo/vbo_primitive_restart.c
@@ -251,11 +251,11 @@ vbo_sw_primitive_restart(struct gl_context *ctx,
                 (temp_prim.count == sub_prim->count)) {
                draw_prims_func(ctx, &temp_prim, 1, ib,
                                GL_TRUE, sub_prim->min_index, sub_prim->max_index,
-                               NULL, NULL);
+                               NULL, 0, NULL);
             } else {
                draw_prims_func(ctx, &temp_prim, 1, ib,
                                GL_FALSE, -1, -1,
-                               NULL, NULL);
+                               NULL, 0, NULL);
             }
          }
          if (sub_end_index >= end_index) {
diff --git a/src/mesa/vbo/vbo_rebase.c b/src/mesa/vbo/vbo_rebase.c
index c3c4b64e65c..24c04ca7e6a 100644
--- a/src/mesa/vbo/vbo_rebase.c
+++ b/src/mesa/vbo/vbo_rebase.c
@@ -258,7 +258,7 @@ void vbo_rebase_prims( struct gl_context *ctx,
 	 GL_TRUE,
 	 0, 
 	 max_index - min_index,
-	 NULL, NULL );
+	 NULL, 0, NULL );
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index de744e0c763..b1fd6892026 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -314,7 +314,7 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data)
                                       GL_TRUE,
                                       0,    /* Node is a VBO, so this is ok */
                                       node->count - 1,
-                                      NULL, NULL);
+                                      NULL, 0, NULL);
       }
    }
 
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index 7b1e20b18d2..cb27ef961ab 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -203,7 +203,7 @@ flush( struct copy_context *copy )
 	       GL_TRUE,
 	       0,
 	       copy->dstbuf_nr - 1,
-	       NULL, NULL );
+	       NULL, 0, NULL );
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;
diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c
index 5887b74d829..cff4bcd30ff 100644
--- a/src/mesa/vbo/vbo_split_inplace.c
+++ b/src/mesa/vbo/vbo_split_inplace.c
@@ -94,7 +94,7 @@ static void flush_vertex( struct split_context *split )
 	       !split->ib,
 	       split->min_index,
 	       split->max_index,
-	       NULL, NULL);
+	       NULL, 0, NULL);
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;

From 7d3939f0de7dcb5e68eca638d5832c683a124775 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Jul 2015 16:11:50 +0200
Subject: [PATCH 1006/1208] mesa: save which transform feedback buffer is
 associated with which stream

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/link_varyings.cpp | 1 +
 src/mesa/main/mtypes.h     | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 1c52ff35f44..f7a7b8c4c5b 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -572,6 +572,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
       ++info->NumOutputs;
       info->BufferStride[buffer] += output_size;
+      info->BufferStream[buffer] = this->stream_id;
       num_components -= output_size;
       location++;
       location_frac = 0;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index a252bf46606..4e00fb6645d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1868,6 +1868,11 @@ struct gl_transform_feedback_info
     * multiple transform feedback outputs in the same buffer.
     */
    unsigned BufferStride[MAX_FEEDBACK_BUFFERS];
+
+   /**
+    * Which transform feedback stream this buffer binding is associated with.
+    */
+   unsigned BufferStream[MAX_FEEDBACK_BUFFERS];
 };
 
 

From b2eb13d602f71f19216284a584834cdaa2550eb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Jul 2015 16:14:03 +0200
Subject: [PATCH 1007/1208] st/mesa: implement DrawTransformFeedbackStream

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_cb_xformfb.c | 61 ++++++++++++++------------
 src/mesa/state_tracker/st_cb_xformfb.h |  2 +-
 src/mesa/state_tracker/st_draw.c       |  2 +-
 3 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 0708e68ec7b..0c01cd5ab78 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -54,9 +54,9 @@ struct st_transform_feedback_object {
    struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
 
    /* This encapsulates the count that can be used as a source for draw_vbo.
-    * It contains a stream output target from the last call of
-    * EndTransformFeedback. */
-   struct pipe_stream_output_target *draw_count;
+    * It contains stream output targets from the last call of
+    * EndTransformFeedback for each stream. */
+   struct pipe_stream_output_target *draw_count[MAX_VERTEX_STREAMS];
 };
 
 static inline struct st_transform_feedback_object *
@@ -88,7 +88,8 @@ st_delete_transform_feedback(struct gl_context *ctx,
          st_transform_feedback_object(obj);
    unsigned i;
 
-   pipe_so_target_reference(&sobj->draw_count, NULL);
+   for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++)
+      pipe_so_target_reference(&sobj->draw_count[i], NULL);
 
    /* Unreference targets. */
    for (i = 0; i < sobj->num_targets; i++) {
@@ -123,9 +124,12 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
       struct st_buffer_object *bo = st_buffer_object(sobj->base.Buffers[i]);
 
       if (bo && bo->buffer) {
+         unsigned stream =
+            obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+
          /* Check whether we need to recreate the target. */
          if (!sobj->targets[i] ||
-             sobj->targets[i] == sobj->draw_count ||
+             sobj->targets[i] == sobj->draw_count[stream] ||
              sobj->targets[i]->buffer != bo->buffer ||
              sobj->targets[i]->buffer_offset != sobj->base.Offset[i] ||
              sobj->targets[i]->buffer_size != sobj->base.Size[i]) {
@@ -178,23 +182,6 @@ st_resume_transform_feedback(struct gl_context *ctx,
 }
 
 
-static struct pipe_stream_output_target *
-st_transform_feedback_get_draw_target(struct gl_transform_feedback_object *obj)
-{
-   struct st_transform_feedback_object *sobj =
-         st_transform_feedback_object(obj);
-   unsigned i;
-
-   for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
-      if (sobj->targets[i]) {
-         return sobj->targets[i];
-      }
-   }
-
-   return NULL;
-}
-
-
 static void
 st_end_transform_feedback(struct gl_context *ctx,
                           struct gl_transform_feedback_object *obj)
@@ -202,25 +189,41 @@ st_end_transform_feedback(struct gl_context *ctx,
    struct st_context *st = st_context(ctx);
    struct st_transform_feedback_object *sobj =
          st_transform_feedback_object(obj);
+   unsigned i;
 
    cso_set_stream_outputs(st->cso_context, 0, NULL, NULL);
 
-   pipe_so_target_reference(&sobj->draw_count,
-                            st_transform_feedback_get_draw_target(obj));
+   /* The next call to glDrawTransformFeedbackStream should use the vertex
+    * count from the last call to glEndTransformFeedback.
+    * Therefore, save the targets for each stream.
+    *
+    * NULL means the vertex counter is 0 (initial state).
+    */
+   for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++)
+      pipe_so_target_reference(&sobj->draw_count[i], NULL);
+
+   for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
+      unsigned stream =
+         obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+
+      /* Is it not bound or already set for this stream? */
+      if (!sobj->targets[i] || sobj->draw_count[stream])
+         continue;
+
+      pipe_so_target_reference(&sobj->draw_count[stream], sobj->targets[i]);
+   }
 }
 
 
 bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
-                                struct pipe_draw_info *out)
+                                unsigned stream, struct pipe_draw_info *out)
 {
    struct st_transform_feedback_object *sobj =
          st_transform_feedback_object(obj);
 
-   if (sobj->draw_count == NULL)
-      return false;
-   out->count_from_stream_output = sobj->draw_count;
-   return true;
+   out->count_from_stream_output = sobj->draw_count[stream];
+   return out->count_from_stream_output != NULL;
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_xformfb.h b/src/mesa/state_tracker/st_cb_xformfb.h
index fb50deddfae..444d11842c5 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.h
+++ b/src/mesa/state_tracker/st_cb_xformfb.h
@@ -40,7 +40,7 @@ st_init_xformfb_functions(struct dd_function_table *functions);
 
 extern bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
-                                struct pipe_draw_info *out);
+                                unsigned stream, struct pipe_draw_info *out);
 
 
 #endif /* ST_CB_XFORMFB_H */
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index a29971c484a..957fcfd410e 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -243,7 +243,7 @@ st_draw_vbo(struct gl_context *ctx,
       /* Transform feedback drawing is always non-indexed. */
       /* Set info.count_from_stream_output. */
       if (tfb_vertcount) {
-         if (st_transform_feedback_draw_init(tfb_vertcount, &info) == false)
+         if (!st_transform_feedback_draw_init(tfb_vertcount, stream, &info))
             return;
       }
    }

From 57245cce52d544c61f03fc966850f0f94e8118d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 31 Jul 2015 02:39:02 +0200
Subject: [PATCH 1008/1208] gallium/radeon: suspend timer queries between IBs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we are measuring the time spent in a draw call, an unexpected flush
can distort the result.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/r600/r600_hw_context.c    |  3 +-
 src/gallium/drivers/radeon/r600_pipe_common.c |  8 ++-
 src/gallium/drivers/radeon/r600_pipe_common.h |  9 ++-
 src/gallium/drivers/radeon/r600_query.c       | 68 ++++++++++++++-----
 src/gallium/drivers/radeonsi/si_hw_context.c  |  3 +-
 5 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 8eb0c6806b9..9155707f866 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -68,7 +68,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	}
 
 	/* Count in queries_suspend. */
-	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend;
+	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend +
+		  ctx->b.num_cs_dw_timer_queries_suspend;
 
 	/* Count in streamout_end at the end of CS. */
 	if (ctx->b.streamout.begin_emitted) {
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 75e820144e3..c940f6d35c2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -132,10 +132,11 @@ void r600_preflush_suspend_features(struct r600_common_context *ctx)
 	}
 
 	/* suspend queries */
-	ctx->nontimer_queries_suspended = false;
+	ctx->queries_suspended_for_flush = false;
 	if (ctx->num_cs_dw_nontimer_queries_suspend) {
 		r600_suspend_nontimer_queries(ctx);
-		ctx->nontimer_queries_suspended = true;
+		r600_suspend_timer_queries(ctx);
+		ctx->queries_suspended_for_flush = true;
 	}
 
 	ctx->streamout.suspended = false;
@@ -153,8 +154,9 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	}
 
 	/* resume queries */
-	if (ctx->nontimer_queries_suspended) {
+	if (ctx->queries_suspended_for_flush) {
 		r600_resume_nontimer_queries(ctx);
+		r600_resume_timer_queries(ctx);
 	}
 
 	/* Re-enable render condition. */
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index e2a60c59c82..fbd2a21da17 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -384,11 +384,14 @@ struct r600_common_context {
 	int				num_occlusion_queries;
 	/* Keep track of non-timer queries, because they should be suspended
 	 * during context flushing.
-	 * The timer queries (TIME_ELAPSED) shouldn't be suspended. */
+	 * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits,
+	 * but they should be suspended between IBs. */
 	struct list_head		active_nontimer_queries;
+	struct list_head		active_timer_queries;
 	unsigned			num_cs_dw_nontimer_queries_suspend;
+	unsigned			num_cs_dw_timer_queries_suspend;
 	/* If queries have been suspended. */
-	bool				nontimer_queries_suspended;
+	bool				queries_suspended_for_flush;
 	/* Additional hardware info. */
 	unsigned			backend_mask;
 	unsigned			max_db; /* for OQ */
@@ -503,6 +506,8 @@ unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
 void r600_query_init(struct r600_common_context *rctx);
 void r600_suspend_nontimer_queries(struct r600_common_context *ctx);
 void r600_resume_nontimer_queries(struct r600_common_context *ctx);
+void r600_suspend_timer_queries(struct r600_common_context *ctx);
+void r600_resume_timer_queries(struct r600_common_context *ctx);
 void r600_query_init_backend_mask(struct r600_common_context *ctx);
 
 /* r600_streamout.c */
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 909d5029bfa..7fc22537f43 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -226,9 +226,10 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_MIN);
 
-	if (!r600_is_timer_query(query->type)) {
+	if (r600_is_timer_query(query->type))
+		ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw;
+	else
 		ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
-	}
 }
 
 static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
@@ -290,9 +291,10 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	query->buffer.results_end += query->result_size;
 
 	if (r600_query_needs_begin(query->type)) {
-		if (!r600_is_timer_query(query->type)) {
+		if (r600_is_timer_query(query->type))
+			ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw;
+		else
 			ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw;
-		}
 	}
 
 	r600_update_occlusion_query_state(ctx, query->type, -1);
@@ -503,9 +505,10 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 
 	r600_emit_query_begin(rctx, rquery);
 
-	if (!r600_is_timer_query(rquery->type)) {
+	if (r600_is_timer_query(rquery->type))
+		LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries);
+	else
 		LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries);
-	}
    return true;
 }
 
@@ -561,9 +564,8 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 
 	r600_emit_query_end(rctx, rquery);
 
-	if (r600_query_needs_begin(rquery->type) && !r600_is_timer_query(rquery->type)) {
+	if (r600_query_needs_begin(rquery->type))
 		LIST_DELINIT(&rquery->list);
-	}
 }
 
 static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
@@ -838,22 +840,37 @@ static void r600_render_condition(struct pipe_context *ctx,
 	}
 }
 
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
+static void r600_suspend_queries(struct r600_common_context *ctx,
+				 struct list_head *query_list,
+				 unsigned *num_cs_dw_queries_suspend)
 {
 	struct r600_query *query;
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		r600_emit_query_end(ctx, query);
 	}
-	assert(ctx->num_cs_dw_nontimer_queries_suspend == 0);
+	assert(*num_cs_dw_queries_suspend == 0);
 }
 
-static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx)
+void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
+{
+	r600_suspend_queries(ctx, &ctx->active_nontimer_queries,
+			     &ctx->num_cs_dw_nontimer_queries_suspend);
+}
+
+void r600_suspend_timer_queries(struct r600_common_context *ctx)
+{
+	r600_suspend_queries(ctx, &ctx->active_timer_queries,
+			     &ctx->num_cs_dw_timer_queries_suspend);
+}
+
+static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
+						    struct list_head *query_list)
 {
 	struct r600_query *query;
 	unsigned num_dw = 0;
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		/* begin + end */
 		num_dw += query->num_cs_dw * 2;
 
@@ -872,21 +889,35 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *
 	return num_dw;
 }
 
-void r600_resume_nontimer_queries(struct r600_common_context *ctx)
+static void r600_resume_queries(struct r600_common_context *ctx,
+				struct list_head *query_list,
+				unsigned *num_cs_dw_queries_suspend)
 {
 	struct r600_query *query;
+	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list);
 
-	assert(ctx->num_cs_dw_nontimer_queries_suspend == 0);
+	assert(*num_cs_dw_queries_suspend == 0);
 
 	/* Check CS space here. Resuming must not be interrupted by flushes. */
-	ctx->need_gfx_cs_space(&ctx->b,
-			       r600_queries_num_cs_dw_for_resuming(ctx), TRUE);
+	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE);
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		r600_emit_query_begin(ctx, query);
 	}
 }
 
+void r600_resume_nontimer_queries(struct r600_common_context *ctx)
+{
+	r600_resume_queries(ctx, &ctx->active_nontimer_queries,
+			    &ctx->num_cs_dw_nontimer_queries_suspend);
+}
+
+void r600_resume_timer_queries(struct r600_common_context *ctx)
+{
+	r600_resume_queries(ctx, &ctx->active_timer_queries,
+			    &ctx->num_cs_dw_timer_queries_suspend);
+}
+
 /* Get backends mask */
 void r600_query_init_backend_mask(struct r600_common_context *ctx)
 {
@@ -979,4 +1010,5 @@ void r600_query_init(struct r600_common_context *rctx)
 	    rctx->b.render_condition = r600_render_condition;
 
 	LIST_INITHEAD(&rctx->active_nontimer_queries);
+	LIST_INITHEAD(&rctx->active_timer_queries);
 }
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 581b130caec..b80245c9785 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -63,7 +63,8 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 	}
 
 	/* Count in queries_suspend. */
-	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend;
+	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend +
+		  ctx->b.num_cs_dw_timer_queries_suspend;
 
 	/* Count in streamout_end at the end of CS. */
 	if (ctx->b.streamout.begin_emitted) {

From d587742650c262dea8007474b9956fd65472f8b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 14:19:41 +0200
Subject: [PATCH 1009/1208] gallium/radeon: allow the winsys to choose the IB
 size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Picked from the amdgpu branch.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/r300/r300_blit.c          | 2 +-
 src/gallium/drivers/r300/r300_cs.h            | 2 +-
 src/gallium/drivers/r300/r300_render.c        | 2 +-
 src/gallium/drivers/r600/r600_hw_context.c    | 2 +-
 src/gallium/drivers/r600/r600_pipe.h          | 4 ++--
 src/gallium/drivers/radeon/r600_cs.h          | 8 ++++----
 src/gallium/drivers/radeon/r600_pipe_common.c | 4 ++--
 src/gallium/drivers/radeon/radeon_winsys.h    | 3 +--
 src/gallium/drivers/radeonsi/si_hw_context.c  | 2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 5 +++--
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 2 +-
 11 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index baf05cea965..6ea8f24cc14 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -382,7 +382,7 @@ static void r300_clear(struct pipe_context* pipe,
             r300_get_num_cs_end_dwords(r300);
 
         /* Reserve CS space. */
-        if (dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) {
+        if (dwords > (r300->cs->max_dw - r300->cs->cdw)) {
             r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
         }
 
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 37f9641ab3e..fc150542d4b 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -46,7 +46,7 @@
 #ifdef DEBUG
 
 #define BEGIN_CS(size) do { \
-    assert(size <= (RADEON_MAX_CMDBUF_DWORDS - cs_copy->cdw)); \
+    assert(size <= (cs_copy->max_dw - cs_copy->cdw)); \
     cs_count = size; \
 } while (0)
 
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 9dffa4935a3..0487b11e775 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -215,7 +215,7 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
     cs_dwords += r300_get_num_cs_end_dwords(r300);
 
     /* Reserve requested CS space. */
-    if (cs_dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) {
+    if (cs_dwords > (r300->cs->max_dw - r300->cs->cdw)) {
         r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
         flushed = TRUE;
     }
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 9155707f866..ad11e760781 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -93,7 +93,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	num_dw += 10;
 
 	/* Flush if there's not enough space. */
-	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
 		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 991fda894ac..bcb9390f331 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -493,7 +493,7 @@ struct r600_context {
 static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
 					    struct r600_command_buffer *cb)
 {
-	assert(cs->cdw + cb->num_dw <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw + cb->num_dw <= cs->max_dw);
 	memcpy(cs->buf + cs->cdw, cb->buf, 4 * cb->num_dw);
 	cs->cdw += cb->num_dw;
 }
@@ -826,7 +826,7 @@ static inline void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *c
 static inline void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CTL_CONST, num, 0);
 	cs->buf[cs->cdw++] = (reg - R600_CTL_CONST_OFFSET) >> 2;
 }
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index d6d4c88fbf6..03a04b754d6 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -77,7 +77,7 @@ static inline void r600_emit_reloc(struct r600_common_context *rctx,
 static inline void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
 	radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2);
 }
@@ -91,7 +91,7 @@ static inline void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned r
 static inline void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
 	radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2);
 }
@@ -105,7 +105,7 @@ static inline void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned
 static inline void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
@@ -119,7 +119,7 @@ static inline void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, un
 static inline void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index c940f6d35c2..79b4b544004 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -108,9 +108,9 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
 {
 	/* Flush if there's not enough space. */
-	if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) {
+	if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {
 		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS);
+		assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);
 	}
 }
 
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index dabee534fe3..c79a0823ad0 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -42,8 +42,6 @@
 
 #include "pipebuffer/pb_buffer.h"
 
-#define RADEON_MAX_CMDBUF_DWORDS (16 * 1024)
-
 #define RADEON_FLUSH_ASYNC		(1 << 0)
 #define RADEON_FLUSH_KEEP_TILING_FLAGS	(1 << 1) /* needs DRM 2.12.0 */
 #define RADEON_FLUSH_COMPUTE		(1 << 2)
@@ -196,6 +194,7 @@ struct radeon_winsys_cs_handle;
 
 struct radeon_winsys_cs {
     unsigned                    cdw;  /* Number of used dwords. */
+    unsigned                    max_dw; /* Maximum number of dwords. */
     uint32_t                    *buf; /* The command buffer. */
     enum ring_type              ring_type;
 };
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index b80245c9785..3203ceb61fd 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -86,7 +86,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 #endif
 
 	/* Flush if there's not enough space. */
-	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
 		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index d618f5904fd..e363cc042dd 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -188,6 +188,7 @@ radeon_drm_cs_create(struct radeon_winsys *rws,
     cs->cst = &cs->csc2;
     cs->base.buf = cs->csc->buf;
     cs->base.ring_type = ring_type;
+    cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
@@ -467,7 +468,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
         break;
     }
 
-    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
+    if (rcs->cdw > rcs->max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
@@ -486,7 +487,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
     cs->cst->cs_trace_id = cs_trace_id;
 
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
-    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
+    if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
         unsigned i, crelocs;
 
         crelocs = cs->cst->crelocs;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index 658057d75b3..6ceb8e98ee7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -30,7 +30,7 @@
 #include "radeon_drm_bo.h"
 
 struct radeon_cs_context {
-    uint32_t                    buf[RADEON_MAX_CMDBUF_DWORDS];
+    uint32_t                    buf[16 * 1024];
 
     int                         fd;
     struct drm_radeon_cs        cs;

From 567394112d904096abff1d994ab952f475dfb444 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 31 Jul 2015 11:45:13 +0200
Subject: [PATCH 1010/1208] radeon/winsys: increase the IB size for VM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Luckily, there is a kernel query, so use the size from that.
It currently returns 256KB. It can be increased in the kernel.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c     |  8 +++++++-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h     |  2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 12 ++++++++----
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.h |  1 +
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index e363cc042dd..45eef294369 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -85,17 +85,22 @@ static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
 {
     int i;
 
+    csc->buf = MALLOC(ws->ib_max_size);
+    if (!csc->buf)
+        return FALSE;
     csc->fd = ws->fd;
     csc->nrelocs = 512;
     csc->relocs_bo = (struct radeon_bo**)
                      CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
     if (!csc->relocs_bo) {
+        FREE(csc->buf);
         return FALSE;
     }
 
     csc->relocs = (struct drm_radeon_cs_reloc*)
                   CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
     if (!csc->relocs) {
+        FREE(csc->buf);
         FREE(csc->relocs_bo);
         return FALSE;
     }
@@ -148,6 +153,7 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
     radeon_cs_context_cleanup(csc);
     FREE(csc->relocs_bo);
     FREE(csc->relocs);
+    FREE(csc->buf);
 }
 
 
@@ -188,7 +194,7 @@ radeon_drm_cs_create(struct radeon_winsys *rws,
     cs->cst = &cs->csc2;
     cs->base.buf = cs->csc->buf;
     cs->base.ring_type = ring_type;
-    cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
+    cs->base.max_dw = ws->ib_max_size / 4;
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index 6ceb8e98ee7..ab154945880 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -30,7 +30,7 @@
 #include "radeon_drm_bo.h"
 
 struct radeon_cs_context {
-    uint32_t                    buf[16 * 1024];
+    uint32_t                    *buf;
 
     int                         fd;
     struct drm_radeon_cs        cs;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index f7784fb795e..b70bbaa54a3 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -395,16 +395,20 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         }
 
         ws->info.r600_virtual_address = FALSE;
-        if (ws->info.drm_minor >= 13) {
-            uint32_t ib_vm_max_size;
+        ws->ib_max_size = 64 * 1024;
 
+        if (ws->info.drm_minor >= 13) {
             ws->info.r600_virtual_address = TRUE;
             if (!radeon_get_drm_value(ws->fd, RADEON_INFO_VA_START, NULL,
                                       &ws->va_start))
                 ws->info.r600_virtual_address = FALSE;
-            if (!radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
-                                      &ib_vm_max_size))
+
+            if (radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
+                                     &ws->ib_max_size))
+                ws->ib_max_size *= 4; /* the kernel returns the size in dwords */
+            else
                 ws->info.r600_virtual_address = FALSE;
+
             radeon_get_drm_value(ws->fd, RADEON_INFO_VA_UNMAP_WORKING, NULL,
                                  &ws->va_unmap_working);
         }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 308b5bd976d..c1a8d6ae564 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -73,6 +73,7 @@ struct radeon_drm_winsys {
 
     enum radeon_generation gen;
     struct radeon_info info;
+    uint32_t ib_max_size;
     uint32_t va_start;
     uint32_t va_unmap_working;
     uint32_t accel_working2;

From cc59c78b0aa202f1a76a8708ec318e19a8502c9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 31 Jul 2015 17:26:08 +0200
Subject: [PATCH 1011/1208] gallium/radeon: always use the llvm. prefix in
 intrinsic names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Acked-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
---
 .../drivers/radeon/radeon_setup_tgsi_llvm.c   | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 5c08cf5d339..56694700a47 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -767,7 +767,7 @@ static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base
 		coords[i] = LLVMBuildExtractElement(builder, v,
 						    lp_build_const_int32(gallivm, i), "");
 
-	coords[2] = lp_build_intrinsic(builder, "fabs",
+	coords[2] = lp_build_intrinsic(builder, "llvm.fabs.f32",
 			type, &coords[2], 1, LLVMReadNoneAttribute);
 	coords[2] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_RCP, coords[2]);
 
@@ -1176,8 +1176,18 @@ static void emit_frac(
 		struct lp_build_emit_data * emit_data)
 {
 	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	char *intr;
 
-	LLVMValueRef floor = lp_build_intrinsic(builder, "floor", emit_data->dst_type,
+	if (emit_data->info->opcode == TGSI_OPCODE_FRC)
+		intr = "llvm.floor.f32";
+	else if (emit_data->info->opcode == TGSI_OPCODE_DFRAC)
+		intr = "llvm.floor.f64";
+	else {
+		assert(0);
+		return;
+	}
+
+	LLVMValueRef floor = lp_build_intrinsic(builder, intr, emit_data->dst_type,
 						&emit_data->args[0], 1,
 						LLVMReadNoneAttribute);
 	emit_data->output[emit_data->chan] = LLVMBuildFSub(builder,
@@ -1425,7 +1435,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	lp_set_default_actions(bld_base);
 
 	bld_base->op_actions[TGSI_OPCODE_ABS].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "fabs";
+	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.fabs.f32";
 	bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and;
 	bld_base->op_actions[TGSI_OPCODE_ARL].emit = emit_arl;
 	bld_base->op_actions[TGSI_OPCODE_BFI].emit = emit_bfi;
@@ -1434,7 +1444,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_BREV].intr_name = "llvm.AMDGPU.brev";
 	bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit;
 	bld_base->op_actions[TGSI_OPCODE_CEIL].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "ceil";
+	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32";
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp.";
 	bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem;
@@ -1443,7 +1453,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32";
 	bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "fabs";
+	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "llvm.fabs.f64";
 	bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64";
 	bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac;
@@ -1462,7 +1472,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
 	bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "floor";
+	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FMA].intr_name = "llvm.fma.f32";
 	bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac;

From c2a5d1dcb14acbd2db4a674453a8622d4b9a572a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 22:01:25 +0200
Subject: [PATCH 1012/1208] winsys/radeon: loosen up the requirements for how
 much memory IBs can use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 45eef294369..7e5fff4c1ba 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -379,11 +379,16 @@ static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
 static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    boolean status =
-        (cs->csc->used_gart + gtt) < cs->ws->info.gart_size * 0.7 &&
-        (cs->csc->used_vram + vram) < cs->ws->info.vram_size * 0.7;
 
-    return status;
+    vram += cs->csc->used_vram;
+    gtt += cs->csc->used_gart;
+
+    /* Anything that goes above the VRAM size should go to GTT. */
+    if (vram > cs->ws->info.vram_size)
+        gtt += vram - cs->ws->info.vram_size;
+
+    /* Now we just need to check if we have enough GTT. */
+    return gtt < cs->ws->info.gart_size * 0.7;
 }
 
 void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)

From 60159bcfc66a067b50da06f5cabfa20d72e898ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 3 Aug 2015 01:34:32 +0200
Subject: [PATCH 1013/1208] radeonsi: before storing tess levels, load them
 from LDS instead of temporary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also use only one store if stride <= 4.
All the fetches from and stores to temporaries can be removed now.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91461

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 134 ++++++++++-------------
 1 file changed, 56 insertions(+), 78 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 92382e85985..61d36430bcf 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -681,18 +681,8 @@ static LLVMValueRef fetch_output_tcs(
 		enum tgsi_opcode_type type, unsigned swizzle)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct tgsi_shader_info *info = &shader->selector->info;
-	unsigned name = info->output_semantic_name[reg->Register.Index];
 	LLVMValueRef dw_addr, stride;
 
-	/* Just read the local temp "output" register to get TESSOUTER/INNER. */
-	if (!reg->Register.Indirect &&
-	    (name == TGSI_SEMANTIC_TESSOUTER ||
-	     name == TGSI_SEMANTIC_TESSINNER)) {
-		return radeon_llvm_emit_fetch(bld_base, reg, type, swizzle);
-	}
-
 	if (reg->Register.Dimension) {
 		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
 		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
@@ -731,8 +721,6 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
 			     LLVMValueRef dst[4])
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct tgsi_shader_info *sinfo = &shader->selector->info;
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	unsigned chan_index;
 	LLVMValueRef dw_addr, stride;
@@ -746,14 +734,6 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
 		return;
 	}
 
-	/* Write tessellation levels to "output" temp registers.
-	 * Also write them to LDS as per-patch outputs (below).
-	 */
-	if (!reg->Register.Indirect &&
-	    (sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSINNER ||
-             sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSOUTER))
-		radeon_llvm_emit_store(bld_base, inst, info, dst);
-
 	if (reg->Register.Dimension) {
 		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
 		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
@@ -1854,58 +1834,78 @@ handle_semantic:
 	}
 }
 
-static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
-				  unsigned name, LLVMValueRef *out_ptr)
+/* This only writes the tessellation factor levels. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct si_shader *shader = si_shader_ctx->shader;
+	unsigned tess_inner_index, tess_outer_index;
+	LLVMValueRef lds_base, lds_inner, lds_outer;
 	LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
-	LLVMValueRef output, out[4];
+	LLVMValueRef out[6], vec0, vec1, invocation_id;
 	unsigned stride, outer_comps, inner_comps, i;
+	struct lp_build_if_state if_ctx;
 
-	if (name != TGSI_SEMANTIC_TESSOUTER &&
-	    name != TGSI_SEMANTIC_TESSINNER) {
-		assert(0);
-		return;
-	}
+	invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
 
+	/* Do this only for invocation 0, because the tess levels are per-patch,
+	 * not per-vertex.
+	 *
+	 * This can't jump, because invocation 0 executes this. It should
+	 * at least mask out the loads and stores for other invocations.
+	 */
+	lp_build_if(&if_ctx, gallivm,
+		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
+				  invocation_id, bld_base->uint_bld.zero, ""));
+
+	/* Determine the layout of one tess factor element in the buffer. */
 	switch (shader->key.tcs.prim_mode) {
 	case PIPE_PRIM_LINES:
-		stride = 2;
+		stride = 2; /* 2 dwords, 1 vec2 store */
 		outer_comps = 2;
 		inner_comps = 0;
 		break;
 	case PIPE_PRIM_TRIANGLES:
-		stride = 4;
+		stride = 4; /* 4 dwords, 1 vec4 store */
 		outer_comps = 3;
 		inner_comps = 1;
 		break;
 	case PIPE_PRIM_QUADS:
-		stride = 6;
+		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
 		outer_comps = 4;
 		inner_comps = 2;
 		break;
 	default:
 		assert(0);
+		return;
 	}
 
-	/* Load the outputs as i32. */
-	for (i = 0; i < 4; i++)
-		out[i] = LLVMBuildBitCast(gallivm->builder,
-				LLVMBuildLoad(gallivm->builder, out_ptr[i], ""),
-				bld_base->uint_bld.elem_type, "");
+	/* Load tess_inner and tess_outer from LDS.
+	 * Any invocation can write them, so we can't get them from a temporary.
+	 */
+	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
+	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
 
-	/* Convert the outputs to vectors. */
-	if (name == TGSI_SEMANTIC_TESSOUTER)
-		output = lp_build_gather_values(gallivm, out,
-						util_next_power_of_two(outer_comps));
-	else if (inner_comps > 1)
-		output = lp_build_gather_values(gallivm, out, inner_comps);
-	else if (inner_comps == 1)
-		output = out[0];
-	else
-		return;
+	lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+				 lp_build_const_int32(gallivm,
+						      tess_inner_index * 4), "");
+	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+				 lp_build_const_int32(gallivm,
+						      tess_outer_index * 4), "");
+
+	for (i = 0; i < outer_comps; i++)
+		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
+	for (i = 0; i < inner_comps; i++)
+		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
+
+	/* Convert the outputs to vectors for stores. */
+	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
+	vec1 = NULL;
+
+	if (stride > 4)
+		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
 	/* Get the buffer. */
 	rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1913,22 +1913,20 @@ static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
 	buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
 			lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
 
-	/* Get offsets. */
+	/* Get the offset. */
 	tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 			       SI_PARAM_TESS_FACTOR_OFFSET);
 	rel_patch_id = get_rel_patch_id(si_shader_ctx);
 	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
 				  lp_build_const_int32(gallivm, 4 * stride), "");
 
-	/* Store the output. */
-	if (name == TGSI_SEMANTIC_TESSOUTER) {
-		build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
-					   outer_comps, byteoffset, tf_base, 0);
-	} else if (inner_comps) {
-		build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
-					   inner_comps, byteoffset, tf_base,
-					   outer_comps * 4);
-	}
+	/* Store the outputs. */
+	build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0,
+				   MIN2(stride, 4), byteoffset, tf_base, 0);
+	if (vec1)
+		build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1,
+					   stride - 4, byteoffset, tf_base, 16);
+	lp_build_endif(&if_ctx);
 }
 
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
@@ -1962,26 +1960,6 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
 	}
 }
 
-static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context * bld_base)
-{
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct tgsi_shader_info *info = &shader->selector->info;
-	unsigned i;
-
-	/* Only write tessellation factors. Other outputs have already been
-	 * written to LDS by instructions. */
-	for (i = 0; i < info->num_outputs; i++) {
-		LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
-		unsigned name = info->output_semantic_name[i];
-
-		if (name == TGSI_SEMANTIC_TESSINNER ||
-		    name == TGSI_SEMANTIC_TESSOUTER) {
-			si_write_tess_factors(si_shader_ctx, name, out_ptr);
-		}
-	}
-}
-
 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);

From 6b47b8978101897cc0dab8f2017e3aa25d31582d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:06:17 +0200
Subject: [PATCH 1014/1208] gallium,hud: add support for Hz units in driver
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 6 ++++++
 src/gallium/include/pipe/p_defines.h    | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 4631cd3b323..e10152e69c5 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -240,6 +240,8 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       {"", " k", " M", " G", " T", " P", " E"};
    static const char *time_units[] =
       {" us", " ms", " s"};  /* based on microseconds */
+   static const char *hz_units[] =
+      {" Hz", " KHz", " MHz", " GHz"};
    const char *suffix;
    double divisor = (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? 1024 : 1000;
    int unit = 0;
@@ -262,6 +264,10 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       assert(unit < ARRAY_SIZE(byte_units));
       suffix = byte_units[unit];
       break;
+   case PIPE_DRIVER_QUERY_TYPE_HZ:
+      assert(unit < ARRAY_SIZE(hz_units));
+      suffix = hz_units[unit];
+      break;
    default:
       assert(unit < ARRAY_SIZE(metric_units));
       suffix = metric_units[unit];
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 68c536f1101..b124d8e4d13 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -762,6 +762,7 @@ union pipe_query_result
    /* PIPE_QUERY_PRIMITIVES_GENERATED */
    /* PIPE_QUERY_PRIMITIVES_EMITTED */
    /* PIPE_DRIVER_QUERY_TYPE_UINT64 */
+   /* PIPE_DRIVER_QUERY_TYPE_HZ */
    uint64_t u64;
 
    /* PIPE_DRIVER_QUERY_TYPE_UINT */
@@ -796,6 +797,7 @@ enum pipe_driver_query_type
    PIPE_DRIVER_QUERY_TYPE_PERCENTAGE   = 3,
    PIPE_DRIVER_QUERY_TYPE_BYTES        = 4,
    PIPE_DRIVER_QUERY_TYPE_MICROSECONDS = 5,
+   PIPE_DRIVER_QUERY_TYPE_HZ           = 6,
 };
 
 enum pipe_driver_query_group_type

From 130a03e360e6aebe93e86b1d522ebf22371aa2d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:08:29 +0200
Subject: [PATCH 1015/1208] gallium/hud: fix printing byte units
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index e10152e69c5..e0ca29da329 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -235,7 +235,7 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
                          char *out)
 {
    static const char *byte_units[] =
-      {"", " KB", " MB", " GB", " TB", " PB", " EB"};
+      {" B", " KB", " MB", " GB", " TB", " PB", " EB"};
    static const char *metric_units[] =
       {"", " k", " M", " G", " T", " P", " E"};
    static const char *time_units[] =

From 97a65d90fe88e6b4b4a42d866b23e73ce72f6dc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:24:30 +0200
Subject: [PATCH 1016/1208] gallium,hud: allow displaying cumulative values
 instead of average
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cumulative value is useful for queries like the number of shader
compilations.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c      |  9 +++++---
 src/gallium/auxiliary/hud/hud_driver_query.c | 22 ++++++++++++++++----
 src/gallium/auxiliary/hud/hud_private.h      |  3 ++-
 src/gallium/include/pipe/p_defines.h         | 10 +++++++++
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index e0ca29da329..decc055b21a 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -896,13 +896,15 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
                has_occlusion_query(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "samples-passed",
                                 PIPE_QUERY_OCCLUSION_COUNTER, 0, 0,
-                                PIPE_DRIVER_QUERY_TYPE_UINT64);
+                                PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
       }
       else if (strcmp(name, "primitives-generated") == 0 &&
                has_streamout(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "primitives-generated",
                                 PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0,
-                                PIPE_DRIVER_QUERY_TYPE_UINT64);
+                                PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
       }
       else {
          boolean processed = FALSE;
@@ -929,7 +931,8 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
             if (i < Elements(pipeline_statistics_names)) {
                hud_pipe_query_install(pane, hud->pipe, name,
                                       PIPE_QUERY_PIPELINE_STATISTICS, i,
-                                      0, PIPE_DRIVER_QUERY_TYPE_UINT64);
+                                      0, PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                      PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
                processed = TRUE;
             }
          }
diff --git a/src/gallium/auxiliary/hud/hud_driver_query.c b/src/gallium/auxiliary/hud/hud_driver_query.c
index c47d232842f..f14305ea835 100644
--- a/src/gallium/auxiliary/hud/hud_driver_query.c
+++ b/src/gallium/auxiliary/hud/hud_driver_query.c
@@ -43,6 +43,7 @@ struct query_info {
    struct pipe_context *pipe;
    unsigned query_type;
    unsigned result_index; /* unit depends on query_type */
+   enum pipe_driver_query_result_type result_type;
 
    /* Ring of queries. If a query is busy, we use another slot. */
    struct pipe_query *query[NUM_QUERIES];
@@ -108,8 +109,19 @@ query_new_value(struct hud_graph *gr)
       }
 
       if (info->num_results && info->last_time + gr->pane->period <= now) {
-         /* compute the average value across all frames */
-         hud_graph_add_value(gr, info->results_cumulative / info->num_results);
+         uint64_t value;
+
+         switch (info->result_type) {
+         default:
+         case PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE:
+            value = info->results_cumulative / info->num_results;
+            break;
+         case PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE:
+            value = info->results_cumulative;
+            break;
+         }
+
+         hud_graph_add_value(gr, value);
 
          info->last_time = now;
          info->results_cumulative = 0;
@@ -150,7 +162,8 @@ void
 hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                        const char *name, unsigned query_type,
                        unsigned result_index,
-                       uint64_t max_value, enum pipe_driver_query_type type)
+                       uint64_t max_value, enum pipe_driver_query_type type,
+                       enum pipe_driver_query_result_type result_type)
 {
    struct hud_graph *gr;
    struct query_info *info;
@@ -174,6 +187,7 @@ hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    info->pipe = pipe;
    info->query_type = query_type;
    info->result_index = result_index;
+   info->result_type = result_type;
 
    hud_pane_add_graph(pane, gr);
    if (pane->max_value < max_value)
@@ -207,7 +221,7 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
       return FALSE;
 
    hud_pipe_query_install(pane, pipe, query.name, query.query_type, 0,
-                          query.max_value.u64, query.type);
+                          query.max_value.u64, query.type, query.result_type);
 
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index 033be534a75..01caf7b8b2c 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -90,7 +90,8 @@ void hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                             const char *name, unsigned query_type,
                             unsigned result_index,
                             uint64_t max_value,
-                            enum pipe_driver_query_type type);
+                            enum pipe_driver_query_type type,
+                            enum pipe_driver_query_result_type result_type);
 boolean hud_driver_query_install(struct hud_pane *pane,
                                  struct pipe_context *pipe, const char *name);
 
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b124d8e4d13..85328264cc3 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -806,6 +806,15 @@ enum pipe_driver_query_group_type
    PIPE_DRIVER_QUERY_GROUP_TYPE_GPU = 1,
 };
 
+/* Whether an average value per frame or a cumulative value should be
+ * displayed.
+ */
+enum pipe_driver_query_result_type
+{
+   PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE = 0,
+   PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE = 1,
+};
+
 union pipe_numeric_type_union
 {
    uint64_t u64;
@@ -819,6 +828,7 @@ struct pipe_driver_query_info
    unsigned query_type; /* PIPE_QUERY_DRIVER_SPECIFIC + i */
    union pipe_numeric_type_union max_value; /* max value that can be returned */
    enum pipe_driver_query_type type;
+   enum pipe_driver_query_result_type result_type;
    unsigned group_id;
 };
 

From cbad30344d6e0b1ccc9fc8d5a8e6560e97dd9188 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 18:00:57 +0200
Subject: [PATCH 1017/1208] gallium/hud: replace assertions with clamping the
 unit index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 58 +++++++++++++------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index decc055b21a..d3ad271f515 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -242,43 +242,47 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       {" us", " ms", " s"};  /* based on microseconds */
    static const char *hz_units[] =
       {" Hz", " KHz", " MHz", " GHz"};
-   const char *suffix;
+   static const char *percent_units[] = {"%"};
+
+   const char **units;
+   unsigned max_unit;
    double divisor = (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? 1024 : 1000;
-   int unit = 0;
+   unsigned unit = 0;
    double d = num;
 
-   while (d > divisor) {
+   switch (type) {
+   case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
+      max_unit = ARRAY_SIZE(time_units)-1;
+      units = time_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_PERCENTAGE:
+      max_unit = ARRAY_SIZE(percent_units)-1;
+      units = percent_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_BYTES:
+      max_unit = ARRAY_SIZE(byte_units)-1;
+      units = byte_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_HZ:
+      max_unit = ARRAY_SIZE(hz_units)-1;
+      units = hz_units;
+      break;
+   default:
+      max_unit = ARRAY_SIZE(metric_units)-1;
+      units = metric_units;
+   }
+
+   while (d > divisor && unit < max_unit) {
       d /= divisor;
       unit++;
    }
 
-   switch (type) {
-   case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
-      assert(unit < ARRAY_SIZE(time_units));
-      suffix = time_units[unit];
-      break;
-   case PIPE_DRIVER_QUERY_TYPE_PERCENTAGE:
-      suffix = "%";
-      break;
-   case PIPE_DRIVER_QUERY_TYPE_BYTES:
-      assert(unit < ARRAY_SIZE(byte_units));
-      suffix = byte_units[unit];
-      break;
-   case PIPE_DRIVER_QUERY_TYPE_HZ:
-      assert(unit < ARRAY_SIZE(hz_units));
-      suffix = hz_units[unit];
-      break;
-   default:
-      assert(unit < ARRAY_SIZE(metric_units));
-      suffix = metric_units[unit];
-   }
-
    if (d >= 100 || d == (int)d)
-      sprintf(out, "%.0f%s", d, suffix);
+      sprintf(out, "%.0f%s", d, units[unit]);
    else if (d >= 10 || d*10 == (int)(d*10))
-      sprintf(out, "%.1f%s", d, suffix);
+      sprintf(out, "%.1f%s", d, units[unit]);
    else
-      sprintf(out, "%.2f%s", d, suffix);
+      sprintf(out, "%.2f%s", d, units[unit]);
 }
 
 static void

From 4e2a3e0376ca4fe39ca05e80557edfaa12e93e2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 18:11:09 +0200
Subject: [PATCH 1018/1208] gallium/hud: fix printing % next to panes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index d3ad271f515..275070d1664 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -332,7 +332,7 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
 
       number_to_human_readable(pane->max_value * i / 5,
                                pane->type, str);
-      hud_draw_string(hud, x, y, str);
+      hud_draw_string(hud, x, y, "%s", str);
    }
 
    /* draw info below the pane */

From dbfeb0ec12d6550e68de1bcd164e422e79bccf2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 18:11:55 +0200
Subject: [PATCH 1019/1208] gallium/hud: automatically print % if max_value ==
 100
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 275070d1664..95eed2698bc 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -231,8 +231,8 @@ hud_draw_string(struct hud_context *hud, unsigned x, unsigned y,
 }
 
 static void
-number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
-                         char *out)
+number_to_human_readable(uint64_t num, uint64_t max_value,
+                         enum pipe_driver_query_type type, char *out)
 {
    static const char *byte_units[] =
       {" B", " KB", " MB", " GB", " TB", " PB", " EB"};
@@ -268,8 +268,13 @@ number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
       units = hz_units;
       break;
    default:
-      max_unit = ARRAY_SIZE(metric_units)-1;
-      units = metric_units;
+      if (max_value == 100) {
+         max_unit = ARRAY_SIZE(percent_units)-1;
+         units = percent_units;
+      } else {
+         max_unit = ARRAY_SIZE(metric_units)-1;
+         units = metric_units;
+      }
    }
 
    while (d > divisor && unit < max_unit) {
@@ -330,7 +335,7 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
       unsigned y = pane->inner_y1 + pane->inner_height * (5 - i) / 5 -
                    hud->font.glyph_height / 2;
 
-      number_to_human_readable(pane->max_value * i / 5,
+      number_to_human_readable(pane->max_value * i / 5, pane->max_value,
                                pane->type, str);
       hud_draw_string(hud, x, y, "%s", str);
    }
@@ -341,7 +346,7 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
       unsigned x = pane->x1 + 2;
       unsigned y = pane->y2 + 2 + i*hud->font.glyph_height;
 
-      number_to_human_readable(gr->current_value,
+      number_to_human_readable(gr->current_value, pane->max_value,
                                pane->type, str);
       hud_draw_string(hud, x, y, "  %s: %s", gr->name, str);
       i++;

From 0257e1fbd24e2ab442996296e49e2ebe4c0f07b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:09:01 +0200
Subject: [PATCH 1020/1208] gallium/radeon: change some driver query types to
 Hz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 79b4b544004..48920e3a4c2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -698,8 +698,8 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 		{"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"temperature", R600_QUERY_GPU_TEMPERATURE, {100}},
-		{"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}},
-		{"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}},
+		{"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
+		{"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
 		{"GPU-load", R600_QUERY_GPU_LOAD, {100}}
 	};
 	unsigned num_queries;

From 18501ff468db2091fde6029f4ec674b8365513e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:47:38 +0200
Subject: [PATCH 1021/1208] gallium/radeon: switch the buffer-wait-time query
 to microseconds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This display the units in the HUD.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 2 +-
 src/gallium/drivers/radeon/r600_query.c       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 48920e3a4c2..86483a2cb79 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -692,7 +692,7 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 		{"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
 		{"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
-		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}},
+		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS},
 		{"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}},
 		{"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 7fc22537f43..11f838f38a7 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -472,7 +472,7 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 		rquery->begin_result = 0;
 		return true;
 	case R600_QUERY_BUFFER_WAIT_TIME:
-		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS);
+		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
 		return true;
 	case R600_QUERY_NUM_CS_FLUSHES:
 		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
@@ -534,7 +534,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY);
 		return;
 	case R600_QUERY_BUFFER_WAIT_TIME:
-		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS);
+		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
 		return;
 	case R600_QUERY_NUM_CS_FLUSHES:
 		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);

From 028528215a8a6d0a5945256cc67709eef2e68189 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 17:28:20 +0200
Subject: [PATCH 1022/1208] gallium/radeon: display cumulative results for some
 driver queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 86483a2cb79..571adaa7f45 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -692,9 +692,11 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 		{"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
 		{"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
-		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS},
+		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}},
-		{"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES},
+		{"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"temperature", R600_QUERY_GPU_TEMPERATURE, {100}},

From 70f5e49ba5ca8eb063a0d7db94fbef1585b21b2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 16:57:39 +0200
Subject: [PATCH 1023/1208] radeonsi: add a HUD query showing the number of
 compiler invocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c   | 4 +++-
 src/gallium/drivers/radeon/r600_pipe_common.h   | 6 ++++++
 src/gallium/drivers/radeon/r600_query.c         | 9 +++++++++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 1 +
 4 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 571adaa7f45..66cb4ed961c 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -689,6 +689,8 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct pipe_driver_query_info list[] = {
+		{"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
 		{"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
@@ -709,7 +711,7 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
 		num_queries = Elements(list);
 	else
-		num_queries = 8;
+		num_queries = 9;
 
 	if (!info)
 		return num_queries;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index fbd2a21da17..1a15cb92600 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -59,6 +59,7 @@
 #define R600_QUERY_CURRENT_GPU_SCLK	(PIPE_QUERY_DRIVER_SPECIFIC + 9)
 #define R600_QUERY_CURRENT_GPU_MCLK	(PIPE_QUERY_DRIVER_SPECIFIC + 10)
 #define R600_QUERY_GPU_LOAD		(PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define R600_QUERY_NUM_COMPILATIONS	(PIPE_QUERY_DRIVER_SPECIFIC + 12)
 
 #define R600_CONTEXT_STREAMOUT_FLUSH		(1u << 0)
 #define R600_CONTEXT_PRIVATE_FLAG		(1u << 1)
@@ -288,6 +289,11 @@ struct r600_common_screen {
 	uint32_t			*trace_ptr;
 	unsigned			cs_count;
 
+	/* This must be in the screen, because UE4 uses one context for
+	 * compilation and another one for rendering.
+	 */
+	unsigned			num_compilations;
+
 	/* GPU load thread. */
 	pipe_mutex			gpu_load_mutex;
 	pipe_thread			gpu_load_thread;
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 11f838f38a7..5f64ec660c6 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -92,6 +92,7 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
+	case R600_QUERY_NUM_COMPILATIONS:
 		return NULL;
 	}
 
@@ -408,6 +409,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
+	case R600_QUERY_NUM_COMPILATIONS:
 		skip_allocation = true;
 		break;
 	default:
@@ -483,6 +485,9 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 	case R600_QUERY_GPU_LOAD:
 		rquery->begin_result = r600_gpu_load_begin(rctx->screen);
 		return true;
+	case R600_QUERY_NUM_COMPILATIONS:
+		rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+		return true;
 	}
 
 	/* Discard the old query buffers. */
@@ -560,6 +565,9 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 	case R600_QUERY_GPU_LOAD:
 		rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result);
 		return;
+	case R600_QUERY_NUM_COMPILATIONS:
+		rquery->end_result = p_atomic_read(&rctx->screen->num_compilations);
+		return;
 	}
 
 	r600_emit_query_end(rctx, rquery);
@@ -619,6 +627,7 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
 	case R600_QUERY_GPU_TEMPERATURE:
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
+	case R600_QUERY_NUM_COMPILATIONS:
 		result->u64 = query->end_result - query->begin_result;
 		return TRUE;
 	case R600_QUERY_GPU_LOAD:
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 1a6854e069d..f2c8d6c898f 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -635,6 +635,7 @@ static int si_shader_select(struct pipe_context *ctx,
 		}
 		si_shader_init_pm4_state(shader);
 		sel->num_shaders++;
+		p_atomic_inc(&sctx->screen->b.num_compilations);
 	}
 
 	return 0;

From 30a7e0c021c3a77c20c6f041dc80b7dc90ad238f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 21:12:18 +0200
Subject: [PATCH 1024/1208] radeonsi: add a HUD query showing the number of
 shaders created
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c   | 2 ++
 src/gallium/drivers/radeon/r600_pipe_common.h   | 5 +++++
 src/gallium/drivers/radeon/r600_query.c         | 9 +++++++++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 1 +
 4 files changed, 17 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 66cb4ed961c..79e7457a02b 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -691,6 +691,8 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 	struct pipe_driver_query_info list[] = {
 		{"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
 		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
+		{"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
 		{"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 1a15cb92600..6a75c4cdb56 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -60,6 +60,7 @@
 #define R600_QUERY_CURRENT_GPU_MCLK	(PIPE_QUERY_DRIVER_SPECIFIC + 10)
 #define R600_QUERY_GPU_LOAD		(PIPE_QUERY_DRIVER_SPECIFIC + 11)
 #define R600_QUERY_NUM_COMPILATIONS	(PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define R600_QUERY_NUM_SHADERS_CREATED	(PIPE_QUERY_DRIVER_SPECIFIC + 13)
 
 #define R600_CONTEXT_STREAMOUT_FLUSH		(1u << 0)
 #define R600_CONTEXT_PRIVATE_FLAG		(1u << 1)
@@ -293,6 +294,10 @@ struct r600_common_screen {
 	 * compilation and another one for rendering.
 	 */
 	unsigned			num_compilations;
+	/* Along with ST_DEBUG=precompile, this should show if applications
+	 * are loading shaders on demand. This is a monotonic counter.
+	 */
+	unsigned			num_shaders_created;
 
 	/* GPU load thread. */
 	pipe_mutex			gpu_load_mutex;
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 5f64ec660c6..de1cdf0fbd3 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -93,6 +93,7 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
 	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		return NULL;
 	}
 
@@ -410,6 +411,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
 	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		skip_allocation = true;
 		break;
 	default:
@@ -488,6 +490,9 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 	case R600_QUERY_NUM_COMPILATIONS:
 		rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations);
 		return true;
+	case R600_QUERY_NUM_SHADERS_CREATED:
+		rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		return true;
 	}
 
 	/* Discard the old query buffers. */
@@ -568,6 +573,9 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 	case R600_QUERY_NUM_COMPILATIONS:
 		rquery->end_result = p_atomic_read(&rctx->screen->num_compilations);
 		return;
+	case R600_QUERY_NUM_SHADERS_CREATED:
+		rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		return;
 	}
 
 	r600_emit_query_end(rctx, rquery);
@@ -628,6 +636,7 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		result->u64 = query->end_result - query->begin_result;
 		return TRUE;
 	case R600_QUERY_GPU_LOAD:
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index f2c8d6c898f..fbcb0f4a1d8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -653,6 +653,7 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 	sel->tokens = tgsi_dup_tokens(state->tokens);
 	sel->so = state->stream_output;
 	tgsi_scan_shader(state->tokens, &sel->info);
+	p_atomic_inc(&sscreen->b.num_shaders_created);
 
 	switch (pipe_shader_type) {
 	case PIPE_SHADER_GEOMETRY:

From 3d551c5c7036b650124f23e4e2e3f40b9a8ad426 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 5 Aug 2015 18:51:24 -0700
Subject: [PATCH 1025/1208] i965: Request a miptree with no tiling
 intel_miptree_map_blit().

Regression since commit 3a31876600, when tiling modes were moved into
layout_flags.

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 5e7859cf990..cb2791db02f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2149,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, 0);
+                                  0, MIPTREE_LAYOUT_ALLOC_LINEAR);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");

From 1c175fc2e3a685b531920dec247086463ab9a154 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 4 Aug 2015 22:58:08 -0700
Subject: [PATCH 1026/1208] i965: Correct a mistake that always forced texture
 tiling.

Regression since commit 3a31876600, when tiling modes were moved into
layout_flags.

The relevant enum values are

   MIPTREE_LAYOUT_ALLOC_YTILED = 1 << 5
   MIPTREE_LAYOUT_ALLOC_XTILED = 1 << 6
   MIPTREE_LAYOUT_ALLOC_ANY_TILED = MIPTREE_LAYOUT_ALLOC_YTILED |
                                    MIPTREE_LAYOUT_ALLOC_XTILED
   MIPTREE_LAYOUT_ALLOC_LINEAR = 1 << 7

so the expression (layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) can
never produce a value of MIPTREE_LAYOUT_ALLOC_LINEAR.

The enum this replaced was

   enum intel_miptree_tiling_mode {
      INTEL_MIPTREE_TILING_ANY,
      INTEL_MIPTREE_TILING_Y,
      INTEL_MIPTREE_TILING_NONE,
   };

where "ANY" means "Y" or "NONE" (i.e., linear). As such, remove the
unused (and worse, unhandled) MIPTREE_LAYOUT_ALLOC_XTILED and redefine
MIPTREE_LAYOUT_ALLOC_ANY_TILED to mean what it did before.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91513
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 89fdccb1730..506c25846cf 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -537,13 +537,11 @@ enum {
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
 
    MIPTREE_LAYOUT_ALLOC_YTILED             = 1 << 5,
-   MIPTREE_LAYOUT_ALLOC_XTILED             = 1 << 6,
-   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 7,
+   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 6,
+   MIPTREE_LAYOUT_ALLOC_ANY_TILED          = MIPTREE_LAYOUT_ALLOC_YTILED |
+                                             MIPTREE_LAYOUT_ALLOC_LINEAR,
 };
 
-#define MIPTREE_LAYOUT_ALLOC_ANY_TILED (MIPTREE_LAYOUT_ALLOC_YTILED | \
-                                        MIPTREE_LAYOUT_ALLOC_XTILED)
-
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,

From 9f78e27fc60b3473b708ab4ca04e4ebd6be6cf4e Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 6 Aug 2015 10:59:15 -0700
Subject: [PATCH 1027/1208] i965: Rename MIPTREE_LAYOUT_ALLOC_* ->
 MIPTREE_LAYOUT_TILING_*.

Ben suggested that I rename MIPTREE_LAYOUT_ALLOC_ANY_TILED since it
needed to include no tiling at all, but the name
MIPTREE_LAYOUT_ALLOC_ANY is pretty nondescriptive. We can avoid
confusion by replacing "ALLOC" with "TILING" in the identifiers.

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c     |  8 ++++----
 src/mesa/drivers/dri/i965/intel_fbo.c          |  2 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c  | 16 ++++++++--------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h  |  8 ++++----
 src/mesa/drivers/dri/i965/intel_tex.c          |  2 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c    |  2 +-
 src/mesa/drivers/dri/i965/intel_tex_validate.c |  2 +-
 7 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index fb78b08b649..c9ee5265a1d 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -630,12 +630,12 @@ brw_miptree_choose_tiling(struct brw_context *brw,
    /* Some usages may want only one type of tiling, like depth miptrees (Y
     * tiled), or temporary BOs for uploading data once (linear).
     */
-   switch (layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) {
-   case MIPTREE_LAYOUT_ALLOC_ANY_TILED:
+   switch (layout_flags & MIPTREE_LAYOUT_TILING_ANY) {
+   case MIPTREE_LAYOUT_TILING_ANY:
       break;
-   case MIPTREE_LAYOUT_ALLOC_YTILED:
+   case MIPTREE_LAYOUT_TILING_Y:
       return I915_TILING_Y;
-   case MIPTREE_LAYOUT_ALLOC_LINEAR:
+   case MIPTREE_LAYOUT_TILING_NONE:
       return I915_TILING_NONE;
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 87ba624b70d..72648b01e33 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1023,7 +1023,7 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
    int width, height, depth;
 
    uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                           MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+                           MIPTREE_LAYOUT_TILING_ANY;
 
    intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth);
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index cb2791db02f..e85c3f00c7b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -455,7 +455,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
       if (brw->gen == 6) {
          stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD |
-                          MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+                          MIPTREE_LAYOUT_TILING_ANY;
       }
 
       mt->stencil_mt = intel_miptree_create(brw,
@@ -759,8 +759,8 @@ intel_miptree_create_for_bo(struct brw_context *brw,
    /* The BO already has a tiling format and we shouldn't confuse the lower
     * layers by making it try to find a tiling format again.
     */
-   assert((layout_flags & MIPTREE_LAYOUT_ALLOC_ANY_TILED) == 0);
-   assert((layout_flags & MIPTREE_LAYOUT_ALLOC_LINEAR) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_TILING_ANY) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_TILING_NONE) == 0);
 
    layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
@@ -874,7 +874,7 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
    const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                 MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+                                 MIPTREE_LAYOUT_TILING_ANY;
 
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
@@ -1385,7 +1385,7 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
     *     "The MCS surface must be stored as Tile Y."
     */
    const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                              MIPTREE_LAYOUT_ALLOC_YTILED;
+                              MIPTREE_LAYOUT_TILING_Y;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1444,7 +1444,7 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
    uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                           MIPTREE_LAYOUT_ALLOC_YTILED;
+                           MIPTREE_LAYOUT_TILING_Y;
    if (brw->gen >= 8) {
       layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
    }
@@ -1709,7 +1709,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
    if (!buf)
       return NULL;
 
-   layout_flags |= MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+   layout_flags |= MIPTREE_LAYOUT_TILING_ANY;
    buf->mt = intel_miptree_create(brw,
                                   mt->target,
                                   mt->format,
@@ -2149,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, MIPTREE_LAYOUT_ALLOC_LINEAR);
+                                  0, MIPTREE_LAYOUT_TILING_NONE);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 506c25846cf..790d3129207 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -536,10 +536,10 @@ enum {
    MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
 
-   MIPTREE_LAYOUT_ALLOC_YTILED             = 1 << 5,
-   MIPTREE_LAYOUT_ALLOC_LINEAR             = 1 << 6,
-   MIPTREE_LAYOUT_ALLOC_ANY_TILED          = MIPTREE_LAYOUT_ALLOC_YTILED |
-                                             MIPTREE_LAYOUT_ALLOC_LINEAR,
+   MIPTREE_LAYOUT_TILING_Y                 = 1 << 5,
+   MIPTREE_LAYOUT_TILING_NONE              = 1 << 6,
+   MIPTREE_LAYOUT_TILING_ANY               = MIPTREE_LAYOUT_TILING_Y |
+                                             MIPTREE_LAYOUT_TILING_NONE,
 };
 
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 8fa5e3cd55a..e16b0def0d4 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               0, levels - 1,
                                               width, height, depth,
                                               num_samples,
-                                              MIPTREE_LAYOUT_ALLOC_ANY_TILED);
+                                              MIPTREE_LAYOUT_TILING_ANY);
 
       if (intel_texobj->mt == NULL) {
          return false;
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 3611280012e..93a8cdee0cb 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -80,7 +80,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       height,
 			       depth,
                                intelImage->base.Base.NumSamples,
-                               layout_flags | MIPTREE_LAYOUT_ALLOC_ANY_TILED);
+                               layout_flags | MIPTREE_LAYOUT_TILING_ANY);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 6ebf381e626..d3fb252b5d5 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -137,7 +137,7 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                  width, height, depth, validate_last_level + 1);
 
       const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                    MIPTREE_LAYOUT_ALLOC_ANY_TILED;
+                                    MIPTREE_LAYOUT_TILING_ANY;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,

From 6d6208a431f6a01a22f892c71258fd3567d969b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 6 May 2015 19:34:09 +0200
Subject: [PATCH 1028/1208] radeonsi: don't crash when cleaning up after an
 incomplete context

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 44735575c1d..73797976b5b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -56,18 +56,22 @@ static void si_destroy_context(struct pipe_context *context)
 
 	if (sctx->pstipple_sampler_state)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
-	if (sctx->dummy_pixel_shader) {
+	if (sctx->dummy_pixel_shader)
 		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
-	}
 	if (sctx->fixed_func_tcs_shader)
 		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader);
-	sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear);
+	if (sctx->custom_dsa_flush)
+		sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
+	if (sctx->custom_blend_resolve)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
+	if (sctx->custom_blend_decompress)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
+	if (sctx->custom_blend_fastclear)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear);
 	util_unreference_framebuffer_state(&sctx->framebuffer.state);
 
-	util_blitter_destroy(sctx->blitter);
+	if (sctx->blitter)
+		util_blitter_destroy(sctx->blitter);
 
 	si_pm4_cleanup(sctx);
 

From 0615ad1c70777b515d00aa5b0c41b1073ad5a2d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 14:49:34 +0200
Subject: [PATCH 1029/1208] radeonsi: don't count the exact needed CS space if
 the CS is large enough

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 3203ceb61fd..153d3d89f8c 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -30,8 +30,17 @@
 void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 			boolean count_draw_in)
 {
+	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 	int i;
 
+	/* If the CS is sufficiently large, don't count the space needed
+	 * and just flush if there is less than 8096 dwords left. */
+	if (cs->max_dw >= 24 * 1024) {
+		if (cs->cdw > cs->max_dw - 8 * 1024)
+			ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		return;
+	}
+
 	/* There are two memory usage counters in the winsys for all buffers
 	 * that have been added (cs_add_reloc) and two counters in the pipe
 	 * driver for those that haven't been added yet.
@@ -46,7 +55,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 	ctx->b.vram = 0;
 
 	/* The number of dwords we already used in the CS so far. */
-	num_dw += ctx->b.rings.gfx.cs->cdw;
+	num_dw += cs->cdw;
 
 	if (count_draw_in) {
 		for (i = 0; i < SI_NUM_ATOMS(ctx); i++) {
@@ -86,7 +95,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 #endif
 
 	/* Flush if there's not enough space. */
-	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
+	if (num_dw > cs->max_dw) {
 		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }

From a3e81f819c20dd50d551de9b7e1280b2bd9c18de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Jul 2015 14:40:00 +0200
Subject: [PATCH 1030/1208] radeonsi: always flush framebuffer caches at the
 beginning of IBs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

better safe than sorry

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 153d3d89f8c..8658056d15e 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -164,7 +164,8 @@ void si_context_gfx_flush(void *context, unsigned flags,
 void si_begin_new_cs(struct si_context *ctx)
 {
 	/* Flush read caches at the beginning of CS. */
-	ctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
+	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+			SI_CONTEXT_INV_TC_L1 |
 			SI_CONTEXT_INV_TC_L2 |
 			SI_CONTEXT_INV_KCACHE |
 			SI_CONTEXT_INV_ICACHE;

From a3723fb9e32ab114dcffcf74946def92647c5f03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 20 Jul 2015 00:15:59 +0200
Subject: [PATCH 1031/1208] gallium/radeon: add DRM and LLVM version to the
 renderer string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 26 ++++++++++++++++---
 src/gallium/drivers/radeon/r600_pipe_common.h |  2 ++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 79e7457a02b..4c29f5235e1 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -365,11 +365,9 @@ static const char* r600_get_device_vendor(struct pipe_screen* pscreen)
 	return "AMD";
 }
 
-static const char* r600_get_name(struct pipe_screen* pscreen)
+static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 {
-	struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
-
-	switch (rscreen->family) {
+	switch (rscreen->info.family) {
 	case CHIP_R600: return "AMD R600";
 	case CHIP_RV610: return "AMD RV610";
 	case CHIP_RV630: return "AMD RV630";
@@ -409,6 +407,13 @@ static const char* r600_get_name(struct pipe_screen* pscreen)
 	}
 }
 
+static const char* r600_get_name(struct pipe_screen* pscreen)
+{
+	struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
+
+	return rscreen->renderer_string;
+}
+
 static float r600_get_paramf(struct pipe_screen* pscreen,
 			     enum pipe_capf param)
 {
@@ -868,8 +873,21 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
 			     struct radeon_winsys *ws)
 {
+	char llvm_string[32] = {};
+
 	ws->query_info(ws, &rscreen->info);
 
+	if (HAVE_LLVM)
+		snprintf(llvm_string, sizeof(llvm_string),
+			 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
+
+	snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
+		 "%s (DRM %i.%i.%i%s)",
+		 r600_get_chip_name(rscreen), rscreen->info.drm_major,
+		 rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
+		 llvm_string);
+
 	rscreen->b.get_name = r600_get_name;
 	rscreen->b.get_vendor = r600_get_vendor;
 	rscreen->b.get_device_vendor = r600_get_device_vendor;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 6a75c4cdb56..dbd82880583 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -305,6 +305,8 @@ struct r600_common_screen {
 	unsigned			gpu_load_counter_busy;
 	unsigned			gpu_load_counter_idle;
 	volatile unsigned		gpu_load_stop_thread; /* bool */
+
+	char				renderer_string[64];
 };
 
 /* This encapsulates a state or an operation which can emitted into the GPU

From 8118d3719aee5fdf313c33dbf3256dd78ff46bea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 3 Aug 2015 21:43:36 +0200
Subject: [PATCH 1032/1208] radeonsi: rename enable_s3tc ->
 enable_compressed_formats

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 867dbc68633..a7aa0624e02 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1173,7 +1173,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 				       int first_non_void)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	bool enable_s3tc = sscreen->b.info.drm_minor >= 31;
+	bool enable_compressed_formats = sscreen->b.info.drm_minor >= 31;
 	boolean uniform = TRUE;
 	int i;
 
@@ -1216,7 +1216,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		switch (format) {
@@ -1236,7 +1236,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		switch (format) {
@@ -1265,8 +1265,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		if (!util_format_s3tc_enabled) {

From 592ce6e2d1b2c804a95cb00c06e7bbb9d83f554b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 6 Aug 2015 23:41:38 +0200
Subject: [PATCH 1033/1208] gallium/radeon: unify buffer_wait and
 buffer_is_busy in the winsys interface

The timeout parameter covers both cases.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/r300/r300_query.c         |  5 +-
 src/gallium/drivers/r300/r300_screen_buffer.c |  2 +-
 src/gallium/drivers/r300/r300_transfer.c      |  2 +-
 .../drivers/radeon/r600_buffer_common.c       |  6 +-
 src/gallium/drivers/radeon/r600_query.c       |  2 +-
 src/gallium/drivers/radeon/r600_texture.c     |  2 +-
 src/gallium/drivers/radeon/radeon_winsys.h    | 23 +++----
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 61 +++++++++----------
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  6 +-
 9 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 6a4cd71b169..4dd8156f616 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -146,10 +146,11 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
 
     if (q->type == PIPE_QUERY_GPU_FINISHED) {
         if (wait) {
-            r300->rws->buffer_wait(q->buf, RADEON_USAGE_READWRITE);
+            r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE,
+                                   RADEON_USAGE_READWRITE);
             vresult->b = TRUE;
         } else {
-            vresult->b = !r300->rws->buffer_is_busy(q->buf, RADEON_USAGE_READWRITE);
+            vresult->b = r300->rws->buffer_wait(q->buf, 0, RADEON_USAGE_READWRITE);
         }
         return vresult->b;
     }
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index de557b57776..6451a2c8df2 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -96,7 +96,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
 
         /* Check if mapping this buffer would cause waiting for the GPU. */
         if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) ||
-            r300->rws->buffer_is_busy(rbuf->buf, RADEON_USAGE_READWRITE)) {
+            !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) {
             unsigned i;
             struct pb_buffer *new_buf;
 
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index 4fa360a042d..44303792f51 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -120,7 +120,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
         referenced_hw = TRUE;
     } else {
         referenced_hw =
-            r300->rws->buffer_is_busy(tex->buf, RADEON_USAGE_READWRITE);
+            !r300->rws->buffer_wait(tex->buf, 0, RADEON_USAGE_READWRITE);
     }
 
     trans = CALLOC_STRUCT(r300_transfer);
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index fc5f6c29870..5b5c06362a7 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -84,7 +84,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		}
 	}
 
-	if (busy || ctx->ws->buffer_is_busy(resource->buf, rusage)) {
+	if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			return NULL;
 		} else {
@@ -274,7 +274,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
 		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
-		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
+		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
 		}
 		/* At this point, the buffer is always idle. */
@@ -288,7 +288,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
 		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
-		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
+		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
 			struct r600_resource *staging = NULL;
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index de1cdf0fbd3..7057aa19a7c 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -505,7 +505,7 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 
 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
 	if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
-	    rctx->ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) {
+	    !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
 		pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
 		rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type);
 	}
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index d3f3e63f219..d05b0a102c0 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -941,7 +941,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
 		use_staging_texture = TRUE;
 	} else if (!(usage & PIPE_TRANSFER_READ) &&
 	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
-	     rctx->ws->buffer_is_busy(rtex->resource.buf, RADEON_USAGE_READWRITE))) {
+	     !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
 		/* Use a staging texture for uploads if the underlying BO is busy. */
 		use_staging_texture = TRUE;
 	}
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index c79a0823ad0..ea1e1970157 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -398,24 +398,15 @@ struct radeon_winsys {
     void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf);
 
     /**
-     * Return TRUE if a buffer object is being used by the GPU.
+     * Wait for the buffer and return true if the buffer is not used
+     * by the device.
      *
-     * \param buf       A winsys buffer object.
-     * \param usage     Only check whether the buffer is busy for the given usage.
+     * The timeout of 0 will only return the status.
+     * The timeout of PIPE_TIMEOUT_INFINITE will always wait until the buffer
+     * is idle.
      */
-    boolean (*buffer_is_busy)(struct pb_buffer *buf,
-                              enum radeon_bo_usage usage);
-
-    /**
-     * Wait for a buffer object until it is not used by a GPU. This is
-     * equivalent to a fence placed after the last command using the buffer,
-     * and synchronizing to the fence.
-     *
-     * \param buf       A winsys buffer object to wait for.
-     * \param usage     Only wait until the buffer is idle for the given usage,
-     *                  but may still be busy for some other usage.
-     */
-    void (*buffer_wait)(struct pb_buffer *buf, enum radeon_bo_usage usage);
+    bool (*buffer_wait)(struct pb_buffer *buf, uint64_t timeout,
+                        enum radeon_bo_usage usage);
 
     /**
      * Return tiling flags describing a memory layout of a buffer object.
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index ac37b24b6bb..ecaeb3bc29c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -101,33 +101,30 @@ static struct radeon_bo *get_radeon_bo(struct pb_buffer *_buf)
     return bo;
 }
 
-static void radeon_bo_wait(struct pb_buffer *_buf, enum radeon_bo_usage usage)
+static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
+                           enum radeon_bo_usage usage)
 {
-    struct radeon_bo *bo = get_radeon_bo(_buf);
-    struct drm_radeon_gem_wait_idle args = {0};
+   struct radeon_bo *bo = get_radeon_bo(_buf);
 
-    while (p_atomic_read(&bo->num_active_ioctls)) {
-        sched_yield();
+   /* Wait if any ioctl is being submitted with this buffer. */
+   if (!os_wait_until_zero(&bo->num_active_ioctls, timeout))
+      return false;
+
+   /* TODO: handle arbitrary timeout */
+    if (!timeout) {
+        struct drm_radeon_gem_busy args = {0};
+
+        args.handle = bo->handle;
+        return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
+                                   &args, sizeof(args)) == 0;
+    } else {
+        struct drm_radeon_gem_wait_idle args = {0};
+
+        args.handle = bo->handle;
+        while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
+                               &args, sizeof(args)) == -EBUSY);
+        return true;
     }
-
-    args.handle = bo->handle;
-    while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
-                           &args, sizeof(args)) == -EBUSY);
-}
-
-static boolean radeon_bo_is_busy(struct pb_buffer *_buf,
-                                 enum radeon_bo_usage usage)
-{
-    struct radeon_bo *bo = get_radeon_bo(_buf);
-    struct drm_radeon_gem_busy args = {0};
-
-    if (p_atomic_read(&bo->num_active_ioctls)) {
-        return TRUE;
-    }
-
-    args.handle = bo->handle;
-    return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
-                               &args, sizeof(args)) != 0;
 }
 
 static enum radeon_bo_domain get_valid_domain(enum radeon_bo_domain domain)
@@ -410,8 +407,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     return NULL;
                 }
 
-                if (radeon_bo_is_busy((struct pb_buffer*)bo,
-                                      RADEON_USAGE_WRITE)) {
+                if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
+                                    RADEON_USAGE_WRITE)) {
                     return NULL;
                 }
             } else {
@@ -420,8 +417,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     return NULL;
                 }
 
-                if (radeon_bo_is_busy((struct pb_buffer*)bo,
-                                      RADEON_USAGE_READWRITE)) {
+                if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
+                                    RADEON_USAGE_READWRITE)) {
                     return NULL;
                 }
             }
@@ -439,7 +436,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                 if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
                     cs->flush_cs(cs->flush_data, 0, NULL);
                 }
-                radeon_bo_wait((struct pb_buffer*)bo,
+                radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                                RADEON_USAGE_WRITE);
             } else {
                 /* Mapping for write. */
@@ -453,7 +450,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     }
                 }
 
-                radeon_bo_wait((struct pb_buffer*)bo, RADEON_USAGE_READWRITE);
+                radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                               RADEON_USAGE_READWRITE);
             }
 
             bo->mgr->rws->buffer_wait_time += os_time_get_nano() - time;
@@ -644,7 +642,7 @@ static boolean radeon_bomgr_is_buffer_busy(struct pb_manager *_mgr,
        return TRUE;
    }
 
-   if (radeon_bo_is_busy((struct pb_buffer*)bo, RADEON_USAGE_READWRITE)) {
+   if (!radeon_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
        return TRUE;
    }
 
@@ -1166,7 +1164,6 @@ void radeon_bomgr_init_functions(struct radeon_drm_winsys *ws)
     ws->base.buffer_map = radeon_bo_map;
     ws->base.buffer_unmap = radeon_bo_unmap;
     ws->base.buffer_wait = radeon_bo_wait;
-    ws->base.buffer_is_busy = radeon_bo_is_busy;
     ws->base.buffer_create = radeon_winsys_bo_create;
     ws->base.buffer_from_handle = radeon_winsys_bo_from_handle;
     ws->base.buffer_from_ptr = radeon_winsys_bo_from_ptr;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 7e5fff4c1ba..6067bac36b8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -637,7 +637,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
     struct pb_buffer *rfence = (struct pb_buffer*)fence;
 
     if (timeout == 0)
-        return !ws->buffer_is_busy(rfence, RADEON_USAGE_READWRITE);
+        return ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE);
 
     if (timeout != PIPE_TIMEOUT_INFINITE) {
         int64_t start_time = os_time_get();
@@ -646,7 +646,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
         timeout /= 1000;
 
         /* Wait in a loop. */
-        while (ws->buffer_is_busy(rfence, RADEON_USAGE_READWRITE)) {
+        while (!ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE)) {
             if (os_time_get() - start_time >= timeout) {
                 return FALSE;
             }
@@ -655,7 +655,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
         return TRUE;
     }
 
-    ws->buffer_wait(rfence, RADEON_USAGE_READWRITE);
+    ws->buffer_wait(rfence, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
     return TRUE;
 }
 

From 42d9f6323a523d786fc3797587fdf63048becceb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Apr 2015 16:07:12 +0200
Subject: [PATCH 1034/1208] winsys/radeon: add an interface for contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same idea as in libdrm_amdgpu.

A command stream can only be created for a specific context and it's always
submitted to that context.

This will mainly be used by amdgpu and it's required by the GPU reset status
query too.
(radeon only has a basic version of the query and thus doesn't need this)

Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/r300/r300_context.c       |  8 +++++++-
 src/gallium/drivers/r300/r300_context.h       |  2 ++
 src/gallium/drivers/r600/r600_pipe.c          |  2 +-
 src/gallium/drivers/radeon/r600_pipe_common.c | 14 +++++++++-----
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeon/radeon_uvd.c       |  3 ++-
 src/gallium/drivers/radeon/radeon_vce.c       |  3 ++-
 src/gallium/drivers/radeon/radeon_winsys.h    | 16 ++++++++++++++--
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 18 ++++++++++++++++--
 10 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index c35aa3b24aa..8c24ad6d98a 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -94,6 +94,8 @@ static void r300_destroy_context(struct pipe_context* context)
 
     if (r300->cs)
         r300->rws->cs_destroy(r300->cs);
+    if (r300->ctx)
+        r300->rws->ctx_destroy(r300->ctx);
 
     rc_destroy_regalloc_state(&r300->fs_regalloc_state);
 
@@ -382,7 +384,11 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
                      sizeof(struct pipe_transfer), 64,
                      UTIL_SLAB_SINGLETHREADED);
 
-    r300->cs = rws->cs_create(rws, RING_GFX, r300_flush_callback, r300, NULL);
+    r300->ctx = rws->ctx_create(rws);
+    if (!r300->ctx)
+        goto fail;
+
+    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL);
     if (r300->cs == NULL)
         goto fail;
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 5a58500b074..18ae11a3a24 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -449,6 +449,8 @@ struct r300_context {
 
     /* The interface to the windowing system, etc. */
     struct radeon_winsys *rws;
+    /* The submission context. */
+    struct radeon_winsys_ctx *ctx;
     /* The command stream. */
     struct radeon_winsys_cs *cs;
     /* Screen. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index e755784209a..f02014e17f0 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -176,7 +176,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 		goto fail;
 	}
 
-	rctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX,
+	rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
 					     r600_context_gfx_flush, rctx,
 					     rscreen->b.trace_bo ?
 						     rscreen->b.trace_bo->cs_buf : NULL);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 4c29f5235e1..3f1c0f0eae9 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -262,8 +262,12 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->uploader)
 		return false;
 
+	rctx->ctx = rctx->ws->ctx_create(rctx->ws);
+	if (!rctx->ctx)
+		return false;
+
 	if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ws, RING_DMA,
+		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 							 r600_flush_dma_ring,
 							 rctx, NULL);
 		rctx->rings.dma.flush = r600_flush_dma_ring;
@@ -274,12 +278,12 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
-	if (rctx->rings.gfx.cs) {
+	if (rctx->rings.gfx.cs)
 		rctx->ws->cs_destroy(rctx->rings.gfx.cs);
-	}
-	if (rctx->rings.dma.cs) {
+	if (rctx->rings.dma.cs)
 		rctx->ws->cs_destroy(rctx->rings.dma.cs);
-	}
+	if (rctx->ctx)
+		rctx->ws->ctx_destroy(rctx->ctx);
 
 	if (rctx->uploader) {
 		u_upload_destroy(rctx->uploader);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index dbd82880583..ce3f396011f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -372,6 +372,7 @@ struct r600_common_context {
 
 	struct r600_common_screen	*screen;
 	struct radeon_winsys		*ws;
+	struct radeon_winsys_ctx	*ctx;
 	enum radeon_family		family;
 	enum chip_class			chip_class;
 	struct r600_rings		rings;
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index be58d0b9ce3..79fc0c726c4 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -761,6 +761,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 {
 	struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws;
 	unsigned dpb_size = calc_dpb_size(templ);
+	struct r600_common_context *rctx = (struct r600_common_context*)context;
 	unsigned width = templ->width, height = templ->height;
 	unsigned bs_buf_size;
 	struct radeon_info info;
@@ -807,7 +808,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->stream_handle = rvid_alloc_stream_handle();
 	dec->screen = context->screen;
 	dec->ws = ws;
-	dec->cs = ws->cs_create(ws, RING_UVD, NULL, NULL, NULL);
+	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL);
 	if (!dec->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index a6567379fe3..4557ce55556 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -377,6 +377,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 					     rvce_get_buffer get_buffer)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen *)context->screen;
+	struct r600_common_context *rctx = (struct r600_common_context*)context;
 	struct rvce_encoder *enc;
 	struct pipe_video_buffer *tmp_buf, templat = {};
 	struct radeon_surf *tmp_surf;
@@ -411,7 +412,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	enc->screen = context->screen;
 	enc->ws = ws;
-	enc->cs = ws->cs_create(ws, RING_VCE, rvce_cs_flush, enc, NULL);
+	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL);
 	if (!enc->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index ea1e1970157..93618ccbc1a 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -191,6 +191,7 @@ enum radeon_bo_priority {
 
 struct winsys_handle;
 struct radeon_winsys_cs_handle;
+struct radeon_winsys_ctx;
 
 struct radeon_winsys_cs {
     unsigned                    cdw;  /* Number of used dwords. */
@@ -505,16 +506,27 @@ struct radeon_winsys {
      * commands independently of other contexts.
      *************************************************************************/
 
+    /**
+     * Create a command submission context.
+     * Various command streams can be submitted to the same context.
+     */
+    struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws);
+
+    /**
+     * Destroy a context.
+     */
+    void (*ctx_destroy)(struct radeon_winsys_ctx *ctx);
+
     /**
      * Create a command stream.
      *
-     * \param ws        The winsys this function is called from.
+     * \param ctx       The submission context
      * \param ring_type The ring type (GFX, DMA, UVD)
      * \param flush     Flush callback function associated with the command stream.
      * \param user      User pointer that will be passed to the flush callback.
      * \param trace_buf Trace buffer when tracing is enabled
      */
-    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws,
+    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                           enum ring_type ring_type,
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 73797976b5b..cacef9f0ae7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -118,7 +118,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, si_context_gfx_flush,
+	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
 					     sctx, sscreen->b.trace_bo ?
 						sscreen->b.trace_bo->cs_buf : NULL);
 	sctx->b.rings.gfx.flush = si_context_gfx_flush;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 6067bac36b8..5fde875c34c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -80,6 +80,18 @@ radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
 static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                    struct pipe_fence_handle *src);
 
+static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
+{
+    /* No context support here. Just return the winsys pointer
+     * as the "context". */
+    return (struct radeon_winsys_ctx*)ws;
+}
+
+static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
+{
+    /* No context support here. */
+}
+
 static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                       struct radeon_drm_winsys *ws)
 {
@@ -158,14 +170,14 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
 
 
 static struct radeon_winsys_cs *
-radeon_drm_cs_create(struct radeon_winsys *rws,
+radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
                      void *flush_ctx,
                      struct radeon_winsys_cs_handle *trace_buf)
 {
-    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
+    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
 
     cs = CALLOC_STRUCT(radeon_drm_cs);
@@ -667,6 +679,8 @@ static void radeon_fence_reference(struct pipe_fence_handle **dst,
 
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
 {
+    ws->base.ctx_create = radeon_drm_ctx_create;
+    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
     ws->base.cs_create = radeon_drm_cs_create;
     ws->base.cs_destroy = radeon_drm_cs_destroy;
     ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;

From 6dea2456ca82d2c62afbd90327d265c5e78fca9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 15 Jul 2015 21:14:24 +0200
Subject: [PATCH 1035/1208] winsys/radeon: add a specific error message for
 cs_submit -> -ENOMEM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 5fde875c34c..856a4ede8de 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -406,10 +406,14 @@ static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, ui
 void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
 {
     unsigned i;
+    int r;
 
-    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
-                            &csc->cs, sizeof(struct drm_radeon_cs))) {
-        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
+    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
+                            &csc->cs, sizeof(struct drm_radeon_cs));
+    if (r) {
+	if (r == -ENOMEM)
+	    fprintf(stderr, "radeon: Not enough memory for command submission.\n");
+	else if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
             unsigned i;
 
             fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");

From 42d283a0cc928a9e3ecddf1a90f9417ef1a34392 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 5 Aug 2015 21:05:52 +1000
Subject: [PATCH 1036/1208] glsl: remove stage ref generation for transform
 feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage ref cannot be queried for transform feedback.

Also simplify the build_stageref function by passing the
correct mode for uniforms.

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/linker.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index e2da0af2521..d7efea5e05d 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3136,7 +3136,7 @@ build_stageref(struct gl_shader_program *shProg, const char *name,
             /* Type needs to match if specified, otherwise we might
              * pick a variable with same name but different interface.
              */
-            if (mode != 0 && var->data.mode != mode)
+            if (var->data.mode != mode)
                continue;
 
             if (strncmp(var->name, name, baselen) == 0) {
@@ -3248,12 +3248,9 @@ build_program_resource_list(struct gl_context *ctx,
    /* Add transform feedback varyings. */
    if (shProg->LinkedTransformFeedback.NumVarying > 0) {
       for (int i = 0; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
-         uint8_t stageref =
-            build_stageref(shProg,
-                           shProg->LinkedTransformFeedback.Varyings[i].Name, 0);
          if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_VARYING,
                                    &shProg->LinkedTransformFeedback.Varyings[i],
-                                   stageref))
+                                   0))
          return;
       }
    }
@@ -3265,7 +3262,8 @@ build_program_resource_list(struct gl_context *ctx,
          continue;
 
       uint8_t stageref =
-         build_stageref(shProg, shProg->UniformStorage[i].name, 0);
+         build_stageref(shProg, shProg->UniformStorage[i].name,
+                        ir_var_uniform);
 
       /* Add stagereferences for uniforms in a uniform block. */
       int block_index = shProg->UniformStorage[i].block_index;

From f246aa6bcab57f85a143cbfe7e9de24237921249 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 31 Jul 2015 14:36:30 +0200
Subject: [PATCH 1037/1208] i965/vec4: do not predicate scratch writes for
 BRW_OPCODE_SEL instructions

The dst is always written, in this case the predicate is only used to select
the value to write, so if we are spilling the dst we always want to write
whatever value we selected to scratch.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index c5c0d2c73e0..ba352be3ad5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -3482,7 +3482,8 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
 				       inst->dst.writemask));
    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
-   write->predicate = inst->predicate;
+   if (inst->opcode != BRW_OPCODE_SEL)
+      write->predicate = inst->predicate;
    write->ir = inst->ir;
    write->annotation = inst->annotation;
    inst->insert_after(block, write);

From a0b7c1c86e028309e639368b2b556b755761f68f Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 4 Aug 2015 10:06:41 +0200
Subject: [PATCH 1038/1208] i965/vec4: Fix indentation in
 vec4_visitor::evaluate_spill_costs

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../dri/i965/brw_vec4_reg_allocate.cpp        | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index cd89edd64de..617c9889cad 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -280,15 +280,15 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
-	 if (inst->src[i].file == GRF) {
-	    spill_costs[inst->src[i].reg] += loop_scale;
+         if (inst->src[i].file == GRF) {
+            spill_costs[inst->src[i].reg] += loop_scale;
             if (inst->src[i].reladdr)
                no_spill[inst->src[i].reg] = true;
-	 }
+         }
       }
 
       if (inst->dst.file == GRF) {
-	 spill_costs[inst->dst.reg] += loop_scale;
+         spill_costs[inst->dst.reg] += loop_scale;
          if (inst->dst.reladdr)
             no_spill[inst->dst.reg] = true;
       }
@@ -296,12 +296,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
       switch (inst->opcode) {
 
       case BRW_OPCODE_DO:
-	 loop_scale *= 10;
-	 break;
+         loop_scale *= 10;
+         break;
 
       case BRW_OPCODE_WHILE:
-	 loop_scale /= 10;
-	 break;
+         loop_scale /= 10;
+         break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -309,12 +309,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
             if (inst->src[i].file == GRF)
                no_spill[inst->src[i].reg] = true;
          }
-	 if (inst->dst.file == GRF)
-	    no_spill[inst->dst.reg] = true;
-	 break;
+         if (inst->dst.file == GRF)
+            no_spill[inst->dst.reg] = true;
+         break;
 
       default:
-	 break;
+         break;
       }
    }
 }

From 0508861f29f2d3b79fb803353e4ea8ab32654bc4 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Tue, 23 Jun 2015 13:03:13 +0200
Subject: [PATCH 1039/1208] mesa: NULL check InfoLog

When a program is compiled, but linking failed the sh->InfoLog
could be NULL. This is expoloited by OpenGL ES 3.1 conformance tests.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/main/shaderapi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 5b28a2caf3c..f9a7d130f9c 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2033,8 +2033,8 @@ _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate,
 	    }
 #endif
 	 }
-
-	 ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
+         if (sh->InfoLog)
+            ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
       }
 
       delete_shader(ctx, shader);

From a97f1b697b01dca9f72d8559f8269188d76dccc9 Mon Sep 17 00:00:00 2001
From: "Serge Martin (EdB)" <edb+mesa@sigluy.net>
Date: Fri, 7 Aug 2015 10:40:31 +0200
Subject: [PATCH 1040/1208] clover: Stub missing CL 1.2 functions.

As sugested by Tom a long time ago
and in order to be able to create Piglit tests

v2:
replace NOT_SUPPORTED_BY_CL_1_1 macro with an inline function
remove extra space in clLinkProgram arg

v3:
use __func__

v4:
back to a macro, it make more sense to use it with __func__

[ Francisco Jerez: Rename to CLOVER_NOT_SUPPORTED_UNTIL and pass the
  minimum API version required by the entry point so the error
  messages don't become stale when support for additional CL versions
  is introduced. ]

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../state_trackers/clover/api/dispatch.cpp    | 10 +++----
 .../state_trackers/clover/api/kernel.cpp      |  8 ++++++
 .../state_trackers/clover/api/memory.cpp      | 26 ++++++++++++++++---
 .../state_trackers/clover/api/program.cpp     | 10 +++++++
 .../state_trackers/clover/api/transfer.cpp    | 12 +++++++++
 .../state_trackers/clover/api/util.hpp        |  7 +++++
 6 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/gallium/state_trackers/clover/api/dispatch.cpp b/src/gallium/state_trackers/clover/api/dispatch.cpp
index b5a4094cf2f..f10babe31a0 100644
--- a/src/gallium/state_trackers/clover/api/dispatch.cpp
+++ b/src/gallium/state_trackers/clover/api/dispatch.cpp
@@ -123,12 +123,12 @@ namespace clover {
       clCreateImage,
       clCreateProgramWithBuiltInKernels,
       clCompileProgram,
-      NULL, // clLinkProgram
+      clLinkProgram,
       clUnloadPlatformCompiler,
-      NULL, // clGetKernelArgInfo
-      NULL, // clEnqueueFillBuffer
-      NULL, // clEnqueueFillImage
-      NULL, // clEnqueueMigrateMemObjects
+      clGetKernelArgInfo,
+      clEnqueueFillBuffer,
+      clEnqueueFillImage,
+      clEnqueueMigrateMemObjects,
       clEnqueueMarkerWithWaitList,
       clEnqueueBarrierWithWaitList,
       NULL, // clGetExtensionFunctionAddressForPlatform
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index 857a152b554..73ba34abe8e 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -189,6 +189,14 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
    return CL_INVALID_DEVICE;
 }
 
+CLOVER_API cl_int
+clGetKernelArgInfo(cl_kernel d_kern,
+                   cl_uint idx, cl_kernel_arg_info param,
+                   size_t size, void *r_buf, size_t *r_size) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+}
+
 namespace {
    ///
    /// Common argument checking shared by kernel invocation commands.
diff --git a/src/gallium/state_trackers/clover/api/memory.cpp b/src/gallium/state_trackers/clover/api/memory.cpp
index 3ff6ba0e1c5..1efb95b5ce7 100644
--- a/src/gallium/state_trackers/clover/api/memory.cpp
+++ b/src/gallium/state_trackers/clover/api/memory.cpp
@@ -357,9 +357,29 @@ clCreateImage(cl_context d_ctx, cl_mem_flags flags,
               const cl_image_format *format,
               const cl_image_desc *image_desc,
               void *host_ptr, cl_int *r_errcode) {
-   // This function was added in OpenCL 1.2
-   std::cerr << "CL user error: clCreateImage() not supported by OpenCL 1.1." <<
-                std::endl;
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
    ret_error(r_errcode, CL_INVALID_OPERATION);
    return NULL;
 }
+
+CLOVER_API cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue, cl_mem buffer,
+                    const void *pattern, size_t pattern_size,
+                    size_t offset, size_t size,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
+
+CLOVER_API cl_int
+clEnqueueFillImage(cl_command_queue command_queue, cl_mem image,
+                   const void *fill_color,
+                   const size_t *origin, const size_t *region,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
diff --git a/src/gallium/state_trackers/clover/api/program.cpp b/src/gallium/state_trackers/clover/api/program.cpp
index e9b1f384344..27ca2efd0bc 100644
--- a/src/gallium/state_trackers/clover/api/program.cpp
+++ b/src/gallium/state_trackers/clover/api/program.cpp
@@ -231,6 +231,16 @@ clCompileProgram(cl_program d_prog, cl_uint num_devs,
    return e.get();
 }
 
+CLOVER_API cl_program
+clLinkProgram(cl_context d_ctx, cl_uint num_devs, const cl_device_id *d_devs,
+              const char *p_opts, cl_uint num_progs, const cl_program *d_progs,
+              void (*pfn_notify)(cl_program, void *), void *user_data,
+              cl_int *r_errcode) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   ret_error(r_errcode, CL_LINKER_NOT_AVAILABLE);
+   return NULL;
+}
+
 CLOVER_API cl_int
 clUnloadCompiler() {
    return CL_SUCCESS;
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index fdb9405c918..f7046253be8 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -726,3 +726,15 @@ clEnqueueUnmapMemObject(cl_command_queue d_q, cl_mem d_mem, void *ptr,
 } catch (error &e) {
    return e.get();
 }
+
+CLOVER_API cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                           cl_uint num_mem_objects,
+                           const cl_mem *mem_objects,
+                           cl_mem_migration_flags flags,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
diff --git a/src/gallium/state_trackers/clover/api/util.hpp b/src/gallium/state_trackers/clover/api/util.hpp
index 918df6125a4..31e20e424b9 100644
--- a/src/gallium/state_trackers/clover/api/util.hpp
+++ b/src/gallium/state_trackers/clover/api/util.hpp
@@ -38,6 +38,13 @@
 #define CLOVER_ICD_API PUBLIC
 #endif
 
+#define CLOVER_NOT_SUPPORTED_UNTIL(version)                    \
+   do {                                                        \
+      std::cerr << "CL user error: " << __func__               \
+                << "() requires OpenCL version " << (version)  \
+                << " or greater." << std::endl;                \
+   } while (0)
+
 namespace clover {
    ///
    /// Return an error code in \a p if non-zero.

From 4fa0cd17b77039ab67e81991002b4d5947298278 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 17 Jul 2015 12:41:24 +0100
Subject: [PATCH 1041/1208] vc4: automake: remove unused include

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/Makefile.am | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 774463138d0..0b8279d1763 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -31,7 +31,6 @@ AM_CFLAGS = \
 	$(LIBDRM_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(SIM_CFLAGS) \
-	-I$(top_srcdir)/src/mesa/ \
 	$()
 
 noinst_LTLIBRARIES = libvc4.la

From 75ce7919d6496981013a21a7055c668e47e7bed2 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 17 Jul 2015 12:52:27 +0100
Subject: [PATCH 1042/1208] vc4: add missing nir include, to fix the build

Cc: 10.6 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/Makefile.am | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 0b8279d1763..f4a57ba3404 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -28,6 +28,7 @@ SIM_LDFLAGS = -lsimpenrose
 endif
 
 AM_CFLAGS = \
+	-I$(top_builddir)/src/glsl/nir \
 	$(LIBDRM_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(SIM_CFLAGS) \

From 60e9c35b3a0384860ffcb01d902a69ee13254eb9 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:02 +0100
Subject: [PATCH 1043/1208] egl/x11: bail out if we cannot fetch the xcb
 connection

The documentation of xcb_connection_has_error() does not mention
what will happen, if NULL is fed to the function.

Upon closer look (props to Matt), it seems that we'll crash as the
implementation dereferences conn.

This will also allow us to remove the dri2_dpy->conn checking with the
next commit.

v2: Reword commit message as per Matt's findings.

Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index fecd36b3e7e..7a28318c008 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1222,7 +1222,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       dri2_dpy->screen = DefaultScreen(dpy);
    }
 
-   if (xcb_connection_has_error(dri2_dpy->conn)) {
+   if (!dri2_dpy->conn || xcb_connection_has_error(dri2_dpy->conn)) {
       _eglLog(_EGL_WARNING, "DRI2: xcb_connect failed");
       goto cleanup_dpy;
    }

From bf66988b08786c123804c2be8846a6a21cf200ad Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:03 +0100
Subject: [PATCH 1044/1208] egl/x11: remove dri2_dpy->conn checks

If the connection is NULL we won't be able to get here.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/drivers/dri2/platform_x11.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 7a28318c008..92e733a2330 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1227,10 +1227,8 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       goto cleanup_dpy;
    }
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_connect(dri2_dpy))
-	 goto cleanup_conn;
-   }
+   if (!dri2_x11_connect(dri2_dpy))
+      goto cleanup_conn;
 
    if (!dri2_load_driver(disp))
       goto cleanup_conn;
@@ -1243,10 +1241,8 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       goto cleanup_driver;
    }
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_local_authenticate(disp))
-	 goto cleanup_fd;
-   }
+   if (!dri2_x11_local_authenticate(disp))
+      goto cleanup_fd;
 
    if (dri2_dpy->dri2_minor >= 1) {
       dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
@@ -1285,10 +1281,8 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
 #endif
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
-	 goto cleanup_configs;
-   }
+   if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+      goto cleanup_configs;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.

From beddb0a2371059829b20240058931b8c9fd5be40 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:04 +0100
Subject: [PATCH 1045/1208] egl/x11: fetch the device_name prior to driver_name

With the follow up commits we're about to further reshuffle things. Thus
we'll honour our our driver_name lookup (src/loader), and use the one
provided by xserver as a fall-back.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/drivers/dri2/platform_x11.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 92e733a2330..2e970703774 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -579,17 +579,17 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
       return EGL_FALSE;
    }
 
-   driver_name = xcb_dri2_connect_driver_name (connect);
-   dri2_dpy->driver_name =
-      strndup(driver_name,
-              xcb_dri2_connect_driver_name_length(connect));
-
    device_name = xcb_dri2_connect_device_name (connect);
 
    dri2_dpy->device_name =
       strndup(device_name,
               xcb_dri2_connect_device_name_length(connect));
 
+   driver_name = xcb_dri2_connect_driver_name (connect);
+   dri2_dpy->driver_name =
+      strndup(driver_name,
+              xcb_dri2_connect_driver_name_length(connect));
+
    if (dri2_dpy->device_name == NULL || dri2_dpy->driver_name == NULL) {
       free(dri2_dpy->device_name);
       free(dri2_dpy->driver_name);

From faf0f811e3f9fb724a89c463c0cb6a0d61715f95 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:05 +0100
Subject: [PATCH 1046/1208] egl/x11: open the device from within
 dri2_x11_connect()

Allows us, with the next commit, to use alternative driver_name rather
than the one from xserver.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/drivers/dri2/platform_x11.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 2e970703774..745e16a6cc1 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -585,12 +585,23 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
       strndup(device_name,
               xcb_dri2_connect_device_name_length(connect));
 
+   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
+   if (dri2_dpy->fd == -1) {
+      _eglLog(_EGL_WARNING,
+              "DRI2: could not open %s (%s)", dri2_dpy->device_name,
+              strerror(errno));
+      free(dri2_dpy->device_name);
+      free(connect);
+      return EGL_FALSE;
+   }
+
    driver_name = xcb_dri2_connect_driver_name (connect);
    dri2_dpy->driver_name =
       strndup(driver_name,
               xcb_dri2_connect_driver_name_length(connect));
 
    if (dri2_dpy->device_name == NULL || dri2_dpy->driver_name == NULL) {
+      close(dri2_dpy->fd);
       free(dri2_dpy->device_name);
       free(dri2_dpy->driver_name);
       free(connect);
@@ -1231,18 +1242,10 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       goto cleanup_conn;
 
    if (!dri2_load_driver(disp))
-      goto cleanup_conn;
-
-   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
-   if (dri2_dpy->fd == -1) {
-      _eglLog(_EGL_WARNING,
-	      "DRI2: could not open %s (%s)", dri2_dpy->device_name,
-              strerror(errno));
-      goto cleanup_driver;
-   }
+      goto cleanup_fd;
 
    if (!dri2_x11_local_authenticate(disp))
-      goto cleanup_fd;
+      goto cleanup_driver;
 
    if (dri2_dpy->dri2_minor >= 1) {
       dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
@@ -1267,7 +1270,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    dri2_dpy->invalidate_available = (dri2_dpy->dri2_minor >= 3);
 
    if (!dri2_create_screen(disp))
-      goto cleanup_fd;
+      goto cleanup_driver;
 
    dri2_x11_setup_swap_interval(dri2_dpy);
 
@@ -1294,10 +1297,10 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
  cleanup_configs:
    _eglCleanupDisplay(disp);
    dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_fd:
-   close(dri2_dpy->fd);
  cleanup_driver:
    dlclose(dri2_dpy->driver);
+ cleanup_fd:
+   close(dri2_dpy->fd);
  cleanup_conn:
    if (disp->PlatformDisplay == NULL)
       xcb_disconnect(dri2_dpy->conn);

From 45e110bad9d5d31eb67d7d32937aa5a752108df8 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:06 +0100
Subject: [PATCH 1047/1208] egl/x11: trust our loader over the xserver for the
 drivername

This is a port of commit 7bd95ec437a(dri2: Trust our own driver name
lookup over the server's.) from glx/dri2.

v2: Add newline between code and multiline comment. (Matt)

Cc: Julien Isorce <julien.isorce@gmail.com>
Reported-by: Julien Isorce <julien.isorce@gmail.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/drivers/dri2/platform_x11.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 745e16a6cc1..a251694b873 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -519,7 +519,7 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
    xcb_generic_error_t *error;
    xcb_screen_iterator_t s;
    xcb_screen_t *screen;
-   char *driver_name, *device_name;
+   char *driver_name, *loader_driver_name, *device_name;
    const xcb_query_extension_reply_t *extension;
 
    xcb_prefetch_extension_data (dri2_dpy->conn, &xcb_xfixes_id);
@@ -596,6 +596,16 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
    }
 
    driver_name = xcb_dri2_connect_driver_name (connect);
+
+   /* If Mesa knows about the appropriate driver for this fd, then trust it.
+    * Otherwise, default to the server's value.
+    */
+   loader_driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
+   if (loader_driver_name) {
+      free(driver_name);
+      driver_name = loader_driver_name;
+   }
+
    dri2_dpy->driver_name =
       strndup(driver_name,
               xcb_dri2_connect_driver_name_length(connect));

From 2c7b6cf512a775a37677b1e467d2af952c449dae Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Wed, 29 Jul 2015 17:19:07 +0100
Subject: [PATCH 1048/1208] egl/x11: auth with xserver before attempting to
 open the dri module

No real change, apart from keeping the calls to the underlying winsys
(x11) next to each other. Just like platform_wayland.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/egl/drivers/dri2/platform_x11.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index a251694b873..df08a8a8303 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1251,11 +1251,11 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    if (!dri2_x11_connect(dri2_dpy))
       goto cleanup_conn;
 
-   if (!dri2_load_driver(disp))
+   if (!dri2_x11_local_authenticate(disp))
       goto cleanup_fd;
 
-   if (!dri2_x11_local_authenticate(disp))
-      goto cleanup_driver;
+   if (!dri2_load_driver(disp))
+      goto cleanup_fd;
 
    if (dri2_dpy->dri2_minor >= 1) {
       dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;

From 6de9a03bed400fca5672ef0c13c0039bbe94a679 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Fri, 7 Aug 2015 19:20:48 +0100
Subject: [PATCH 1049/1208] egl/x11: don't crash if dri2_dpy->conn is NULL

Identical to commit 60e9c35b3a0(egl/x11: bail out if we cannot fetch
the xcb connection) but for the swrast codepath.

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index df08a8a8303..61e5b77ac78 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1124,7 +1124,7 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
       dri2_dpy->screen = DefaultScreen(dpy);
    }
 
-   if (xcb_connection_has_error(dri2_dpy->conn)) {
+   if (!dri2_dpy->conn || xcb_connection_has_error(dri2_dpy->conn)) {
       _eglLog(_EGL_WARNING, "DRI2: xcb_connect failed");
       goto cleanup_dpy;
    }
@@ -1150,10 +1150,8 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
    if (!dri2_create_screen(disp))
       goto cleanup_driver;
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
-         goto cleanup_configs;
-   }
+   if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+      goto cleanup_configs;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.

From ba651967a201b48f380cd30495e271317c1d8522 Mon Sep 17 00:00:00 2001
From: Alexander von Gluck IV <kallisti5@unixzen.com>
Date: Fri, 7 Aug 2015 12:55:40 -0500
Subject: [PATCH 1050/1208] egl/dri2: Fix include path of u_atomic.h introduced
 e7e29189

This was causing a failure to build on SCons due to a missing
-Isrc/egl. Instead of adding in that path, lets just -Isrc/
and include "utils/u_atomic.h".

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/egl/Makefile.am             | 1 +
 src/egl/SConscript              | 1 +
 src/egl/drivers/dri2/egl_dri2.c | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index be7bfe9c3ab..5c2ba301ffb 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -25,6 +25,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/egl/main \
 	-I$(top_srcdir)/src/gbm/main \
+	-I$(top_srcdir)/src \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
diff --git a/src/egl/SConscript b/src/egl/SConscript
index a7f62824e14..1b2a4271ef7 100644
--- a/src/egl/SConscript
+++ b/src/egl/SConscript
@@ -9,6 +9,7 @@ env = env.Clone()
 env.Append(CPPPATH = [
     '#/include',
     '#/src/egl/main',
+    '#/src',
 ])
 
 
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 0290c077ef6..461735fe9e3 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -52,7 +52,7 @@
 #endif
 
 #include "egl_dri2.h"
-#include "../util/u_atomic.h"
+#include "util/u_atomic.h"
 
 /* The kernel header drm_fourcc.h defines the DRM formats below.  We duplicate
  * some of the definitions here so that building Mesa won't bleeding-edge

From ffadfbf5d076638fa4022106cfe989bc5a145f20 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 5 Aug 2015 13:58:46 +0100
Subject: [PATCH 1051/1208] i965: Fix HW binding tables editing

Since the introduction of new gl_shader_stages in

commit a2af956963b6bc4d29f37485e44c98008d2ef077
Author: Fabian Bieler <fabianbieler@fastmail.fm>
Date:   Fri Mar 7 10:19:09 2014 +0100

    mesa: add tessellation shader enums

the translation table for the stage into the HW binding table edit
command was broken, and so we used illegal commands. Fix the array
initialisation to be impervious to changes in the gl_shader_stages enum
and add the asserts that would have caught the issue earlier.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Cc: Jordan Justen <jordan.l.justen@intel.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_binding_tables.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 8d3697abcd5..b188fc7de57 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -44,10 +44,10 @@
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 
-static const GLuint stage_to_bt_edit[MESA_SHADER_FRAGMENT + 1] = {
-   _3DSTATE_BINDING_TABLE_EDIT_VS,
-   _3DSTATE_BINDING_TABLE_EDIT_GS,
-   _3DSTATE_BINDING_TABLE_EDIT_PS,
+static const GLuint stage_to_bt_edit[] = {
+   [MESA_SHADER_VERTEX] = _3DSTATE_BINDING_TABLE_EDIT_VS,
+   [MESA_SHADER_GEOMETRY] = _3DSTATE_BINDING_TABLE_EDIT_GS,
+   [MESA_SHADER_FRAGMENT] = _3DSTATE_BINDING_TABLE_EDIT_PS,
 };
 
 static uint32_t
@@ -233,7 +233,8 @@ gen7_edit_hw_binding_table_entry(struct brw_context *brw,
                                  uint32_t index,
                                  uint32_t surf_offset)
 {
-   assert(stage <= MESA_SHADER_FRAGMENT);
+   assert(stage < ARRAY_SIZE(stage_to_bt_edit));
+   assert(stage_to_bt_edit[stage]);
 
    uint32_t dw2 = SET_FIELD(index, BRW_BINDING_TABLE_INDEX) |
       (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(surf_offset) :
@@ -259,7 +260,9 @@ gen7_update_binding_table_from_array(struct brw_context *brw,
                                      int num_surfaces)
 {
    uint32_t dw2 = 0;
-   assert(stage <= MESA_SHADER_FRAGMENT);
+
+   assert(stage < ARRAY_SIZE(stage_to_bt_edit));
+   assert(stage_to_bt_edit[stage]);
 
    BEGIN_BATCH(num_surfaces + 2);
    OUT_BATCH(stage_to_bt_edit[stage] << 16 | num_surfaces);

From a1adf0b3fe428a4bf690f166c2697d8c7ea2dcb0 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Fri, 7 Aug 2015 13:46:30 -0700
Subject: [PATCH 1052/1208] i965/skl: (trivial) Remove invalid comment about
 thread counts

This should have been a part of:
commit 7eaacc1678195738fab3bb98870828611cae066d
Author: Ben Widawsky <benjamin.widawsky@intel.com>
Date:   Wed Jul 29 12:35:24 2015 -0700

    i965/skl: Add production thread counts and URB size

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index fcc243e1d4c..be517e826c4 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -299,7 +299,6 @@ static const struct brw_device_info brw_device_info_chv = {
    }
 };
 
-/* Thread counts and URB limits are placeholders, and may not be accurate. */
 #define GEN9_FEATURES                               \
    .gen = 9,                                        \
    .has_hiz_and_separate_stencil = true,            \

From 27141f984d6401dc466f0e9b0c5da2a9248045e3 Mon Sep 17 00:00:00 2001
From: Boyan Ding <boyan.j.ding@gmail.com>
Date: Sat, 8 Aug 2015 17:23:28 +0800
Subject: [PATCH 1053/1208] egl/x11: Fix driver_name acquisition

We don't need to free driverName string from dri2 reply, on the other
hand, the driver name acquired from loader doesn't need duplication.

Fixes: 45e110bad9d (egl/x11: trust our loader over the xserver for the
drivername)

Reported-by: Timothy Arceri <t_arceri@yahoo.com.au>
Signed-off-by: Boyan Ding <boyan.j.ding@gmail.com>
[Emil Velikov: use brackets for both branches of conditional]
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 61e5b77ac78..9b040f7d491 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -602,14 +602,13 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
     */
    loader_driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
    if (loader_driver_name) {
-      free(driver_name);
-      driver_name = loader_driver_name;
+      dri2_dpy->driver_name = loader_driver_name;
+   } else {
+      dri2_dpy->driver_name =
+         strndup(driver_name,
+                 xcb_dri2_connect_driver_name_length(connect));
    }
 
-   dri2_dpy->driver_name =
-      strndup(driver_name,
-              xcb_dri2_connect_driver_name_length(connect));
-
    if (dri2_dpy->device_name == NULL || dri2_dpy->driver_name == NULL) {
       close(dri2_dpy->fd);
       free(dri2_dpy->device_name);

From 512aa0647f328fff69b3ce328b6466f2da8b7c4d Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Fri, 7 Aug 2015 13:07:40 +0100
Subject: [PATCH 1054/1208] util: Rename PURE to ATTRIBUTE_PURE.

To avoid collission with windows.h's PURE macro.

We could consider eventually renaming to __pure, but that would require
further care, so it's left to the future.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/util/macros.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/util/macros.h b/src/util/macros.h
index 5c5c92ec610..84e4f182bcf 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -145,9 +145,9 @@ do {                       \
  * return value.  As a result, calls to it can be dead code eliminated.
  */
 #ifdef HAVE_FUNC_ATTRIBUTE_PURE
-#define PURE __attribute__((__pure__))
+#define ATTRIBUTE_PURE __attribute__((__pure__))
 #else
-#define PURE
+#define ATTRIBUTE_PURE
 #endif
 
 #ifdef __cplusplus

From eb643db30e1bdf5171d0a012674016c317925b6e Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Sun, 9 Aug 2015 11:21:03 +0100
Subject: [PATCH 1055/1208] gallium: GCC 4.9 allows to include tmmintrin.h
 without -msse3.

Fixes build with MinGW x86_64 build with GCC 4.9, due to conflicting
definition _mm_shuffle_epi8 of u_sse.h and system headers.

Trivial.
---
 src/gallium/include/pipe/p_config.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index 794aabe85f2..ac14f86fdc4 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -100,8 +100,8 @@
 #else
 #define PIPE_ARCH_SSE
 #endif
-#if defined(PIPE_CC_GCC) && !defined(__SSSE3__)
-/* #warning SSE3 support requires -msse3 compiler options */
+#if defined(PIPE_CC_GCC) && (__GNUC__ * 100 + __GNUC_MINOR__) < 409 && !defined(__SSSE3__)
+/* #warning SSE3 support requires -msse3 compiler options before GCC 4.9 */
 #else
 #define PIPE_ARCH_SSSE3
 #endif

From 21ccdbdb5dd87b2ee66c4e78b011ec4df29efb98 Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Sun, 9 Aug 2015 11:25:41 +0100
Subject: [PATCH 1056/1208] util: Cope with LONG_BIT not being defined on
 Windows.

Neither MSVC nor MinGW defines LONG_BIT.  For MSVC this was not a problem as
it doesn't define __x86_64__ macro (it's GCC specific.)

However on Windows long type is guaranteed to be 32bits.

Also add an #error, as GCC will just warn, not throw any error, when no
value is returned.

Trivial.
---
 src/util/rounding.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/util/rounding.h b/src/util/rounding.h
index b0c9918fd6e..ec31b47264e 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -96,8 +96,10 @@ _mesa_lroundevenf(float x)
 #ifdef __x86_64__
 #if LONG_BIT == 64
    return _mm_cvtss_si64(_mm_load_ss(&x));
-#elif LONG_BIT == 32
+#elif LONG_BIT == 32 || defined(_WIN32)
    return _mm_cvtss_si32(_mm_load_ss(&x));
+#else
+#error "Unsupported or undefined LONG_BIT"
 #endif
 #else
    return lrintf(x);
@@ -114,8 +116,10 @@ _mesa_lroundeven(double x)
 #ifdef __x86_64__
 #if LONG_BIT == 64
    return _mm_cvtsd_si64(_mm_load_sd(&x));
-#elif LONG_BIT == 32
+#elif LONG_BIT == 32 || defined(_WIN32)
    return _mm_cvtsd_si32(_mm_load_sd(&x));
+#else
+#error "Unsupported or undefined LONG_BIT"
 #endif
 #else
    return lrint(x);

From 497a22a727d3606c7327eb72efbf0d2c03607f0a Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Sun, 9 Aug 2015 11:55:28 +0100
Subject: [PATCH 1057/1208] scons: Build roundevent_test.

Reviewed-by: Roland Scheidegger <sroland@vmware.co>
---
 src/util/SConscript | 7 +++++++
 src/util/rounding.h | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/src/util/SConscript b/src/util/SConscript
index 9e4d481f838..3dbe70a2e8a 100644
--- a/src/util/SConscript
+++ b/src/util/SConscript
@@ -54,3 +54,10 @@ u_atomic_test = env.Program(
 )
 alias = env.Alias("u_atomic_test", u_atomic_test, u_atomic_test[0].abspath)
 AlwaysBuild(alias)
+
+roundeven_test = env.Program(
+    target = 'roundeven_test',
+    source = ['roundeven_test.c'],
+)
+alias = env.Alias("roundeven_test", roundeven_test, roundeven_test[0].abspath)
+AlwaysBuild(alias)
diff --git a/src/util/rounding.h b/src/util/rounding.h
index ec31b47264e..868e1b434db 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -24,6 +24,8 @@
 #ifndef _ROUNDING_H
 #define _ROUNDING_H
 
+#include "c99_compat.h" // inline
+
 #include <math.h>
 #include <limits.h>
 

From 1eaa29cb300e927409281ef0a9413072766eaa3d Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Sun, 9 Aug 2015 22:36:37 +0100
Subject: [PATCH 1058/1208] util: Use LONG_MAX instead of LONG_BIT.

More portable.  Based on Roland Scheidegger's idea.

Tested with roundevent_test on Linux, MinGW, and MSVC.

https://bugs.freedesktop.org/show_bug.cgi?id=91591

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/util/rounding.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/util/rounding.h b/src/util/rounding.h
index 868e1b434db..7b5608b8a78 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -28,6 +28,7 @@
 
 #include <math.h>
 #include <limits.h>
+#include <stdint.h>
 
 #ifdef __x86_64__
 #include <xmmintrin.h>
@@ -96,12 +97,12 @@ static inline long
 _mesa_lroundevenf(float x)
 {
 #ifdef __x86_64__
-#if LONG_BIT == 64
+#if LONG_MAX == INT64_MAX
    return _mm_cvtss_si64(_mm_load_ss(&x));
-#elif LONG_BIT == 32 || defined(_WIN32)
+#elif LONG_MAX == INT32_MAX
    return _mm_cvtss_si32(_mm_load_ss(&x));
 #else
-#error "Unsupported or undefined LONG_BIT"
+#error "Unsupported long size"
 #endif
 #else
    return lrintf(x);
@@ -116,12 +117,12 @@ static inline long
 _mesa_lroundeven(double x)
 {
 #ifdef __x86_64__
-#if LONG_BIT == 64
+#if LONG_MAX == INT64_MAX
    return _mm_cvtsd_si64(_mm_load_sd(&x));
-#elif LONG_BIT == 32 || defined(_WIN32)
+#elif LONG_MAX == INT32_MAX
    return _mm_cvtsd_si32(_mm_load_sd(&x));
 #else
-#error "Unsupported or undefined LONG_BIT"
+#error "Unsupported long size"
 #endif
 #else
    return lrint(x);

From 2ac171a7db4e4ad2fa902e62bf18bc1f67e91643 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Tue, 4 Aug 2015 21:39:32 +0300
Subject: [PATCH 1059/1208] mesa: clear existing swizzle info before bitwise-OR

This patch fixes a bug in big-endian treatment, where the previous
swizzle info wasn't cleared before a new swizzle info was inserted into
the format field using a bitwise-OR operation.

v2: use MESA_ARRAY_FORMAT_SWIZZLE_*_MASK instead of numeric constants
v3: align according to coding style

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
CC: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/main/formats.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index 7e451caf0ff..d938e6ad513 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -191,6 +191,11 @@ static inline void
 _mesa_array_format_set_swizzle(mesa_array_format *f,
                                int32_t x, int32_t y, int32_t z, int32_t w)
 {
+   *f &= ~(MESA_ARRAY_FORMAT_SWIZZLE_X_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_W_MASK);
+
    *f |= ((x << 8 ) & MESA_ARRAY_FORMAT_SWIZZLE_X_MASK) |
          ((y << 11) & MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK) |
          ((z << 14) & MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK) |

From b6d014f0ba010f0e61be43abdceb5f2201028a04 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 15 Jun 2015 13:50:21 +0200
Subject: [PATCH 1060/1208] mesa/es3.1: Pass sample count check for
 multisampled textures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v3 : Removed space in comment.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/multisample.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 490bad507c3..09e6154f7ec 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -164,8 +164,11 @@ _mesa_check_sample_count(struct gl_context *ctx, GLenum target,
     *
     *     "If internalformat is a signed or unsigned integer format and samples
     *     is greater than zero, then the error INVALID_OPERATION is generated."
+    *
+    * This restriction is relaxed for OpenGL ES 3.1.
     */
-   if (_mesa_is_gles3(ctx) && _mesa_is_enum_format_integer(internalFormat)
+   if ((ctx->API == API_OPENGLES2 && ctx->Version == 30) &&
+       _mesa_is_enum_format_integer(internalFormat)
        && samples > 0) {
       return GL_INVALID_OPERATION;
    }

From 08f2dfe3430789085c165ce7c546d5afd2e295c2 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Mon, 10 Aug 2015 13:48:11 +0300
Subject: [PATCH 1061/1208] mesa/es3.1: Allow Multisampled FrameBufferTextures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GLES 3.1 must be allowed to use multisampled framebuffer textures.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/fbobject.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index cc342c23c03..841834030df 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2944,8 +2944,9 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target,
          break;
       case GL_TEXTURE_2D_MULTISAMPLE:
       case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-         err = _mesa_is_gles(ctx)
-               || !ctx->Extensions.ARB_texture_multisample;
+         err = (_mesa_is_gles(ctx) ||
+                !ctx->Extensions.ARB_texture_multisample) &&
+               !_mesa_is_gles31(ctx);
          break;
       default:
          err = true;

From 6dabf455970f3a1fdbf384a53621ebe2bcd7545e Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 9 Aug 2015 08:38:25 -0400
Subject: [PATCH 1062/1208] freedreno/a3xx: clear cached fp when switching blit
 prog

For gmem restore (mem2gmem), we swap blit programs, in order to have a
different frag shader for depth vs color restore.  But we weren't
actually clearing the cached fp, so it would not actually change the
frag shader as expected.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 302452657b6..9a5b45e2fcb 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -651,6 +651,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}
@@ -671,6 +672,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 				emit.prog = &ctx->blit_zs;
 			emit.key.half_precision = false;
 		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, 1, &pfb->zsbuf);
 		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
 	}

From d2f669e6c72a16dede22f107c3b015ec0516bc56 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 07:11:56 -0400
Subject: [PATCH 1063/1208] freedreno/a4xx: clear cached fp when switching blit
 prog

For gmem restore (mem2gmem), we swap blit programs, in order to have a
different frag shader for depth vs color restore.  But we weren't
actually clearing the cached fp, so it would not actually change the
frag shader as expected.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_gmem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index fce5d7a3930..890e71ead2c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -396,12 +396,14 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
 		emit.prog = &ctx->blit_prog[0];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd4_program_emit(ring, &emit, 1, &pfb->zsbuf);
 		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
 	}
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}

From 2d6a889e8b786cd76d6711627c10be50615c2b62 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 9 Aug 2015 09:03:25 -0400
Subject: [PATCH 1064/1208] freedreno/a4xx: fix vpsrepl for blit shaders

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a4xx/fd4_program.c      | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index c7a6dffea7b..fdfdee55123 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -537,16 +537,25 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 
 /* hack.. until we figure out how to deal w/ vpsrepl properly.. */
 static void
-fix_blit_fp(struct pipe_context *pctx)
+fix_blit_fp(struct fd4_shader_stateobj *so)
 {
-	struct fd_context *ctx = fd_context(pctx);
-	struct fd4_shader_stateobj *so = ctx->blit_prog[0].fp;
-
 	so->shader->vpsrepl[0] = 0x99999999;
 	so->shader->vpsrepl[1] = 0x99999999;
 	so->shader->vpsrepl[2] = 0x99999999;
 	so->shader->vpsrepl[3] = 0x99999999;
 }
+static void
+fix_blit_fps(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	int i;
+
+	for (i = 0; i < ctx->screen->max_rts; i++)
+		fix_blit_fp(ctx->blit_prog[i].fp);
+
+	fix_blit_fp(ctx->blit_z.fp);
+	fix_blit_fp(ctx->blit_zs.fp);
+}
 
 void
 fd4_prog_init(struct pipe_context *pctx)
@@ -559,5 +568,5 @@ fd4_prog_init(struct pipe_context *pctx)
 
 	fd_prog_init(pctx);
 
-	fix_blit_fp(pctx);
+	fix_blit_fps(pctx);
 }

From fcb8a04c9ddcb46b7b8cca21e1203674ec04dde2 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 5 Aug 2015 14:21:06 -0400
Subject: [PATCH 1065/1208] freedreno: update generated headers

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a2xx/a2xx.xml.h |   2 +-
 src/gallium/drivers/freedreno/a3xx/a3xx.xml.h |   2 +-
 src/gallium/drivers/freedreno/a4xx/a4xx.xml.h | 180 +++++++++++++++++-
 .../drivers/freedreno/adreno_common.xml.h     |   2 +-
 .../drivers/freedreno/adreno_pm4.xml.h        |   2 +-
 5 files changed, 183 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index a4306314bfe..c4516baf2ec 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -14,7 +14,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index e99e317b1cc..8e8cf6a03f2 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -14,7 +14,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index b53badc0bdf..563f70ac5eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -14,7 +14,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -227,6 +227,7 @@ enum a4xx_depth_format {
 	DEPTH4_NONE = 0,
 	DEPTH4_16 = 1,
 	DEPTH4_24_8 = 2,
+	DEPTH4_32 = 3,
 };
 
 enum a4xx_tess_spacing {
@@ -820,6 +821,23 @@ static inline uint32_t A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(enum adreno_stencil_op v
 #define REG_A4XX_RB_STENCIL_CONTROL2				0x00002107
 #define A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER			0x00000001
 
+#define REG_A4XX_RB_STENCIL_INFO				0x00002108
+#define A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL			0x00000001
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK			0xfffff000
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT		12
+static inline uint32_t A4XX_RB_STENCIL_INFO_STENCIL_BASE(uint32_t val)
+{
+	return ((val >> 12) << A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT) & A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK;
+}
+
+#define REG_A4XX_RB_STENCIL_PITCH				0x00002109
+#define A4XX_RB_STENCIL_PITCH__MASK				0xffffffff
+#define A4XX_RB_STENCIL_PITCH__SHIFT				0
+static inline uint32_t A4XX_RB_STENCIL_PITCH(uint32_t val)
+{
+	return ((val >> 5) << A4XX_RB_STENCIL_PITCH__SHIFT) & A4XX_RB_STENCIL_PITCH__MASK;
+}
+
 #define REG_A4XX_RB_STENCILREFMASK				0x0000210b
 #define A4XX_RB_STENCILREFMASK_STENCILREF__MASK			0x000000ff
 #define A4XX_RB_STENCILREFMASK_STENCILREF__SHIFT		0
@@ -1479,6 +1497,76 @@ static inline uint32_t A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_HS_LENGTH_REG				0x00002312
 
+#define REG_A4XX_SP_DS_PARAM_REG				0x0000231a
+#define A4XX_SP_DS_PARAM_REG_POSREGID__MASK			0x000000ff
+#define A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT			0
+static inline uint32_t A4XX_SP_DS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_DS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK		0xfff00000
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT		20
+static inline uint32_t A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_OUT(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_OUT_REG(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+#define A4XX_SP_DS_OUT_REG_A_REGID__MASK			0x000001ff
+#define A4XX_SP_DS_OUT_REG_A_REGID__SHIFT			0
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK			0x00001e00
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT			9
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_REGID__MASK			0x01ff0000
+#define A4XX_SP_DS_OUT_REG_B_REGID__SHIFT			16
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK			0x1e000000
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT			25
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST_REG(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK			0x000000ff
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT			0
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK			0x0000ff00
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT			8
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK			0x00ff0000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT			16
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK			0xff000000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT			24
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_DS_OBJ_OFFSET_REG				0x00002334
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1501,6 +1589,82 @@ static inline uint32_t A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_DS_LENGTH_REG				0x00002339
 
+#define REG_A4XX_SP_GS_PARAM_REG				0x00002341
+#define A4XX_SP_GS_PARAM_REG_POSREGID__MASK			0x000000ff
+#define A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT			0
+static inline uint32_t A4XX_SP_GS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK			0x0000ff00
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT			8
+static inline uint32_t A4XX_SP_GS_PARAM_REG_PRIMREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK		0xfff00000
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT		20
+static inline uint32_t A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_OUT(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_OUT_REG(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+#define A4XX_SP_GS_OUT_REG_A_REGID__MASK			0x000001ff
+#define A4XX_SP_GS_OUT_REG_A_REGID__SHIFT			0
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK			0x00001e00
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT			9
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_REGID__MASK			0x01ff0000
+#define A4XX_SP_GS_OUT_REG_B_REGID__SHIFT			16
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK			0x1e000000
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT			25
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST_REG(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK			0x000000ff
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT			0
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK			0x0000ff00
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT			8
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK			0x00ff0000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT			16
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK			0xff000000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT			24
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_GS_OBJ_OFFSET_REG				0x0000235b
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1702,6 +1866,18 @@ static inline uint32_t A4XX_VFD_CONTROL_3_REGID_VTXCNT(uint32_t val)
 {
 	return ((val) << A4XX_VFD_CONTROL_3_REGID_VTXCNT__SHIFT) & A4XX_VFD_CONTROL_3_REGID_VTXCNT__MASK;
 }
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__MASK			0x00ff0000
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT			16
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSX(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSX__MASK;
+}
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__MASK			0xff000000
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT			24
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSY(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSY__MASK;
+}
 
 #define REG_A4XX_VFD_CONTROL_4					0x00002204
 
@@ -2498,6 +2674,8 @@ static inline uint32_t A4XX_UNKNOWN_20F7(float val)
 
 #define REG_A4XX_UNKNOWN_22D7					0x000022d7
 
+#define REG_A4XX_UNKNOWN_2352					0x00002352
+
 #define REG_A4XX_TEX_SAMP_0					0x00000000
 #define A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR			0x00000001
 #define A4XX_TEX_SAMP_0_XY_MAG__MASK				0x00000006
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index c45b70399b2..00b6acba065 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -14,7 +14,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 7c2451b2e71..98a90e26679 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -14,7 +14,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  60800 bytes, from 2015-07-10 14:00:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)

From 7bfe8cf4a487aec4870df23f6f72c828f1caaa49 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 5 Aug 2015 18:14:49 -0400
Subject: [PATCH 1066/1208] freedreno/a4xx: add s8/z32/z32_s8x24 support

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c |  41 ++++--
 .../drivers/freedreno/a4xx/fd4_format.c       |  19 ++-
 src/gallium/drivers/freedreno/a4xx/fd4_gmem.c | 126 ++++++++++++++----
 .../drivers/freedreno/freedreno_util.h        |   2 +-
 4 files changed, 151 insertions(+), 37 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index e7c210d3437..c3226d5121f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -200,23 +200,13 @@ void
 fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		struct pipe_surface **bufs)
 {
-	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS];
 	int i;
 
 	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		mrt_comp[i] = (i < nr_bufs) ? 0xf : 0;
 	}
 
-	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
-	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
-			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
-			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
-			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
-			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
-			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
-			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
-			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
-
 	/* output sampler state: */
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
@@ -250,6 +240,25 @@ fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer);
 			enum pipe_format format = fd4_gmem_restore_format(bufs[i]->format);
 
+			/* The restore blit_zs shader expects stencil in sampler 0,
+			 * and depth in sampler 1
+			 */
+			if (rsc->stencil && (i == 0)) {
+				rsc = rsc->stencil;
+				format = fd4_gmem_restore_format(rsc->base.b.format);
+			}
+
+			/* z32 restore is accomplished using depth write.  If there is
+			 * no stencil component (ie. PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+			 * then no render target:
+			 *
+			 * (The same applies for z32_s8x24, since for stencil sampler
+			 * state the above 'if' will replace 'format' with s8)
+			 */
+			if ((format == PIPE_FORMAT_Z32_FLOAT) ||
+					(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT))
+				mrt_comp[i] = 0;
+
 			debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer);
 
 			OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) |
@@ -281,6 +290,16 @@ fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			OUT_RING(ring, 0x00000000);
 		}
 	}
+
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 29abe0b0cc3..f906a6781b5 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -89,6 +89,14 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	_T(L8_UNORM,   8_UNORM, R8_UNORM, WZYX),
 	_T(I8_UNORM,   8_UNORM, NONE,     WZYX),
 
+	/* NOTE: should be TFMT_8_UINT (which then gets remapped to
+	 * TFMT_8_UNORM for mem2gmem in _gmem_restore_format()), but
+	 * we don't know TFMT_8_UINT yet.. so just use TFMT_8_UNORM
+	 * for now.. sampling from stencil as a texture might not
+	 * work right, but at least should be fine for zsbuf..
+	 */
+	_T(S8_UINT,    8_UNORM,  R8_UNORM, WZYX),
+
 	/* 16-bit */
 	V_(R16_UNORM,   16_UNORM, NONE,     WZYX),
 	V_(R16_SNORM,   16_SNORM, NONE,     WZYX),
@@ -191,7 +199,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 
 	_T(Z24X8_UNORM,       X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
 	_T(Z24_UNORM_S8_UINT, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
-	/*_T(Z32_FLOAT,         Z32_FLOAT,   R8G8B8A8_UNORM, WZYX),*/
+	_T(Z32_FLOAT,         32_FLOAT,   R8G8B8A8_UNORM, WZYX),
+	_T(Z32_FLOAT_S8X24_UINT, 32_FLOAT,R8G8B8A8_UNORM, WZYX),
 
 	/* 48-bit */
 	V_(R16G16B16_UNORM,   16_16_16_UNORM, NONE, WZYX),
@@ -282,6 +291,9 @@ fd4_pipe2swap(enum pipe_format format)
 enum a4xx_tex_fetchsize
 fd4_pipe2fetchsize(enum pipe_format format)
 {
+	if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+		format = PIPE_FORMAT_Z32_FLOAT;
+
 	switch (util_format_get_blocksizebits(format)) {
 	case 8:   return TFETCH4_1_BYTE;
 	case 16:  return TFETCH4_2_BYTE;
@@ -312,6 +324,8 @@ fd4_gmem_restore_format(enum pipe_format format)
 		return PIPE_FORMAT_R8G8B8A8_UNORM;
 	case PIPE_FORMAT_Z16_UNORM:
 		return PIPE_FORMAT_R8G8_UNORM;
+	case PIPE_FORMAT_S8_UINT:
+		return PIPE_FORMAT_R8_UNORM;
 	default:
 		return format;
 	}
@@ -328,6 +342,9 @@ fd4_pipe2depth(enum pipe_format format)
 	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 		return DEPTH4_24_8;
+	case PIPE_FORMAT_Z32_FLOAT:
+	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+		return DEPTH4_32;
 	default:
 		return ~0;
 	}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 890e71ead2c..6541da54161 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -68,11 +68,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
 		if ((i < nr_bufs) && bufs[i]) {
 			struct pipe_surface *psurf = bufs[i];
+			enum pipe_format pformat = 0;
 
 			rsc = fd_resource(psurf->texture);
+			pformat = psurf->format;
+
+			/* In case we're drawing to Z32F_S8, the "color" actually goes to
+			 * the stencil
+			 */
+			if (rsc->stencil) {
+				rsc = rsc->stencil;
+				pformat = rsc->base.b.format;
+				bases++;
+			}
+
 			slice = fd_resource_slice(rsc, psurf->u.tex.level);
-			format = fd4_pipe2color(psurf->format);
-			swap = fd4_pipe2swap(psurf->format);
+			format = fd4_pipe2color(pformat);
+			swap = fd4_pipe2swap(pformat);
 
 			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
@@ -114,13 +126,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 /* transfer from gmem to system memory (ie. normal RAM) */
 
 static void
-emit_gmem2mem_surf(struct fd_context *ctx,
+emit_gmem2mem_surf(struct fd_context *ctx, bool stencil,
 		uint32_t base, struct pipe_surface *psurf)
 {
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct fd_resource *rsc = fd_resource(psurf->texture);
-	struct fd_resource_slice *slice = &rsc->slices[psurf->u.tex.level];
-	uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level,
+	enum pipe_format pformat = psurf->format;
+	struct fd_resource_slice *slice;
+	uint32_t offset;
+
+	if (stencil) {
+		debug_assert(rsc->stencil);
+		rsc = rsc->stencil;
+		pformat = rsc->base.b.format;
+	}
+
+	slice = &rsc->slices[psurf->u.tex.level];
+	offset = fd_resource_offset(rsc, psurf->u.tex.level,
 			psurf->u.tex.first_layer);
 
 	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
@@ -132,10 +154,10 @@ emit_gmem2mem_surf(struct fd_context *ctx,
 	OUT_RELOCW(ring, rsc->bo, offset, 0, 0);   /* RB_COPY_DEST_BASE */
 	OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp));
 	OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) |
-			A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(psurf->format)) |
+			A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) |
 			A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) |
 			A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) |
-			A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(psurf->format)));
+			A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat)));
 
 	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL);
@@ -226,7 +248,11 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
-		emit_gmem2mem_surf(ctx, gmem->zsbuf_base[0], pfb->zsbuf);
+		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
+		if (!rsc->stencil || (ctx->resolve & FD_BUFFER_DEPTH))
+			emit_gmem2mem_surf(ctx, false, ctx->gmem.zsbuf_base[0], pfb->zsbuf);
+		if (rsc->stencil && (ctx->resolve & FD_BUFFER_STENCIL))
+			emit_gmem2mem_surf(ctx, true, ctx->gmem.zsbuf_base[1], pfb->zsbuf);
 	}
 
 	if (ctx->resolve & FD_BUFFER_COLOR) {
@@ -236,7 +262,7 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 				continue;
 			if (!(ctx->resolve & (PIPE_CLEAR_COLOR0 << i)))
 				continue;
-			emit_gmem2mem_surf(ctx, gmem->cbuf_base[i], pfb->cbufs[i]);
+			emit_gmem2mem_surf(ctx, false, gmem->cbuf_base[i], pfb->cbufs[i]);
 		}
 	}
 
@@ -254,9 +280,20 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
 		struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w)
 {
 	struct fd_ringbuffer *ring = ctx->ring;
+	struct pipe_surface *zsbufs[2];
 
 	emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
 
+	if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
+		/* The gmem_restore_tex logic will put the first buffer's stencil
+		 * as color. Supply it with the proper information to make that
+		 * happen.
+		 */
+		zsbufs[0] = zsbufs[1] = bufs[0];
+		bufs = zsbufs;
+		nr_bufs = 2;
+	}
+
 	fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs);
 
 	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
@@ -394,13 +431,6 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	bin_w = gmem->bin_w;
 	bin_h = gmem->bin_h;
 
-	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
-		emit.prog = &ctx->blit_prog[0];
-		emit.fp = NULL;      /* frag shader changed so clear cache */
-		fd4_program_emit(ring, &emit, 1, &pfb->zsbuf);
-		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
-	}
-
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
 		emit.fp = NULL;      /* frag shader changed so clear cache */
@@ -408,6 +438,40 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}
 
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
+		switch (pfb->zsbuf->format) {
+		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+		case PIPE_FORMAT_Z32_FLOAT:
+			emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) ?
+					&ctx->blit_z : &ctx->blit_zs;
+			emit.key.half_precision = false;
+
+			OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
+			OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE |
+					A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE |
+					A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) |
+					A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE);
+
+			OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
+			OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE);
+
+			OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1);
+			OUT_RING(ring, 0x80000);   /* GRAS_CL_CLIP_CNTL */
+
+			break;
+		default:
+			/* Non-float can use a regular color write. It's split over 8-bit
+			 * components, so half precision is always sufficient.
+			 */
+			emit.prog = &ctx->blit_prog[0];
+			emit.key.half_precision = true;
+			break;
+		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
+		fd4_program_emit(ring, &emit, 1, &pfb->zsbuf);
+		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
+	}
+
 	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
 	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
 			A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
@@ -546,21 +610,35 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd_gmem_stateobj *gmem = &ctx->gmem;
-	uint32_t reg;
 
-	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
-	reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]);
 	if (pfb->zsbuf) {
-		reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format));
-	}
-	OUT_RING(ring, reg);
-	if (pfb->zsbuf) {
-		uint32_t cpp = util_format_get_blocksize(pfb->zsbuf->format);
+		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
+		uint32_t cpp = rsc->cpp;
+
+		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
+		OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) |
+				A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format)));
 		OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w));
 		OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w));
+
+		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2);
+		if (rsc->stencil) {
+			OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL |
+					A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1]));
+			OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w));
+		} else {
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
 	} else {
+		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
 		OUT_RING(ring, 0x00000000);
 		OUT_RING(ring, 0x00000000);
+		OUT_RING(ring, 0x00000000);
+
+		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2);
+		OUT_RING(ring, 0);            /* RB_STENCIL_INFO */
+		OUT_RING(ring, 0);            /* RB_STENCIL_PITCH */
 	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1);
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index b5c5db91788..d6a08b5fa13 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -57,7 +57,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define A2XX_MAX_RENDER_TARGETS 1
 #define A3XX_MAX_RENDER_TARGETS 4
 #define A4XX_MAX_RENDER_TARGETS 8
-/* for now until a4xx MRT support: */
+
 #define MAX_RENDER_TARGETS A4XX_MAX_RENDER_TARGETS
 
 #define FD_DBG_MSGS     0x0001

From 7e5d56394bd53607d0158b49f36ac1428acb7954 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 2 Aug 2015 16:22:43 +0200
Subject: [PATCH 1067/1208] gallium/radeon: add a debug flag not to use write
 combining (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: just clear the flag before the allocation

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_buffer_common.c | 3 +++
 src/gallium/drivers/radeon/r600_pipe_common.c   | 1 +
 src/gallium/drivers/radeon/r600_pipe_common.h   | 1 +
 3 files changed, 5 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 5b5c06362a7..0f788b7e23c 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -161,6 +161,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		flags |= RADEON_FLAG_NO_CPU_ACCESS;
 	}
 
+	if (rscreen->debug_flags & DBG_NO_WC)
+		flags &= ~RADEON_FLAG_GTT_WC;
+
 	/* Allocate a new resource. */
 	new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
 					     use_reusable_pool,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 3f1c0f0eae9..c9b89295e83 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -355,6 +355,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "switch_on_eop", DBG_SWITCH_ON_EOP, "Program WD/IA to switch on end-of-packet." },
 	{ "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." },
 	{ "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." },
+	{ "nowc", DBG_NO_WC, "Disable GTT write combining" },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index ce3f396011f..85ac22f3c7e 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -97,6 +97,7 @@
 #define DBG_FORCE_DMA		(1llu << 38)
 #define DBG_PRECOMPILE		(1llu << 39)
 #define DBG_INFO		(1llu << 40)
+#define DBG_NO_WC		(1llu << 41)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 

From 9901aeb1c74648cbe1aa1d18d590a689c844cbad Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sun, 9 Aug 2015 22:03:00 -0700
Subject: [PATCH 1068/1208] mesa/format_utils: Add src_bits == dst_bits cases
 to unorm_to_unorm

This better ensures that the src_bits == dst_bits case gets optimized away.

Reviewed-by: Neil Roberts <neil@linux.intel.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/main/format_utils.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h
index 00ec7774dd2..618f43d0aaa 100644
--- a/src/mesa/main/format_utils.h
+++ b/src/mesa/main/format_utils.h
@@ -99,7 +99,7 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits)
 {
    if (src_bits < dst_bits) {
       return EXTEND_NORMALIZED_INT(x, src_bits, dst_bits);
-   } else {
+   } else if (src_bits > dst_bits) {
       unsigned src_half = (1 << (src_bits - 1)) - 1;
 
       if (src_bits + dst_bits > sizeof(x) * 8) {
@@ -109,6 +109,8 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits)
       } else {
          return (x * MAX_UINT(dst_bits) + src_half) / MAX_UINT(src_bits);
       }
+   } else {
+      return x;
    }
 }
 

From 17c978166185a7d3a9759f828a4370c1f2169776 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 3 Aug 2015 14:12:35 -0700
Subject: [PATCH 1069/1208] i965/nir: Use nir_op_info.output_type for
 determining when to resolve

Previously, we were explicitly listing every instruction that needs a
resolve.  However, those instructions were precicely the ones that returned
booleans so there's no reason why we shouldn't just have that check.  Also,
all of the reduction opcodes such as bany and ball were missing so it
didn't properly flag stuff on vec4.  If an opcode gets added in the future
that returns a bool but doesn't need a resolve, we can special-case that.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../i965/brw_nir_analyze_boolean_resolves.c   | 40 +++++++------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index 9eb0ed9bd79..d1fc06dd526 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -109,30 +109,6 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          uint8_t resolve_status;
          nir_alu_instr *alu = nir_instr_as_alu(instr);
          switch (alu->op) {
-         case nir_op_flt:
-         case nir_op_ilt:
-         case nir_op_ult:
-         case nir_op_fge:
-         case nir_op_ige:
-         case nir_op_uge:
-         case nir_op_feq:
-         case nir_op_ieq:
-         case nir_op_fne:
-         case nir_op_ine:
-         case nir_op_f2b:
-         case nir_op_i2b:
-            /* This instruction will turn into a CMP when we actually emit
-             * so the result will have to be resolved before it can be used.
-             */
-            resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
-
-            /* Even though the destination is allowed to be left unresolved,
-             * the sources are treated as regular integers or floats so
-             * they need to be resolved.
-             */
-            nir_foreach_src(instr, src_mark_needs_resolve, NULL);
-            break;
-
          case nir_op_imov:
          case nir_op_inot:
             /* This is a single-source instruction.  Just copy the resolve
@@ -169,7 +145,21 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          }
 
          default:
-            resolve_status = BRW_NIR_NON_BOOLEAN;
+            if (nir_op_infos[alu->op].output_type == nir_type_bool) {
+               /* This instructions will turn into a CMP when we actually emit
+                * them so the result will have to be resolved before it can be
+                * used.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
+
+               /* Even though the destination is allowed to be left
+                * unresolved, the sources are treated as regular integers or
+                * floats so they need to be resolved.
+                */
+               nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            } else {
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            }
          }
 
          /* If the destination is SSA, go ahead allow unresolved booleans.

From 1d4e698466bdea735c5f06c2658322bdc527efce Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 3 Aug 2015 16:25:18 -0700
Subject: [PATCH 1070/1208] i965/nir: Don't mark bany or ball instructions for
 resolve

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../i965/brw_nir_analyze_boolean_resolves.c   | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index d1fc06dd526..c995d2b7e2d 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -109,6 +109,29 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          uint8_t resolve_status;
          nir_alu_instr *alu = nir_instr_as_alu(instr);
          switch (alu->op) {
+         case nir_op_bany2:
+         case nir_op_bany3:
+         case nir_op_bany4:
+         case nir_op_ball_fequal2:
+         case nir_op_ball_iequal2:
+         case nir_op_ball_fequal3:
+         case nir_op_ball_iequal3:
+         case nir_op_ball_fequal4:
+         case nir_op_ball_iequal4:
+         case nir_op_bany_fnequal2:
+         case nir_op_bany_inequal2:
+         case nir_op_bany_fnequal3:
+         case nir_op_bany_inequal3:
+         case nir_op_bany_fnequal4:
+         case nir_op_bany_inequal4:
+            /* These are only implemented by the vec4 backend and its
+             * implementation emits resolved booleans.  At some point in the
+             * future, this may change and we'll have to remove some of the
+             * above cases.
+             */
+            resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+            break;
+
          case nir_op_imov:
          case nir_op_inot:
             /* This is a single-source instruction.  Just copy the resolve

From 5e1c1c2fcbdfb96a973ae3fd196e341ab2d41833 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 3 Aug 2015 10:00:38 -0700
Subject: [PATCH 1071/1208] i965/vec4-nir: Handle boolean resolvese on ILK-

The analysis code was already there and running, we just weren't doing
anything with the result of it yet.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index b13465bcb30..3056d098214 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1292,6 +1292,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    default:
       unreachable("Unimplemented ALU operation");
    }
+
+   /* If we need to do a boolean resolve, replace the result with -(x & 1)
+    * to sign extend the low bit to 0/~0
+    */
+   if (devinfo->gen <= 5 &&
+       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
+       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+      dst_reg masked = dst_reg(this, glsl_type::int_type);
+      masked.writemask = dst.writemask;
+      emit(AND(masked, src_reg(dst), src_reg(1)));
+      src_reg masked_neg = src_reg(masked);
+      masked_neg.negate = true;
+      emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
+   }
 }
 
 void

From 1d658cf8795383dbef127e46f3740b516bfe21b9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 3 Aug 2015 14:37:41 -0700
Subject: [PATCH 1072/1208] i965/vec4_nir: Do boolean source modifier resolves
 on BDW+

On BDW+, the negation source modifier on NOT, AND, OR, and XOR, is actually
a boolean negate and not an integer negate.  However, NIR's soruce
modifiers are the integer version.  We have to resolve it with a MOV prior
to emitting the actual instruction.  This is basically the same thing we do
in the FS backend.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp     | 15 +++++++++++++++
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 13 +++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 985886d7024..0c13d43a7d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -320,6 +320,7 @@ public:
 		    dst_reg dst, src_reg src0, src_reg src1);
 
    src_reg fix_3src_operand(src_reg src);
+   src_reg resolve_source_modifiers(const src_reg& src);
 
    vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                                const src_reg &src1 = src_reg());
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 3056d098214..f04c5e1b5b5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1020,18 +1020,33 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    }
 
    case nir_op_inot:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+      }
       emit(NOT(dst, op[0]));
       break;
 
    case nir_op_ixor:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
       emit(XOR(dst, op[0], op[1]));
       break;
 
    case nir_op_ior:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
       emit(OR(dst, op[0], op[1]));
       break;
 
    case nir_op_iand:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
       emit(AND(dst, op[0], op[1]));
       break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index ba352be3ad5..57d98c9b6b3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -312,6 +312,19 @@ vec4_visitor::fix_3src_operand(src_reg src)
    return src_reg(expanded);
 }
 
+src_reg
+vec4_visitor::resolve_source_modifiers(const src_reg& src)
+{
+   if (!src.abs && !src.negate)
+      return src;
+
+   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
+   resolved.type = src.type;
+   emit(MOV(resolved, src));
+
+   return src_reg(resolved);
+}
+
 src_reg
 vec4_visitor::fix_math_operand(src_reg src)
 {

From c1d9b3ae0bb0f1222719d7737dd9986e437bf5b9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 3 Aug 2015 15:21:59 -0700
Subject: [PATCH 1073/1208] i965/vec4_nir: Properly handle integer multiplies
 on BDW+

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 50 ++++++++++++----------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index f04c5e1b5b5..923e2d30a4c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -775,31 +775,35 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_imul: {
-      nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-      nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+      if (devinfo->gen < 8) {
+         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
 
-      /* For integer multiplication, the MUL uses the low 16 bits of one of
-       * the operands (src0 through SNB, src1 on IVB and later). The MACH
-       * accumulates in the contribution of the upper 16 bits of that
-       * operand. If we can determine that one of the args is in the low
-       * 16 bits, though, we can just emit a single MUL.
-       */
-      if (value0 && value0->u[0] < (1 << 16)) {
-         if (devinfo->gen < 7)
-            emit(MUL(dst, op[0], op[1]));
-         else
-            emit(MUL(dst, op[1], op[0]));
-      } else if (value1 && value1->u[0] < (1 << 16)) {
-         if (devinfo->gen < 7)
-            emit(MUL(dst, op[1], op[0]));
-         else
-            emit(MUL(dst, op[0], op[1]));
+         /* For integer multiplication, the MUL uses the low 16 bits of one of
+          * the operands (src0 through SNB, src1 on IVB and later). The MACH
+          * accumulates in the contribution of the upper 16 bits of that
+          * operand. If we can determine that one of the args is in the low
+          * 16 bits, though, we can just emit a single MUL.
+          */
+         if (value0 && value0->u[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[0], op[1]));
+            else
+               emit(MUL(dst, op[1], op[0]));
+         } else if (value1 && value1->u[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[1], op[0]));
+            else
+               emit(MUL(dst, op[0], op[1]));
+         } else {
+            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+            emit(MUL(acc, op[0], op[1]));
+            emit(MACH(dst_null_d(), op[0], op[1]));
+            emit(MOV(dst, src_reg(acc)));
+         }
       } else {
-         struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
-
-         emit(MUL(acc, op[0], op[1]));
-         emit(MACH(dst_null_d(), op[0], op[1]));
-         emit(MOV(dst, src_reg(acc)));
+	 emit(MUL(dst, op[0], op[1]));
       }
       break;
    }

From 7539ac7fe2077f7634250dcb34497e1ac643b0df Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 30 Jul 2015 20:49:22 -0700
Subject: [PATCH 1074/1208] ra: Refactor ra_set_finalize

All this commit does is change an early return to an if with an else
clause.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/util/register_allocate.c | 45 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 95be20fcc1b..129d58d2292 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -321,32 +321,31 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
             regs->classes[b]->q[c] = q_values[b][c];
 	 }
       }
-      return;
-   }
+   } else {
+      /* Compute, for each class B and C, how many regs of B an
+       * allocation to C could conflict with.
+       */
+      for (b = 0; b < regs->class_count; b++) {
+         for (c = 0; c < regs->class_count; c++) {
+            unsigned int rc;
+            int max_conflicts = 0;
 
-   /* Compute, for each class B and C, how many regs of B an
-    * allocation to C could conflict with.
-    */
-   for (b = 0; b < regs->class_count; b++) {
-      for (c = 0; c < regs->class_count; c++) {
-	 unsigned int rc;
-	 int max_conflicts = 0;
+            for (rc = 0; rc < regs->count; rc++) {
+               int conflicts = 0;
+               unsigned int i;
 
-	 for (rc = 0; rc < regs->count; rc++) {
-	    int conflicts = 0;
-	    unsigned int i;
+               if (!reg_belongs_to_class(rc, regs->classes[c]))
+                  continue;
 
-            if (!reg_belongs_to_class(rc, regs->classes[c]))
-	       continue;
-
-	    for (i = 0; i < regs->regs[rc].num_conflicts; i++) {
-	       unsigned int rb = regs->regs[rc].conflict_list[i];
-	       if (reg_belongs_to_class(rb, regs->classes[b]))
-		  conflicts++;
-	    }
-	    max_conflicts = MAX2(max_conflicts, conflicts);
-	 }
-	 regs->classes[b]->q[c] = max_conflicts;
+               for (i = 0; i < regs->regs[rc].num_conflicts; i++) {
+                  unsigned int rb = regs->regs[rc].conflict_list[i];
+                  if (reg_belongs_to_class(rb, regs->classes[b]))
+                     conflicts++;
+               }
+               max_conflicts = MAX2(max_conflicts, conflicts);
+            }
+            regs->classes[b]->q[c] = max_conflicts;
+         }
       }
    }
 }

From bdcc8f32304b67cd9c87f5f285c1faa00c51d3ad Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 30 Jul 2015 20:53:04 -0700
Subject: [PATCH 1075/1208] ra: Delete the conflict lists in ra_set_finalize

They are never used after the set is finalized so there's no reason to keep
them around.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/util/register_allocate.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 129d58d2292..436e008b01a 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -348,6 +348,11 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
          }
       }
    }
+
+   for (b = 0; b < regs->count; b++) {
+      ralloc_free(regs->regs[b].conflict_list);
+      regs->regs[b].conflict_list = NULL;
+   }
 }
 
 static void

From 0ac65abb466578aafbc753189cdc40fd9a6000b8 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 31 Jul 2015 08:35:57 -0700
Subject: [PATCH 1076/1208] i965/fs: Use dispatch_width instead of reg_width in
 alloc_reg_sets

reg_width is kind of an outdated concept.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../drivers/dri/i965/brw_fs_reg_allocate.cpp     | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 6a7ed64415d..211f70ee942 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -73,11 +73,11 @@ fs_visitor::assign_regs_trivial()
 }
 
 static void
-brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
+brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
 {
    const struct brw_device_info *devinfo = compiler->devinfo;
    int base_reg_count = BRW_MAX_GRF;
-   int index = reg_width - 1;
+   int index = (dispatch_width / 8) - 1;
 
    /* The registers used to make up almost all values handled in the compiler
     * are a scalar value occupying a single register (or 2 registers in the
@@ -121,7 +121,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    /* Compute the total number of registers across all classes. */
    int ra_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          /* From the G45 PRM:
           *
           * In order to reduce the hardware complexity, the following
@@ -168,7 +168,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    int pairs_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
       int class_reg_count;
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
 
          /* See comment below.  The only difference here is that we are
@@ -214,7 +214,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
          pairs_reg_count = class_reg_count;
       }
 
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          for (int j = 0; j < class_reg_count; j++) {
             ra_class_add_reg(regs, classes[i], reg);
 
@@ -249,7 +249,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    /* Add a special class for aligned pairs, which we'll put delta_xy
     * in on Gen <= 6 so that we can do PLN.
     */
-   if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
+   if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) {
       aligned_pairs_class = ra_alloc_reg_class(regs);
 
       for (int i = 0; i < pairs_reg_count; i++) {
@@ -287,8 +287,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
 void
 brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
 {
-   brw_alloc_reg_set(compiler, 1);
-   brw_alloc_reg_set(compiler, 2);
+   brw_alloc_reg_set(compiler, 8);
+   brw_alloc_reg_set(compiler, 16);
 }
 
 static int

From 1bb339493cd892c8065266b93a296a84b1dfce9b Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 31 Jul 2015 08:36:35 -0700
Subject: [PATCH 1077/1208] i965/fs: Don't do redundant RA setup on IVB+

Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 211f70ee942..b70895ec2ff 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -79,6 +79,15 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    int base_reg_count = BRW_MAX_GRF;
    int index = (dispatch_width / 8) - 1;
 
+   if (dispatch_width > 8 && devinfo->gen >= 7) {
+      /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
+       * SIMD16.  Therefore, we can use the exact same register sets for
+       * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
+       */
+      compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
+      return;
+   }
+
    /* The registers used to make up almost all values handled in the compiler
     * are a scalar value occupying a single register (or 2 registers in the
     * case of SIMD16, which is handled by dividing base_reg_count by 2 and

From 7068a6409c897e44cd98377df310691592ef6d0d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 10 Aug 2015 11:48:14 -0700
Subject: [PATCH 1078/1208] i965/vec4_visitor: Make some function arguments
 const references

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           | 6 +++---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 0c13d43a7d3..341c516b39a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -319,13 +319,13 @@ public:
    void emit_scalar(ir_instruction *ir, enum prog_opcode op,
 		    dst_reg dst, src_reg src0, src_reg src1);
 
-   src_reg fix_3src_operand(src_reg src);
-   src_reg resolve_source_modifiers(const src_reg& src);
+   src_reg fix_3src_operand(const src_reg &src);
+   src_reg resolve_source_modifiers(const src_reg &src);
 
    vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                                const src_reg &src1 = src_reg());
 
-   src_reg fix_math_operand(src_reg src);
+   src_reg fix_math_operand(const src_reg &src);
 
    void emit_pack_half_2x16(dst_reg dst, src_reg src0);
    void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 57d98c9b6b3..34dd7159631 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -287,7 +287,7 @@ vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements
 }
 
 src_reg
-vec4_visitor::fix_3src_operand(src_reg src)
+vec4_visitor::fix_3src_operand(const src_reg &src)
 {
    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
     * able to use vertical stride of zero to replicate the vec4 uniform, like
@@ -313,7 +313,7 @@ vec4_visitor::fix_3src_operand(src_reg src)
 }
 
 src_reg
-vec4_visitor::resolve_source_modifiers(const src_reg& src)
+vec4_visitor::resolve_source_modifiers(const src_reg &src)
 {
    if (!src.abs && !src.negate)
       return src;
@@ -326,7 +326,7 @@ vec4_visitor::resolve_source_modifiers(const src_reg& src)
 }
 
 src_reg
-vec4_visitor::fix_math_operand(src_reg src)
+vec4_visitor::fix_math_operand(const src_reg &src)
 {
    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
       return src;

From 8a688bee83ced46eb4bff741f05d2da033c07ade Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 10 Aug 2015 11:52:50 -0700
Subject: [PATCH 1079/1208] i965/fs: Make resolve_source_modifiers consistent
 with the vec4 version

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 15 ++++++++-------
 src/mesa/drivers/dri/i965/brw_fs.h       |  2 +-
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 14 +++++++-------
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ce1edc3f889..6fe900aa63f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1311,15 +1311,16 @@ fs_visitor::emit_sampleid_setup()
    return reg;
 }
 
-void
-fs_visitor::resolve_source_modifiers(fs_reg *src)
+fs_reg
+fs_visitor::resolve_source_modifiers(const fs_reg &src)
 {
-   if (!src->abs && !src->negate)
-      return;
+   if (!src.abs && !src.negate)
+      return src;
 
-   fs_reg temp = bld.vgrf(src->type);
-   bld.MOV(temp, *src);
-   *src = temp;
+   fs_reg temp = bld.vgrf(src.type);
+   bld.MOV(temp, src);
+
+   return temp;
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 4749c47f224..f7f01947d32 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -225,7 +225,7 @@ public:
    fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
                          const fs_reg &sampler);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
-   void resolve_source_modifiers(fs_reg *src);
+   fs_reg resolve_source_modifiers(const fs_reg &src);
    void emit_discard_jump();
    bool try_replace_with_sel();
    bool opt_peephole_sel();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index ee964a0f45d..aa6064e5e7a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -811,28 +811,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 
    case nir_op_inot:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
+         op[0] = resolve_source_modifiers(op[0]);
       }
       bld.NOT(result, op[0]);
       break;
    case nir_op_ixor:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.XOR(result, op[0], op[1]);
       break;
    case nir_op_ior:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.OR(result, op[0], op[1]);
       break;
    case nir_op_iand:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.AND(result, op[0], op[1]);
       break;

From 3fa1ca34cc0134bd16b3315a0695703c9f684bd4 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 10 Aug 2015 17:41:36 -0400
Subject: [PATCH 1080/1208] nouveau: no need to do tnl wakeup, state updates
 are always hooked up

A TNL state update now requires a DrawBuffer to be set, which it isn't
early on in context creation. Since we init swtnl from context init,
this caused crashes.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91570
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.6" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c | 1 -
 src/mesa/drivers/dri/nouveau/nv04_render.c     | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
index 0753c3a0019..755de2c4b68 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
@@ -338,7 +338,6 @@ TAG(swtnl_init)(struct gl_context *ctx)
 			   NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat));
 	_tnl_need_projected_coords(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_FALSE);
-	_tnl_wakeup(ctx);
 
 	swtnl_alloc_vertices(ctx);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nv04_render.c b/src/mesa/drivers/dri/nouveau/nv04_render.c
index 30e9f9aad96..3b7f7829044 100644
--- a/src/mesa/drivers/dri/nouveau/nv04_render.c
+++ b/src/mesa/drivers/dri/nouveau/nv04_render.c
@@ -285,7 +285,6 @@ nv04_render_init(struct gl_context *ctx)
 	_tnl_init_vertices(ctx, tnl->vb.Size,
 			   NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat));
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
-	_tnl_wakeup(ctx);
 }
 
 void

From 87cea61b9e2681e5365e989c7fa7a0298e4005fa Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Mon, 10 Aug 2015 15:35:21 -0400
Subject: [PATCH 1081/1208] radeonsi: add new OLAND pci id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: mesa-stable@lists.freedesktop.org
---
 include/pci_ids/radeonsi_pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index cd5da99a6a6..f451b7d761a 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -63,6 +63,7 @@ CHIPSET(0x6608, OLAND_6608, OLAND)
 CHIPSET(0x6610, OLAND_6610, OLAND)
 CHIPSET(0x6611, OLAND_6611, OLAND)
 CHIPSET(0x6613, OLAND_6613, OLAND)
+CHIPSET(0x6617, OLAND_6617, OLAND)
 CHIPSET(0x6620, OLAND_6620, OLAND)
 CHIPSET(0x6621, OLAND_6621, OLAND)
 CHIPSET(0x6623, OLAND_6623, OLAND)

From fe55ab2d12202236ba5bf9beae09803dfe97a7ac Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 9 Aug 2015 14:44:30 +1000
Subject: [PATCH 1082/1208] glsl: Add missing spec quote about atomic counter
 in structs

Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
---
 src/glsl/ast_to_hir.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 68f71c6393e..06cd6a5ec59 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -5631,10 +5631,10 @@ ast_process_structure_or_interface_block(exec_list *instructions,
          }
 
          if (field_type->contains_atomic()) {
-            /* FINISHME: Add a spec quotation here once updated spec
-             * FINISHME: language is available.  See Khronos bug #10903
-             * FINISHME: on whether atomic counters are allowed in
-             * FINISHME: structures.
+            /* From section 4.1.7.3 of the GLSL 4.40 spec:
+             *
+             *    "Members of structures cannot be declared as atomic counter
+             *     types."
              */
             YYLTYPE loc = decl_list->get_location();
             _mesa_glsl_error(&loc, state, "atomic counter in structure, "

From f9094691378722304dd94deb76ad013bd65c7a5b Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 12 May 2015 16:10:07 +0300
Subject: [PATCH 1083/1208] i965: Add SKL support to
 brw_miptree_get_horizontal_slice_pitch().

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index c9ee5265a1d..b8b03932065 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -432,9 +432,7 @@ brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw,
                                        const struct intel_mipmap_tree *mt,
                                        unsigned level)
 {
-   assert(brw->gen < 9);
-
-   if (mt->target == GL_TEXTURE_3D ||
+   if ((brw->gen < 9 && mt->target == GL_TEXTURE_3D) ||
        (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP)) {
       return ALIGN(minify(mt->physical_width0, level), mt->align_w);
    } else {

From 2cdb24a7c2238843d23b468275d479553f537e7e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 12 May 2015 15:56:54 +0300
Subject: [PATCH 1084/1208] i965: Fix brw_memory_barrier() for SKL.

This works as-is on SKL, only the assertion needs to be relaxed.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_program.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 467a8934180..4f380184464 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -197,7 +197,7 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
    unsigned bits = (PIPE_CONTROL_DATA_CACHE_INVALIDATE |
                     PIPE_CONTROL_NO_WRITE |
                     PIPE_CONTROL_CS_STALL);
-   assert(brw->gen >= 7 && brw->gen <= 8);
+   assert(brw->gen >= 7 && brw->gen <= 9);
 
    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                    GL_ELEMENT_ARRAY_BARRIER_BIT |

From 3144844f5ca89cd5743bc9b0ac142ccf862af557 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 2 May 2015 16:58:24 +0300
Subject: [PATCH 1085/1208] i965: Implement surface state set-up for shader
 images.

v2: Add SKL support.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.h       |   2 +
 .../drivers/dri/i965/brw_surface_formats.c    | 109 ++++++++++++++++++
 .../drivers/dri/i965/brw_wm_surface_state.c   |  77 +++++++++++++
 3 files changed, 188 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index cd43ac5114e..3aff9246db8 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1743,6 +1743,8 @@ void brw_upload_abo_surfaces(struct brw_context *brw,
 bool brw_render_target_supported(struct brw_context *brw,
                                  struct gl_renderbuffer *rb);
 uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
+mesa_format brw_lower_mesa_image_format(const struct brw_device_info *devinfo,
+                                        mesa_format format);
 
 /* brw_performance_monitor.c */
 void brw_init_performance_monitors(struct brw_context *brw);
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index 05016067bba..a33fd88a026 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -813,3 +813,112 @@ brw_depth_format(struct brw_context *brw, mesa_format format)
       unreachable("Unexpected depth format.");
    }
 }
+
+mesa_format
+brw_lower_mesa_image_format(const struct brw_device_info *devinfo,
+                            mesa_format format)
+{
+   switch (format) {
+   /* These are never lowered.  Up to BDW we'll have to fall back to untyped
+    * surface access for 128bpp formats.
+    */
+   case MESA_FORMAT_RGBA_UINT32:
+   case MESA_FORMAT_RGBA_SINT32:
+   case MESA_FORMAT_RGBA_FLOAT32:
+   case MESA_FORMAT_R_UINT32:
+   case MESA_FORMAT_R_SINT32:
+   case MESA_FORMAT_R_FLOAT32:
+      return format;
+
+   /* From HSW to BDW the only 64bpp format supported for typed access is
+    * RGBA_UINT16.  IVB falls back to untyped.
+    */
+   case MESA_FORMAT_RGBA_UINT16:
+   case MESA_FORMAT_RGBA_SINT16:
+   case MESA_FORMAT_RGBA_FLOAT16:
+   case MESA_FORMAT_RG_UINT32:
+   case MESA_FORMAT_RG_SINT32:
+   case MESA_FORMAT_RG_FLOAT32:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32);
+
+   /* Up to BDW no SINT or FLOAT formats of less than 32 bits per component
+    * are supported.  IVB doesn't support formats with more than one component
+    * for typed access.  For 8 and 16 bpp formats IVB relies on the
+    * undocumented behavior that typed reads from R_UINT8 and R_UINT16
+    * surfaces actually do a 32-bit misaligned read.  The alternative would be
+    * to use two surface state entries with different formats for each image,
+    * one for reading (using R_UINT32) and another one for writing (using
+    * R_UINT8 or R_UINT16), but that would complicate the shaders we generate
+    * even more.
+    */
+   case MESA_FORMAT_RGBA_UINT8:
+   case MESA_FORMAT_RGBA_SINT8:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_RG_UINT16:
+   case MESA_FORMAT_RG_SINT16:
+   case MESA_FORMAT_RG_FLOAT16:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_RG_UINT8:
+   case MESA_FORMAT_RG_SINT8:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UINT16:
+   case MESA_FORMAT_R_FLOAT16:
+   case MESA_FORMAT_R_SINT16:
+      return (devinfo->gen >= 9 ? format : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UINT8:
+   case MESA_FORMAT_R_SINT8:
+      return (devinfo->gen >= 9 ? format : MESA_FORMAT_R_UINT8);
+
+   /* Neither the 2/10/10/10 nor the 11/11/10 packed formats are supported
+    * by the hardware.
+    */
+   case MESA_FORMAT_R10G10B10A2_UINT:
+   case MESA_FORMAT_R10G10B10A2_UNORM:
+   case MESA_FORMAT_R11G11B10_FLOAT:
+      return MESA_FORMAT_R_UINT32;
+
+   /* No normalized fixed-point formats are supported by the hardware. */
+   case MESA_FORMAT_RGBA_UNORM16:
+   case MESA_FORMAT_RGBA_SNORM16:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32);
+
+   case MESA_FORMAT_R8G8B8A8_UNORM:
+   case MESA_FORMAT_R8G8B8A8_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_R16G16_UNORM:
+   case MESA_FORMAT_R16G16_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_R8G8_UNORM:
+   case MESA_FORMAT_R8G8_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UNORM16:
+   case MESA_FORMAT_R_SNORM16:
+      return MESA_FORMAT_R_UINT16;
+
+   case MESA_FORMAT_R_UNORM8:
+   case MESA_FORMAT_R_SNORM8:
+      return MESA_FORMAT_R_UINT8;
+
+   default:
+      unreachable("Unknown image format");
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 72aad96bb6a..33e045f2099 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1024,6 +1024,83 @@ const struct brw_tracked_state brw_cs_abo_surfaces = {
    .emit = brw_upload_cs_abo_surfaces,
 };
 
+static uint32_t
+get_image_format(struct brw_context *brw, mesa_format format, GLenum access)
+{
+   if (access == GL_WRITE_ONLY) {
+      return brw_format_for_mesa_format(format);
+   } else {
+      /* Typed surface reads support a very limited subset of the shader
+       * image formats.  Translate it into the closest format the
+       * hardware supports.
+       */
+      if ((_mesa_get_format_bytes(format) >= 16 && brw->gen <= 8) ||
+          (_mesa_get_format_bytes(format) >= 8 &&
+           (brw->gen == 7 && !brw->is_haswell)))
+         return BRW_SURFACEFORMAT_RAW;
+      else
+         return brw_format_for_mesa_format(
+            brw_lower_mesa_image_format(brw->intelScreen->devinfo, format));
+   }
+}
+
+static void
+update_image_surface(struct brw_context *brw,
+                     struct gl_image_unit *u,
+                     GLenum access,
+                     unsigned surface_idx,
+                     uint32_t *surf_offset,
+                     struct brw_image_param *param)
+{
+   if (u->_Valid) {
+      struct gl_texture_object *obj = u->TexObj;
+      const unsigned format = get_image_format(brw, u->_ActualFormat, access);
+
+      if (obj->Target == GL_TEXTURE_BUFFER) {
+         struct intel_buffer_object *intel_obj =
+            intel_buffer_object(obj->BufferObject);
+         const unsigned texel_size = (format == BRW_SURFACEFORMAT_RAW ? 1 :
+                                      _mesa_get_format_bytes(u->_ActualFormat));
+
+         brw->vtbl.emit_buffer_surface_state(
+            brw, surf_offset, intel_obj->buffer, obj->BufferOffset,
+            format, intel_obj->Base.Size / texel_size, texel_size,
+            access != GL_READ_ONLY);
+
+      } else {
+         struct intel_texture_object *intel_obj = intel_texture_object(obj);
+         struct intel_mipmap_tree *mt = intel_obj->mt;
+
+         if (format == BRW_SURFACEFORMAT_RAW) {
+            brw->vtbl.emit_buffer_surface_state(
+               brw, surf_offset, mt->bo, mt->offset,
+               format, mt->bo->size - mt->offset, 1 /* pitch */,
+               access != GL_READ_ONLY);
+
+         } else {
+            const unsigned min_layer = obj->MinLayer + u->Layer;
+            const unsigned min_level = obj->MinLevel + u->Level;
+            const unsigned num_layers = (!u->Layered ? 1 :
+                                         obj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
+                                         mt->logical_depth0);
+            const GLenum target = (obj->Target == GL_TEXTURE_CUBE_MAP ||
+                                   obj->Target == GL_TEXTURE_CUBE_MAP_ARRAY ?
+                                   GL_TEXTURE_2D_ARRAY : obj->Target);
+
+            brw->vtbl.emit_texture_surface_state(
+               brw, mt, target,
+               min_layer, min_layer + num_layers,
+               min_level, min_level + 1,
+               format, SWIZZLE_XYZW,
+               surf_offset, access != GL_READ_ONLY, false);
+         }
+      }
+
+   } else {
+      brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, surf_offset);
+   }
+}
+
 void
 gen4_init_vtable_surface_functions(struct brw_context *brw)
 {

From 87a3e02d9bec689e110f820bba7b125b3e801fdd Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 21 Jan 2015 17:34:49 +0200
Subject: [PATCH 1086/1208] i965: Define and initialize image parameter
 structure.

This will be used to pass image meta-data to the shader when we cannot
use typed surface reads and writes.  All entries except surface_idx
and size are otherwise unused and will get eliminated by the uniform
packing pass.  size will be used for bounds checking with some image
formats and will be useful for ARB_shader_image_size too.  surface_idx
is always used.

v2: Add CS support.  Move the image_params array back to
    brw_stage_prog_data.
v3: Improve documentation.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  54 ++++++++++
 src/mesa/drivers/dri/i965/brw_cs.cpp          |   3 +
 src/mesa/drivers/dri/i965/brw_gs.c            |   3 +
 src/mesa/drivers/dri/i965/brw_vs.c            |   5 +-
 src/mesa/drivers/dri/i965/brw_wm.c            |   4 +
 .../drivers/dri/i965/brw_wm_surface_state.c   | 102 ++++++++++++++++++
 6 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 3aff9246db8..b851f3830a2 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -361,6 +361,7 @@ struct brw_stage_prog_data {
 
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
+   unsigned nr_image_params;
 
    unsigned curb_read_length;
    unsigned total_scratch;
@@ -381,6 +382,59 @@ struct brw_stage_prog_data {
     */
    const gl_constant_value **param;
    const gl_constant_value **pull_param;
+
+   /**
+    * Image metadata passed to the shader as uniforms.  This is deliberately
+    * ignored by brw_stage_prog_data_compare() because its contents don't have
+    * any influence on program compilation.
+    */
+   struct brw_image_param *image_param;
+};
+
+/*
+ * Image metadata structure as laid out in the shader parameter
+ * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
+ * able to use them.  That's okay because the padding and any unused
+ * entries [most of them except when we're doing untyped surface
+ * access] will be removed by the uniform packing pass.
+ */
+#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
+#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
+#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
+#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
+#define BRW_IMAGE_PARAM_TILING_OFFSET           16
+#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
+#define BRW_IMAGE_PARAM_SIZE                    24
+
+struct brw_image_param {
+   /** Surface binding table index. */
+   uint32_t surface_idx;
+
+   /** Offset applied to the X and Y surface coordinates. */
+   uint32_t offset[2];
+
+   /** Surface X, Y and Z dimensions. */
+   uint32_t size[3];
+
+   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
+    * pixels, vertical slice stride in pixels.
+    */
+   uint32_t stride[4];
+
+   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
+   uint32_t tiling[3];
+
+   /**
+    * Right shift to apply for bit 6 address swizzling.  Two different
+    * swizzles can be specified and will be applied one after the other.  The
+    * resulting address will be:
+    *
+    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
+    *                              (addr >> swizzling[1])))
+    *
+    * Use \c 0xff if any of the swizzles is not required.
+    */
+   uint32_t swizzling[2];
 };
 
 /* Data about a particular attempt to compile a program.  Note that
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 29ee75b1e1a..59520307f42 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -190,7 +190,10 @@ brw_codegen_cs_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    prog_data.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, cs->NumImages);
    prog_data.base.nr_params = param_count;
+   prog_data.base.nr_image_params = cs->NumImages;
 
    program = brw_cs_emit(brw, mem_ctx, key, &prog_data,
                          &cp->program, prog, &program_size);
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 9c59c8a0dfc..d1a955a4de3 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -69,7 +69,10 @@ brw_codegen_gs_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    c.prog_data.base.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   c.prog_data.base.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
    c.prog_data.base.base.nr_params = param_count;
+   c.prog_data.base.base.nr_image_params = gs->NumImages;
 
    if (brw->gen >= 7) {
       if (gp->program.OutputType == GL_POINTS) {
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 2b9b005c782..20bc7a97d20 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -122,7 +122,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
        * conservative here.
        */
       param_count = vs->num_uniform_components * 4;
-
+      stage_prog_data->nr_image_params = vs->NumImages;
    } else {
       param_count = vp->program.Base.Parameters->NumParameters * 4;
    }
@@ -135,6 +135,9 @@ brw_codegen_vs_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    stage_prog_data->pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   stage_prog_data->image_param =
+      rzalloc_array(NULL, struct brw_image_param,
+                    stage_prog_data->nr_image_params);
    stage_prog_data->nr_params = param_count;
 
    GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 9b90a7c191f..c8583c09fe6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -190,6 +190,7 @@ brw_codegen_wm_prog(struct brw_context *brw,
    int param_count;
    if (fs) {
       param_count = fs->num_uniform_components;
+      prog_data.base.nr_image_params = fs->NumImages;
    } else {
       param_count = fp->program.Base.Parameters->NumParameters * 4;
    }
@@ -199,6 +200,9 @@ brw_codegen_wm_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    prog_data.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param,
+                    prog_data.base.nr_image_params);
    prog_data.base.nr_params = param_count;
 
    prog_data.barycentric_interp_modes =
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 33e045f2099..35df6f13e19 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1044,6 +1044,103 @@ get_image_format(struct brw_context *brw, mesa_format format, GLenum access)
    }
 }
 
+static void
+update_default_image_param(struct brw_context *brw,
+                           struct gl_image_unit *u,
+                           unsigned surface_idx,
+                           struct brw_image_param *param)
+{
+   memset(param, 0, sizeof(*param));
+   param->surface_idx = surface_idx;
+   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
+    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
+    * detailed explanation of these parameters.
+    */
+   param->swizzling[0] = 0xff;
+   param->swizzling[1] = 0xff;
+}
+
+static void
+update_buffer_image_param(struct brw_context *brw,
+                          struct gl_image_unit *u,
+                          unsigned surface_idx,
+                          struct brw_image_param *param)
+{
+   struct gl_buffer_object *obj = u->TexObj->BufferObject;
+
+   update_default_image_param(brw, u, surface_idx, param);
+
+   param->size[0] = obj->Size / _mesa_get_format_bytes(u->_ActualFormat);
+   param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat);
+}
+
+static void
+update_texture_image_param(struct brw_context *brw,
+                           struct gl_image_unit *u,
+                           unsigned surface_idx,
+                           struct brw_image_param *param)
+{
+   struct intel_mipmap_tree *mt = intel_texture_object(u->TexObj)->mt;
+
+   update_default_image_param(brw, u, surface_idx, param);
+
+   param->size[0] = minify(mt->logical_width0, u->Level);
+   param->size[1] = minify(mt->logical_height0, u->Level);
+   param->size[2] = (!u->Layered ? 1 :
+                     u->TexObj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
+                     u->TexObj->Target == GL_TEXTURE_3D ?
+                     minify(mt->logical_depth0, u->Level) :
+                     mt->logical_depth0);
+
+   intel_miptree_get_image_offset(mt, u->Level, u->Layer,
+                                  &param->offset[0],
+                                  &param->offset[1]);
+
+   param->stride[0] = mt->cpp;
+   param->stride[1] = mt->pitch / mt->cpp;
+   param->stride[2] =
+      brw_miptree_get_horizontal_slice_pitch(brw, mt, u->Level);
+   param->stride[3] =
+      brw_miptree_get_vertical_slice_pitch(brw, mt, u->Level);
+
+   if (mt->tiling == I915_TILING_X) {
+      /* An X tile is a rectangular block of 512x8 bytes. */
+      param->tiling[0] = _mesa_logbase2(512 / mt->cpp);
+      param->tiling[1] = _mesa_logbase2(8);
+
+      if (brw->has_swizzling) {
+         /* Right shifts required to swizzle bits 9 and 10 of the memory
+          * address with bit 6.
+          */
+         param->swizzling[0] = 3;
+         param->swizzling[1] = 4;
+      }
+   } else if (mt->tiling == I915_TILING_Y) {
+      /* The layout of a Y-tiled surface in memory isn't really fundamentally
+       * different to the layout of an X-tiled surface, we simply pretend that
+       * the surface is broken up in a number of smaller 16Bx32 tiles, each
+       * one arranged in X-major order just like is the case for X-tiling.
+       */
+      param->tiling[0] = _mesa_logbase2(16 / mt->cpp);
+      param->tiling[1] = _mesa_logbase2(32);
+
+      if (brw->has_swizzling) {
+         /* Right shift required to swizzle bit 9 of the memory address with
+          * bit 6.
+          */
+         param->swizzling[0] = 3;
+      }
+   }
+
+   /* 3D textures are arranged in 2D in memory with 2^lod slices per row.  The
+    * address calculation algorithm (emit_address_calculation() in
+    * brw_fs_surface_builder.cpp) handles this as a sort of tiling with
+    * modulus equal to the LOD.
+    */
+   param->tiling[2] = (u->TexObj->Target == GL_TEXTURE_3D ? u->Level :
+                       0);
+}
+
 static void
 update_image_surface(struct brw_context *brw,
                      struct gl_image_unit *u,
@@ -1067,6 +1164,8 @@ update_image_surface(struct brw_context *brw,
             format, intel_obj->Base.Size / texel_size, texel_size,
             access != GL_READ_ONLY);
 
+         update_buffer_image_param(brw, u, surface_idx, param);
+
       } else {
          struct intel_texture_object *intel_obj = intel_texture_object(obj);
          struct intel_mipmap_tree *mt = intel_obj->mt;
@@ -1094,10 +1193,13 @@ update_image_surface(struct brw_context *brw,
                format, SWIZZLE_XYZW,
                surf_offset, access != GL_READ_ONLY, false);
          }
+
+         update_texture_image_param(brw, u, surface_idx, param);
       }
 
    } else {
       brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, surf_offset);
+      update_default_image_param(brw, u, surface_idx, param);
    }
 }
 

From 868f1ba0a4e6e3057be5b8c2458db4773cf82034 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 17:19:29 +0300
Subject: [PATCH 1087/1208] i965: Reserve enough parameter entries for all
 image uniforms used in the program.

v2: Add CS support.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_cs.cpp | 3 ++-
 src/mesa/drivers/dri/i965/brw_gs.c   | 1 +
 src/mesa/drivers/dri/i965/brw_vs.c   | 3 ++-
 src/mesa/drivers/dri/i965/brw_wm.c   | 3 ++-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 59520307f42..cd7e0942277 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -182,7 +182,8 @@ brw_codegen_cs_prog(struct brw_context *brw,
     * prog_data associated with the compiled program, and which will be freed
     * by the state cache.
     */
-   int param_count = cs->num_uniform_components;
+   int param_count = cs->num_uniform_components +
+                     cs->NumImages * BRW_IMAGE_PARAM_SIZE;
 
    /* The backend also sometimes adds params for texture size. */
    param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index d1a955a4de3..5c0d9230162 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -64,6 +64,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
 
    /* We also upload clip plane data as uniforms */
    param_count += MAX_CLIP_PLANES * 4;
+   param_count += gs->NumImages * BRW_IMAGE_PARAM_SIZE;
 
    c.prog_data.base.base.param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 20bc7a97d20..c53cb49b612 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -121,7 +121,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
        * case being a float value that gets blown up to a vec4, so be
        * conservative here.
        */
-      param_count = vs->num_uniform_components * 4;
+      param_count = vs->num_uniform_components * 4 +
+                    vs->NumImages * BRW_IMAGE_PARAM_SIZE;
       stage_prog_data->nr_image_params = vs->NumImages;
    } else {
       param_count = vp->program.Base.Parameters->NumParameters * 4;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index c8583c09fe6..184f21c7b06 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -189,7 +189,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
     */
    int param_count;
    if (fs) {
-      param_count = fs->num_uniform_components;
+      param_count = fs->num_uniform_components +
+                    fs->NumImages * BRW_IMAGE_PARAM_SIZE;
       prog_data.base.nr_image_params = fs->NumImages;
    } else {
       param_count = fp->program.Base.Parameters->NumParameters * 4;

From 47f9b07e4cf79a8249c6f9f09148a6a0b4fabacc Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 20 Jul 2015 17:13:17 +0300
Subject: [PATCH 1088/1208] i965: Hook up image state upload.

v2: Add CS support.  Move the image_params array back to
    brw_stage_prog_data.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_context.h       | 10 ++-
 .../drivers/dri/i965/brw_gs_surface_state.c   | 25 +++++++
 src/mesa/drivers/dri/i965/brw_state.h         |  4 ++
 src/mesa/drivers/dri/i965/brw_state_upload.c  | 12 ++++
 .../drivers/dri/i965/brw_vs_surface_state.c   | 25 +++++++
 .../drivers/dri/i965/brw_wm_surface_state.c   | 72 +++++++++++++++++++
 6 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b851f3830a2..707cd8f9ae5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -201,6 +201,7 @@ enum brw_state_id {
    BRW_STATE_STATS_WM,
    BRW_STATE_UNIFORM_BUFFER,
    BRW_STATE_ATOMIC_BUFFER,
+   BRW_STATE_IMAGE_UNITS,
    BRW_STATE_META_IN_PROGRESS,
    BRW_STATE_INTERPOLATION_MAP,
    BRW_STATE_PUSH_CONSTANT_ALLOCATION,
@@ -282,6 +283,7 @@ enum brw_state_id {
 #define BRW_NEW_STATS_WM                (1ull << BRW_STATE_STATS_WM)
 #define BRW_NEW_UNIFORM_BUFFER          (1ull << BRW_STATE_UNIFORM_BUFFER)
 #define BRW_NEW_ATOMIC_BUFFER           (1ull << BRW_STATE_ATOMIC_BUFFER)
+#define BRW_NEW_IMAGE_UNITS             (1ull << BRW_STATE_IMAGE_UNITS)
 #define BRW_NEW_META_IN_PROGRESS        (1ull << BRW_STATE_META_IN_PROGRESS)
 #define BRW_NEW_INTERPOLATION_MAP       (1ull << BRW_STATE_INTERPOLATION_MAP)
 #define BRW_NEW_PUSH_CONSTANT_ALLOCATION (1ull << BRW_STATE_PUSH_CONSTANT_ALLOCATION)
@@ -1513,8 +1515,8 @@ struct brw_context
    } perfmon;
 
    int num_atoms[BRW_NUM_PIPELINES];
-   const struct brw_tracked_state render_atoms[57];
-   const struct brw_tracked_state compute_atoms[3];
+   const struct brw_tracked_state render_atoms[60];
+   const struct brw_tracked_state compute_atoms[4];
 
    /* If (INTEL_DEBUG & DEBUG_BATCH) */
    struct {
@@ -1792,6 +1794,10 @@ void brw_upload_abo_surfaces(struct brw_context *brw,
                              struct gl_shader_program *prog,
                              struct brw_stage_state *stage_state,
                              struct brw_stage_prog_data *prog_data);
+void brw_upload_image_surfaces(struct brw_context *brw,
+                               struct gl_shader *shader,
+                               struct brw_stage_state *stage_state,
+                               struct brw_stage_prog_data *prog_data);
 
 /* brw_surface_formats.c */
 bool brw_render_target_supported(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index 0b8bfc3d9bd..0bb307432d0 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -119,3 +119,28 @@ const struct brw_tracked_state brw_gs_abo_surfaces = {
    },
    .emit = brw_upload_gs_abo_surfaces,
 };
+
+static void
+brw_upload_gs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
+
+   if (prog) {
+      /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
+                                &brw->gs.base, &brw->gs.prog_data->base.base);
+   }
+}
+
+const struct brw_tracked_state brw_gs_image_surfaces = {
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_GS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS,
+   },
+   .emit = brw_upload_gs_image_surfaces,
+};
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index cf3a96c7bee..78a1f874b4e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -72,8 +72,10 @@ extern const struct brw_tracked_state brw_vs_samplers;
 extern const struct brw_tracked_state brw_gs_samplers;
 extern const struct brw_tracked_state brw_vs_ubo_surfaces;
 extern const struct brw_tracked_state brw_vs_abo_surfaces;
+extern const struct brw_tracked_state brw_vs_image_surfaces;
 extern const struct brw_tracked_state brw_gs_ubo_surfaces;
 extern const struct brw_tracked_state brw_gs_abo_surfaces;
+extern const struct brw_tracked_state brw_gs_image_surfaces;
 extern const struct brw_tracked_state brw_vs_unit;
 extern const struct brw_tracked_state brw_gs_prog;
 extern const struct brw_tracked_state brw_wm_prog;
@@ -84,7 +86,9 @@ extern const struct brw_tracked_state brw_gs_binding_table;
 extern const struct brw_tracked_state brw_vs_binding_table;
 extern const struct brw_tracked_state brw_wm_ubo_surfaces;
 extern const struct brw_tracked_state brw_wm_abo_surfaces;
+extern const struct brw_tracked_state brw_wm_image_surfaces;
 extern const struct brw_tracked_state brw_cs_abo_surfaces;
+extern const struct brw_tracked_state brw_cs_image_surfaces;
 extern const struct brw_tracked_state brw_wm_unit;
 extern const struct brw_tracked_state brw_interpolation_map;
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 6096b4946a0..9de42ce8503 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -194,6 +194,10 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
 
    &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Haswell */
 
+   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
@@ -253,6 +257,7 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
 static const struct brw_tracked_state *gen7_compute_atoms[] =
 {
    &brw_state_base_address,
+   &brw_cs_image_surfaces,
    &brw_cs_abo_surfaces,
    &brw_cs_state,
 };
@@ -272,6 +277,10 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
 
    &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Broadwell */
 
+   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
@@ -338,6 +347,7 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
 static const struct brw_tracked_state *gen8_compute_atoms[] =
 {
    &gen8_state_base_address,
+   &brw_cs_image_surfaces,
    &brw_cs_abo_surfaces,
    &brw_cs_state,
 };
@@ -472,6 +482,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewUniformBuffer = BRW_NEW_UNIFORM_BUFFER;
    ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
+   ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS;
 }
 
 
@@ -585,6 +596,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_STATS_WM),
    DEFINE_BIT(BRW_NEW_UNIFORM_BUFFER),
    DEFINE_BIT(BRW_NEW_ATOMIC_BUFFER),
+   DEFINE_BIT(BRW_NEW_IMAGE_UNITS),
    DEFINE_BIT(BRW_NEW_META_IN_PROGRESS),
    DEFINE_BIT(BRW_NEW_INTERPOLATION_MAP),
    DEFINE_BIT(BRW_NEW_PUSH_CONSTANT_ALLOCATION),
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index b2f91bd412b..72e37d4b467 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -191,3 +191,28 @@ const struct brw_tracked_state brw_vs_abo_surfaces = {
    },
    .emit = brw_upload_vs_abo_surfaces,
 };
+
+static void
+brw_upload_vs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_VERTEX_PROGRAM */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+
+   if (prog) {
+      /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
+                                &brw->vs.base, &brw->vs.prog_data->base.base);
+   }
+}
+
+const struct brw_tracked_state brw_vs_image_surfaces = {
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_IMAGE_UNITS |
+             BRW_NEW_VERTEX_PROGRAM |
+             BRW_NEW_VS_PROG_DATA,
+   },
+   .emit = brw_upload_vs_image_surfaces,
+};
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 35df6f13e19..f13a97ce2b0 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1024,6 +1024,31 @@ const struct brw_tracked_state brw_cs_abo_surfaces = {
    .emit = brw_upload_cs_abo_surfaces,
 };
 
+static void
+brw_upload_cs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* _NEW_PROGRAM */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+
+   if (prog) {
+      /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE],
+                                &brw->cs.base, &brw->cs.prog_data->base);
+   }
+}
+
+const struct brw_tracked_state brw_cs_image_surfaces = {
+   .dirty = {
+      .mesa = _NEW_PROGRAM,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_CS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS
+   },
+   .emit = brw_upload_cs_image_surfaces,
+};
+
 static uint32_t
 get_image_format(struct brw_context *brw, mesa_format format, GLenum access)
 {
@@ -1203,6 +1228,53 @@ update_image_surface(struct brw_context *brw,
    }
 }
 
+void
+brw_upload_image_surfaces(struct brw_context *brw,
+                          struct gl_shader *shader,
+                          struct brw_stage_state *stage_state,
+                          struct brw_stage_prog_data *prog_data)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   if (shader && shader->NumImages) {
+      for (unsigned i = 0; i < shader->NumImages; i++) {
+         struct gl_image_unit *u = &ctx->ImageUnits[shader->ImageUnits[i]];
+         const unsigned surf_idx = prog_data->binding_table.image_start + i;
+
+         update_image_surface(brw, u, shader->ImageAccess[i],
+                              surf_idx,
+                              &stage_state->surf_offset[surf_idx],
+                              &prog_data->image_param[i]);
+      }
+
+      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
+   }
+}
+
+static void
+brw_upload_wm_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram;
+
+   if (prog) {
+      /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
+                                &brw->wm.base, &brw->wm.prog_data->base);
+   }
+}
+
+const struct brw_tracked_state brw_wm_image_surfaces = {
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_FRAGMENT_PROGRAM |
+             BRW_NEW_FS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS
+   },
+   .emit = brw_upload_wm_image_surfaces,
+};
+
 void
 gen4_init_vtable_surface_functions(struct brw_context *brw)
 {

From acb6d90dc809283d9839685852f19f6b301b23d3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 3 Feb 2015 17:14:10 +0200
Subject: [PATCH 1089/1208] i965/gen7: Enable fragment shader dispatch if the
 program has image uniforms.

Shaders with image uniforms may have side effects.  Make sure that
fragment shader threads are dispatched if the shader has any image
uniforms.

v2: Use brw_stage_prog_data::nr_image_params to find out if the shader
    has image uniforms instead of checking core mesa data structures
    (Ken).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/gen7_wm_state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index d7be58dd0f7..aa47421844e 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -83,6 +83,7 @@ upload_wm_state(struct brw_context *brw)
 
    /* _NEW_BUFFERS | _NEW_COLOR */
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
+       prog_data->base.nr_image_params ||
        dw1 & GEN7_WM_KILL_ENABLE) {
       dw1 |= GEN7_WM_DISPATCH_ENABLE;
    }

From ac7664e493655e290783c23a0412b9c70936da50 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 13 Jul 2015 14:21:07 +0300
Subject: [PATCH 1090/1208] i965/gen7-8: Poke the 3DSTATE UAV access enable
 bits.

v2: Set the PS UAV-only bit on HSW (Ken).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h   |  4 ++++
 src/mesa/drivers/dri/i965/gen7_gs_state.c |  4 +++-
 src/mesa/drivers/dri/i965/gen7_vs_state.c | 13 ++++++++-----
 src/mesa/drivers/dri/i965/gen7_wm_state.c |  9 +++++++++
 src/mesa/drivers/dri/i965/gen8_gs_state.c |  4 +++-
 src/mesa/drivers/dri/i965/gen8_ps_state.c |  3 ++-
 src/mesa/drivers/dri/i965/gen8_vs_state.c |  4 +++-
 7 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 9c232c46ff3..7fa7c5f06f9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1855,6 +1855,7 @@ enum brw_message_target {
 # define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
 # define GEN6_VS_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
 # define GEN6_VS_FLOATING_POINT_MODE_ALT		(1 << 16)
+# define HSW_VS_UAV_ACCESS_ENABLE                       (1 << 12)
 /* DW4 */
 # define GEN6_VS_DISPATCH_START_GRF_SHIFT		20
 # define GEN6_VS_URB_READ_LENGTH_SHIFT			11
@@ -1880,6 +1881,7 @@ enum brw_message_target {
 # define GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
 # define GEN6_GS_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
 # define GEN6_GS_FLOATING_POINT_MODE_ALT		(1 << 16)
+# define HSW_GS_UAV_ACCESS_ENABLE       		(1 << 12)
 /* DW4 */
 # define GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT		23
 # define GEN7_GS_OUTPUT_TOPOLOGY_SHIFT			17
@@ -2406,6 +2408,7 @@ enum brw_wm_barycentric_interp_mode {
 /* DW2 */
 # define GEN7_WM_MSDISPMODE_PERSAMPLE			(0 << 31)
 # define GEN7_WM_MSDISPMODE_PERPIXEL			(1 << 31)
+# define HSW_WM_UAV_ONLY                                (1 << 30)
 
 #define _3DSTATE_PS				0x7820 /* GEN7+ */
 /* DW1: kernel pointer */
@@ -2429,6 +2432,7 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE	(1 << 8)
 # define GEN7_PS_DUAL_SOURCE_BLEND_ENABLE		(1 << 7)
 # define GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE		(1 << 6)
+# define HSW_PS_UAV_ACCESS_ENABLE			(1 << 5)
 # define GEN7_PS_POSOFFSET_NONE				(0 << 3)
 # define GEN7_PS_POSOFFSET_CENTROID			(2 << 3)
 # define GEN7_PS_POSOFFSET_SAMPLE			(3 << 3)
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 8d6d3fe1d34..497ecec8e45 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -59,7 +59,9 @@ upload_gs_state(struct brw_context *brw)
       OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+                (brw->is_haswell && prog_data->base.nr_image_params ?
+                 HSW_GS_UAV_ACCESS_ENABLE : 0));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 00bc6f24dbe..b7e48585482 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -111,6 +111,7 @@ upload_vs_state(struct brw_context *brw)
    uint32_t floating_point_mode = 0;
    const int max_threads_shift = brw->is_haswell ?
       HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
+   const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base;
 
    if (!brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
@@ -125,19 +126,21 @@ upload_vs_state(struct brw_context *brw)
 	     ((ALIGN(stage_state->sampler_count, 4)/4) <<
               GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
-              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+             (brw->is_haswell && prog_data->base.nr_image_params ?
+              HSW_VS_UAV_ACCESS_ENABLE : 0));
 
-   if (brw->vs.prog_data->base.base.total_scratch) {
+   if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
+		ffs(prog_data->base.total_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
 
-   OUT_BATCH((brw->vs.prog_data->base.base.dispatch_grf_start_reg <<
+   OUT_BATCH((prog_data->base.dispatch_grf_start_reg <<
               GEN6_VS_DISPATCH_START_GRF_SHIFT) |
-	     (brw->vs.prog_data->base.urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
+	     (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
 	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
 
    OUT_BATCH(((brw->max_vs_threads - 1) << max_threads_shift) |
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index aa47421844e..285311ef53c 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -107,6 +107,12 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
    }
 
+   /* _NEW_BUFFERS | _NEW_COLOR */
+   if (brw->is_haswell &&
+       !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
+       prog_data->base.nr_image_params)
+      dw2 |= HSW_WM_UAV_ONLY;
+
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
    OUT_BATCH(dw1);
@@ -209,6 +215,9 @@ gen7_upload_ps_state(struct brw_context *brw,
       _mesa_get_min_invocations_per_fragment(ctx, fp, false);
    assert(min_inv_per_frag >= 1);
 
+   if (brw->is_haswell && prog_data->base.nr_image_params)
+      dw4 |= HSW_PS_UAV_ACCESS_ENABLE;
+
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
       if (!prog_data->no_8 && min_inv_per_frag == 1) {
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 26a02d3b045..81bd3b21778 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -52,7 +52,9 @@ gen8_upload_gs_state(struct brw_context *brw)
                 ((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((prog_data->base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+                (prog_data->base.nr_image_params ?
+                 HSW_GS_UAV_ACCESS_ENABLE : 0));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index d5445093e67..f84fbe1864d 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -61,7 +61,8 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (brw->gen >= 9 && prog_data->pulls_bary)
       dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
 
-   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx))
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
+       prog_data->base.nr_image_params)
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index 28f5adddf14..8b5048bee7e 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -53,7 +53,9 @@ upload_vs_state(struct brw_context *brw)
              ((ALIGN(stage_state->sampler_count, 4) / 4) <<
                GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((prog_data->base.binding_table.size_bytes / 4) <<
-               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+             (prog_data->base.nr_image_params ?
+              HSW_VS_UAV_ACCESS_ENABLE : 0));
 
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,

From 786e0853bebc3c4ab073bdbb48eec8ba5ea93842 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 9 Feb 2015 21:04:53 +0200
Subject: [PATCH 1091/1208] i965/gen7-8: Set up early depth/stencil control
 appropriately for image load/store.

v2: Store early fragment test mode in brw_wm_prog_data instead of
    getting it from core mesa data structures (Ken).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h      | 1 +
 src/mesa/drivers/dri/i965/brw_defines.h      | 3 +++
 src/mesa/drivers/dri/i965/brw_wm.c           | 2 ++
 src/mesa/drivers/dri/i965/gen7_wm_state.c    | 6 ++++++
 src/mesa/drivers/dri/i965/gen8_depth_state.c | 6 +++---
 src/mesa/drivers/dri/i965/gen8_ps_state.c    | 6 ++++++
 6 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 707cd8f9ae5..b52bca77460 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -466,6 +466,7 @@ struct brw_wm_prog_data {
 
    uint8_t computed_depth_mode;
 
+   bool early_fragment_tests;
    bool no_8;
    bool dual_src_blend;
    bool uses_pos_offset;
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 7fa7c5f06f9..82a36357de9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2384,6 +2384,9 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN7_WM_KILL_ENABLE				(1 << 25)
 # define GEN7_WM_COMPUTED_DEPTH_MODE_SHIFT              23
 # define GEN7_WM_USES_SOURCE_DEPTH			(1 << 20)
+# define GEN7_WM_EARLY_DS_CONTROL_NORMAL                (0 << 21)
+# define GEN7_WM_EARLY_DS_CONTROL_PSEXEC                (1 << 21)
+# define GEN7_WM_EARLY_DS_CONTROL_PREPS                 (2 << 21)
 # define GEN7_WM_USES_SOURCE_W			        (1 << 19)
 # define GEN7_WM_POSITION_ZW_PIXEL			(0 << 17)
 # define GEN7_WM_POSITION_ZW_CENTROID			(2 << 17)
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 184f21c7b06..6ee92848172 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -179,6 +179,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
       fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data.computed_depth_mode = computed_depth_mode(&fp->program);
 
+   prog_data.early_fragment_tests = fs && fs->EarlyFragmentTests;
+
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
    if (!prog)
       prog_data.base.use_alt_mode = true;
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 285311ef53c..fd6dab5be8b 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -107,6 +107,12 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
    }
 
+   /* BRW_NEW_FS_PROG_DATA */
+   if (prog_data->early_fragment_tests)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
+   else if (prog_data->base.nr_image_params)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
+
    /* _NEW_BUFFERS | _NEW_COLOR */
    if (brw->is_haswell &&
        !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 8f23702d66b..93100a0708f 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -250,10 +250,10 @@ pma_fix_enable(const struct brw_context *brw)
     */
    const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
 
-   /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
-    * We always leave this set to EDSC_NORMAL (0).
+   /* BRW_NEW_FS_PROG_DATA:
+    * 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
     */
-   const bool edsc_not_preps = true;
+   const bool edsc_not_preps = !brw->wm.prog_data->early_fragment_tests;
 
    /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
    const bool pixel_shader_valid = true;
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index f84fbe1864d..ae18f0f162c 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -119,6 +119,12 @@ upload_wm_state(struct brw_context *brw)
    dw1 |= brw->wm.prog_data->barycentric_interp_modes <<
       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
 
+   /* BRW_NEW_FS_PROG_DATA */
+   if (brw->wm.prog_data->early_fragment_tests)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
+   else if (brw->wm.prog_data->base.nr_image_params)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
+
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2));
    OUT_BATCH(dw1);

From 3569742ec458c0a881857d9deb782c1e11f195d8 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 22 Nov 2013 16:00:33 -0800
Subject: [PATCH 1092/1208] i965: Define implementation constants for
 ARB_shader_image_load_store.

Reviewed-by: Paul Berry <stereotype441@gmail.com>

v2: Drop VS support pre-Gen8, drop GS support.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index efcd91aad84..f428f58c69a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -505,6 +505,18 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers = BRW_MAX_ABO;
       ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = BRW_MAX_ABO;
       ctx->Const.MaxCombinedAtomicBuffers = 3 * BRW_MAX_ABO;
+
+      ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms =
+         BRW_MAX_IMAGES;
+      ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms =
+         (brw->intelScreen->compiler->scalar_vs ? BRW_MAX_IMAGES : 0);
+      ctx->Const.Program[MESA_SHADER_COMPUTE].MaxImageUniforms =
+         BRW_MAX_IMAGES;
+      ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
+      ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs =
+         MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
+      ctx->Const.MaxImageSamples = 0;
+      ctx->Const.MaxCombinedImageUniforms = 3 * BRW_MAX_IMAGES;
    }
 
    /* Gen6 converts quads to polygon in beginning of 3D pipeline,

From fb19df7a626d02cb54614d4610af2d14720a2ef3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 22 Apr 2015 16:43:51 +0300
Subject: [PATCH 1093/1208] i965/fs: Import image access validity checks.

These utility functions check whether an image access is valid.
According to the spec an invalid image access should have no effect on
the image and yield well-defined results.  Typically the hardware
implements correct bounds and surface checking by itself, but in some
cases (typed atomics on IVB and untyped messages elsewhere) we need to
implement it in software to work around lacking hardware support.

v2: Drop VEC4 suport.
v3: Rebase.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index cac958d49c1..5ee04dece88 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -161,3 +161,58 @@ namespace brw {
       }
    }
 }
+
+namespace {
+   namespace image_validity {
+      /**
+       * Check whether there is an image bound at the given index and write
+       * the comparison result to f0.0.  Returns an appropriate predication
+       * mode to use on subsequent image operations.
+       */
+      brw_predicate
+      emit_surface_check(const fs_builder &bld, const fs_reg &image)
+      {
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* Check the first component of the size field to find out if the
+             * image is bound.  Necessary on IVB for typed atomics because
+             * they don't seem to respect null surfaces and will happily
+             * corrupt or read random memory when no image is bound.
+             */
+            bld.CMP(bld.null_reg_ud(),
+                    retype(size, BRW_REGISTER_TYPE_UD),
+                    fs_reg(0), BRW_CONDITIONAL_NZ);
+
+            return BRW_PREDICATE_NORMAL;
+         } else {
+            /* More recent platforms implement compliant behavior when a null
+             * surface is bound.
+             */
+            return BRW_PREDICATE_NONE;
+         }
+      }
+
+      /**
+       * Check whether the provided coordinates are within the image bounds
+       * and write the comparison result to f0.0.  Returns an appropriate
+       * predication mode to use on subsequent image operations.
+       */
+      brw_predicate
+      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
+                        const fs_reg &addr, unsigned dims)
+      {
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         for (unsigned c = 0; c < dims; ++c)
+            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
+                          bld.CMP(bld.null_reg_ud(),
+                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
+                                  offset(size, bld, c),
+                                  BRW_CONDITIONAL_L));
+
+         return BRW_PREDICATE_NORMAL;
+      }
+   }
+}

From 1a37619763a99b78aa574aca0058eda86de7a0dc Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 22 Apr 2015 16:44:18 +0300
Subject: [PATCH 1094/1208] i965/fs: Import image memory offset calculation
 code.

Define a function to calculate the memory address of the image
location given by a vector of coordinates.  This is required in cases
where we need to fall back to untyped surface access, which take a raw
memory offset and know nothing about surface coordinates, type
conversion or memory tiling and swizzling.  They are still useful
because typed surface reads don't support any 64 or 128-bit formats on
IVB, and they don't support any 128-bit formats on HSW and BDW.

The tiling algorithm is implemented based on a number of parameters
which are passed in as uniforms and determine whether the surface
layout is X-tiled, Y-tiled or untiled.  This allows binding surfaces
of different tiling layouts to the pipeline without recompiling the
program.

v2: Drop VEC4 suport.
v3: Rebase.
v4: Add plenty of comments (Jason).

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 169 ++++++++++++++++++
 1 file changed, 169 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 5ee04dece88..a0bedf26bc9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -215,4 +215,173 @@ namespace {
          return BRW_PREDICATE_NORMAL;
       }
    }
+
+   namespace image_coordinates {
+      /**
+       * Calculate the offset in memory of the texel given by \p coord.
+       *
+       * This is meant to be used with untyped surface messages to access a
+       * tiled surface, what involves taking into account the tiling and
+       * swizzling modes of the surface manually so it will hopefully not
+       * happen very often.
+       *
+       * The tiling algorithm implemented here matches either the X or Y
+       * tiling layouts supported by the hardware depending on the tiling
+       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
+       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
+       * explanation of the hardware tiling format.
+       */
+      fs_reg
+      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
+                               const fs_reg &coord, unsigned dims)
+      {
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
+         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
+         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
+         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         /* Shift the coordinates by the fixed surface offset.  It may be
+          * non-zero if the image is a single slice of a higher-dimensional
+          * surface, or if a non-zero mipmap level of the surface is bound to
+          * the pipeline.  The offset needs to be applied here rather than at
+          * surface state set-up time because the desired slice-level may
+          * start mid-tile, so simply shifting the surface base address
+          * wouldn't give a well-formed tiled surface in the general case.
+          */
+         for (unsigned c = 0; c < 2; ++c)
+            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
+                    (c < dims ?
+                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
+                     fs_reg(0)));
+
+         /* The layout of 3-D textures in memory is sort-of like a tiling
+          * format.  At each miplevel, the slices are arranged in rows of
+          * 2^level slices per row.  The slice row is stored in tmp.y and
+          * the slice within the row is stored in tmp.x.
+          *
+          * The layout of 2-D array textures and cubemaps is much simpler:
+          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+          * stored in memory as an array of slices, each one being a 2-D
+          * arrangement of miplevels, or as a 2D arrangement of miplevels,
+          * each one being an array of slices.  In either case the separation
+          * between slices of the same LOD is equal to the qpitch value
+          * provided as stride.w.
+          *
+          * This code can be made to handle either 2D arrays and 3D textures
+          * by passing in the miplevel as tile.z for 3-D textures and 0 in
+          * tile.z for 2-D array textures.
+          *
+          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+          * of the hardware 3D texture and 2D array layouts.
+          */
+         if (dims > 2) {
+            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+             * index.
+             */
+            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), fs_reg(0),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
+            bld.SHR(offset(tmp, bld, 1),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
+                    offset(tile, bld, 2));
+
+            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+             * slice offset.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               bld.MUL(offset(tmp, bld, c),
+                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
+               bld.ADD(offset(addr, bld, c),
+                       offset(addr, bld, c), offset(tmp, bld, c));
+            }
+         }
+
+         if (dims > 1) {
+            /* Calculate the major/minor x and y indices.  In order to
+             * accommodate both X and Y tiling, the Y-major tiling format is
+             * treated as being a bunch of narrow X-tiles placed next to each
+             * other.  This means that the tile width for Y-tiling is actually
+             * the width of one sub-column of the Y-major tile where each 4K
+             * tile has 8 512B sub-columns.
+             *
+             * The major Y value is the row of tiles in which the pixel lives.
+             * The major X value is the tile sub-column in which the pixel
+             * lives; for X tiling, this is the same as the tile column, for Y
+             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
+             * are the position within the sub-column.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               /* Calculate the minor x and y indices. */
+               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
+                       fs_reg(0), offset(addr, bld, c));
+
+               /* Calculate the major x and y indices. */
+               bld.SHR(offset(major, bld, c),
+                       offset(addr, bld, c), offset(tile, bld, c));
+            }
+
+            /* Calculate the texel index from the start of the tile row and
+             * the vertical coordinate of the row.
+             * Equivalent to:
+             *   tmp.x = (major.x << tile.y << tile.x) +
+             *           (minor.y << tile.x) + minor.x
+             *   tmp.y = major.y << tile.y
+             */
+            bld.SHL(tmp, major, offset(tile, bld, 1));
+            bld.ADD(tmp, tmp, offset(minor, bld, 1));
+            bld.SHL(tmp, tmp, offset(tile, bld, 0));
+            bld.ADD(tmp, tmp, minor);
+            bld.SHL(offset(tmp, bld, 1),
+                    offset(major, bld, 1), offset(tile, bld, 1));
+
+            /* Add it to the start of the tile row. */
+            bld.MUL(offset(tmp, bld, 1),
+                    offset(tmp, bld, 1), offset(stride, bld, 1));
+            bld.ADD(tmp, tmp, offset(tmp, bld, 1));
+
+            /* Multiply by the Bpp value. */
+            bld.MUL(dst, tmp, stride);
+
+            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+               /* Take into account the two dynamically specified shifts.
+                * Both need are used to implement swizzling of X-tiled
+                * surfaces.  For Y-tiled surfaces only one bit needs to be
+                * XOR-ed with bit 6 of the memory address, so a swz value of
+                * 0xff (actually interpreted as 31 by the hardware) will be
+                * provided to cause the relevant bit of tmp.y to be zero and
+                * turn the first XOR into the identity.  For linear surfaces
+                * or platforms lacking address swizzling both shifts will be
+                * 0xff causing the relevant bits of both tmp.x and .y to be
+                * zero, what effectively disables swizzling.
+                */
+               for (unsigned c = 0; c < 2; ++c)
+                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
+
+               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
+               bld.AND(tmp, tmp, fs_reg(1 << 6));
+               bld.XOR(dst, dst, tmp);
+            }
+
+         } else {
+            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
+             * non-zero even if the image is one-dimensional because a
+             * vertical offset may have been applied above to select a
+             * non-zero slice or level of a higher-dimensional texture.
+             */
+            bld.MUL(offset(addr, bld, 1),
+                    offset(addr, bld, 1), offset(stride, bld, 1));
+            bld.ADD(addr, addr, offset(addr, bld, 1));
+            bld.MUL(dst, addr, stride);
+         }
+
+         return dst;
+      }
+   }
 }

From 86dbd8af40deaa99aedf011e863b908173e63012 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 23 Jul 2015 19:32:08 +0300
Subject: [PATCH 1095/1208] i965/fs: Import code to transform image coordinates
 into surface coordinates.

Accounting for the padding required for 1D arrays in certain cases.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index a0bedf26bc9..4d616c78e78 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -217,6 +217,58 @@ namespace {
    }
 
    namespace image_coordinates {
+      /**
+       * Return the total number of coordinates needed to address a texel of
+       * the surface, which may be more than the sum of \p surf_dims and \p
+       * arr_dims if padding is required.
+       */
+      unsigned
+      num_image_coordinates(const fs_builder &bld,
+                            unsigned surf_dims, unsigned arr_dims,
+                            mesa_format format)
+      {
+         /* HSW in vec4 mode and our software coordinate handling for untyped
+          * reads want the array index to be at the Z component.
+          */
+         const bool array_index_at_z =
+            !image_format_info::has_matching_typed_format(
+               bld.shader->devinfo, format);
+         const unsigned zero_dims =
+            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
+
+         return surf_dims + zero_dims + arr_dims;
+      }
+
+      /**
+       * Transform image coordinates into the form expected by the
+       * implementation.
+       */
+      fs_reg
+      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
+                             unsigned surf_dims, unsigned arr_dims,
+                             mesa_format format)
+      {
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (dims > surf_dims + arr_dims) {
+            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
+            /* The array index is required to be passed in as the Z component,
+             * insert a zero at the Y component to shift it to the right
+             * position.
+             *
+             * FINISHME: Factor out this frequently recurring pattern into a
+             * helper function.
+             */
+            const fs_reg srcs[] = { addr, fs_reg(0), offset(addr, bld, 1) };
+            const fs_reg dst = bld.vgrf(addr.type, dims);
+            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
+            return dst;
+         } else {
+            return addr;
+         }
+      }
+
       /**
        * Calculate the offset in memory of the texel given by \p coord.
        *

From 26ca81ce3029cbd2531f52635258aecae19bf185 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 22 Apr 2015 16:45:28 +0300
Subject: [PATCH 1096/1208] i965/fs: Import image format metadata queries.

Define some utility functions to query the bitfield layout of a given
image format and whether it satisfies a number of more or less
hardware-specific properties.

v2: Drop VEC4 suport.
v3: Add SKL support.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 148 ++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 4d616c78e78..9682ca33189 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -163,6 +163,154 @@ namespace brw {
 }
 
 namespace {
+   namespace image_format_info {
+      /**
+       * Simple 4-tuple of scalars used to pass around per-color component
+       * values.
+       */
+      struct color_u {
+         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
+         {
+         }
+
+         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
+            r(r), g(g), b(b), a(a)
+         {
+         }
+
+         unsigned
+         operator[](unsigned i) const
+         {
+            const unsigned xs[] = { r, g, b, a };
+            return xs[i];
+         }
+
+         unsigned r, g, b, a;
+      };
+
+      /**
+       * Return the per-channel bitfield widths for a given image format.
+       */
+      inline color_u
+      get_bit_widths(mesa_format format)
+      {
+         return color_u(_mesa_get_format_bits(format, GL_RED_BITS),
+                        _mesa_get_format_bits(format, GL_GREEN_BITS),
+                        _mesa_get_format_bits(format, GL_BLUE_BITS),
+                        _mesa_get_format_bits(format, GL_ALPHA_BITS));
+      }
+
+      /**
+       * Return the per-channel bitfield shifts for a given image format.
+       */
+      inline color_u
+      get_bit_shifts(mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         return color_u(0, widths.r, widths.r + widths.g,
+                        widths.r + widths.g + widths.b);
+      }
+
+      /**
+       * Return true if all present components have the same bit width.
+       */
+      inline bool
+      is_homogeneous(mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         return ((widths.g == 0 || widths.g == widths.r) &&
+                 (widths.b == 0 || widths.b == widths.r) &&
+                 (widths.a == 0 || widths.a == widths.r));
+      }
+
+      /**
+       * Return true if the format conversion boils down to a trivial copy.
+       */
+      inline bool
+      is_conversion_trivial(const brw_device_info *devinfo, mesa_format format)
+      {
+         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
+                 format == brw_lower_mesa_image_format(devinfo, format);
+      }
+
+      /**
+       * Return true if the hardware natively supports some format with
+       * compatible bitfield layout, but possibly different data types.
+       */
+      inline bool
+      has_supported_bit_layout(const brw_device_info *devinfo,
+                               mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         const color_u lower_widths = get_bit_widths(
+            brw_lower_mesa_image_format(devinfo, format));
+
+         return (widths.r == lower_widths.r &&
+                 widths.g == lower_widths.g &&
+                 widths.b == lower_widths.b &&
+                 widths.a == lower_widths.a);
+      }
+
+      /**
+       * Return true if we are required to spread individual components over
+       * several components of the format used by the hardware (RG32 and
+       * friends implemented as RGBA16UI).
+       */
+      inline bool
+      has_split_bit_layout(const brw_device_info *devinfo, mesa_format format)
+      {
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+
+         return (_mesa_format_num_components(format) <
+                 _mesa_format_num_components(lower_format));
+      }
+
+      /**
+       * Return true unless we have to fall back to untyped surface access.
+       * Fail!
+       */
+      inline bool
+      has_matching_typed_format(const brw_device_info *devinfo,
+                                mesa_format format)
+      {
+         return (_mesa_get_format_bytes(format) <= 4 ||
+                 (_mesa_get_format_bytes(format) <= 8 &&
+                  (devinfo->gen >= 8 || devinfo->is_haswell)) ||
+                 devinfo->gen >= 9);
+      }
+
+      /**
+       * Return true if the hardware returns garbage in the unused high bits
+       * of each component.  This may happen on IVB because we rely on the
+       * undocumented behavior that typed reads from surfaces of the
+       * unsupported R8 and R16 formats return useful data in their least
+       * significant bits.
+       */
+      inline bool
+      has_undefined_high_bits(const brw_device_info *devinfo,
+                              mesa_format format)
+      {
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+
+         return (devinfo->gen == 7 && !devinfo->is_haswell &&
+                 (lower_format == MESA_FORMAT_R_UINT16 ||
+                  lower_format == MESA_FORMAT_R_UINT8));
+      }
+
+      /**
+       * Return true if the format represents values as signed integers
+       * requiring sign extension when unpacking.
+       */
+      inline bool
+      needs_sign_extension(mesa_format format)
+      {
+         return (_mesa_get_format_datatype(format) == GL_SIGNED_NORMALIZED ||
+                 _mesa_get_format_datatype(format) == GL_INT);
+      }
+   }
+
    namespace image_validity {
       /**
        * Check whether there is an image bound at the given index and write

From 7e8be000101cc6fe3846745b559f2d785430e253 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 30 Jul 2015 15:51:58 +0300
Subject: [PATCH 1097/1208] i965/fs: Import image format conversion primitives.

Define bitfield packing, unpacking and type conversion operations in
terms of which the image format conversion code will be implemented.
These don't directly know about image formats: The packing and
unpacking functions take a 4-tuple of bit shifts and a 4-tuple of bit
widths as arguments, determining the bitfield position of each
component.  Most of the remaining functions perform integer, fixed
point normalized, and floating point type conversions, mapping between
a target type with per-component bit widths given by a parameter and a
matching native representation of the same type.

v2: Drop VEC4 suport.
v3: Rebase.
v4: Fix clamping of negative floats in the unsigned case of
    emit_convert_to_scaled().

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 265 ++++++++++++++++++
 1 file changed, 265 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 9682ca33189..441b2a0fd7f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -584,4 +584,269 @@ namespace {
          return dst;
       }
    }
+
+   namespace image_format_conversion {
+      using image_format_info::color_u;
+
+      namespace {
+         /**
+          * Maximum representable value in an unsigned integer with the given
+          * number of bits.
+          */
+         inline unsigned
+         scale(unsigned n)
+         {
+            return (1 << n) - 1;
+         }
+      }
+
+      /**
+       * Pack the vector \p src in a bitfield given the per-component bit
+       * shifts and widths.  Note that bitfield components are not allowed to
+       * cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_pack(const fs_builder &bld, const fs_reg &src,
+                const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         bool seen[4] = {};
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+               /* Shift each component left to the correct bitfield position. */
+               bld.SHL(tmp, offset(src, bld, c), fs_reg(shifts[c] % 32));
+
+               /* Add everything up. */
+               if (seen[shifts[c] / 32]) {
+                  bld.OR(offset(dst, bld, shifts[c] / 32),
+                         offset(dst, bld, shifts[c] / 32), tmp);
+               } else {
+                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
+                  seen[shifts[c] / 32] = true;
+               }
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Unpack a vector from the bitfield \p src given the per-component bit
+       * shifts and widths.  Note that bitfield components are not allowed to
+       * cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_unpack(const fs_builder &bld, const fs_reg &src,
+                  const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(src.type, 4);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               /* Shift left to discard the most significant bits. */
+               bld.SHL(offset(dst, bld, c),
+                       offset(src, bld, shifts[c] / 32),
+                       fs_reg(32 - shifts[c] % 32 - widths[c]));
+
+               /* Shift back to the least significant bits using an arithmetic
+                * shift to get sign extension on signed types.
+                */
+               bld.ASR(offset(dst, bld, c),
+                       offset(dst, bld, c), fs_reg(32 - widths[c]));
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert an integer vector into another integer vector of the
+       * specified bit widths, properly handling overflow.
+       */
+      fs_reg
+      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths, bool is_signed)
+      {
+         const unsigned s = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         assert(src.type == dst.type);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               /* Clamp to the maximum value. */
+               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
+                               fs_reg((int)scale(widths[c] - s)),
+                               BRW_CONDITIONAL_L);
+
+               /* Clamp to the minimum value. */
+               if (is_signed)
+                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
+                                  fs_reg(-(int)scale(widths[c] - s) - 1),
+                                  BRW_CONDITIONAL_G);
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert a normalized fixed-point vector of the specified signedness
+       * and bit widths into a floating point vector.
+       */
+      fs_reg
+      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
+                               const color_u &widths, bool is_signed)
+      {
+         const unsigned s = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               /* Convert to float. */
+               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
+
+               /* Divide by the normalization constants. */
+               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
+                       fs_reg(1.0f / scale(widths[c] - s)));
+
+               /* Clamp to the minimum value. */
+               if (is_signed)
+                  bld.emit_minmax(offset(dst, bld, c),
+                                  offset(dst, bld, c), fs_reg(-1.0f),
+                                  BRW_CONDITIONAL_G);
+            }
+         }
+         return dst;
+      }
+
+      /**
+       * Convert a floating-point vector into a normalized fixed-point vector
+       * of the specified signedness and bit widths.
+       */
+      fs_reg
+      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
+                             const color_u &widths, bool is_signed)
+      {
+         const unsigned s = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               /* Clamp the normalized floating-point argument. */
+               if (is_signed) {
+                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
+                                  fs_reg(-1.0f), BRW_CONDITIONAL_G);
+
+                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
+                                  fs_reg(1.0f), BRW_CONDITIONAL_L);
+               } else {
+                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
+                                             offset(src, bld, c)));
+               }
+
+               /* Multiply by the normalization constants. */
+               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
+                       fs_reg((float)scale(widths[c] - s)));
+
+               /* Convert to integer. */
+               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
+               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert a floating point vector of the specified bit widths into a
+       * 32-bit floating point vector.
+       */
+      fs_reg
+      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
+
+               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
+                * This works because they have a 5-bit exponent just like the
+                * 16-bit floating point format, and they have no sign bit.
+                */
+               if (widths[c] < 16)
+                  bld.SHL(offset(dst, bld, c),
+                          offset(dst, bld, c), fs_reg(15 - widths[c]));
+
+               /* Convert to 32-bit floating point. */
+               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
+            }
+         }
+
+         return fdst;
+      }
+
+      /**
+       * Convert a vector into a floating point vector of the specified bit
+       * widths.
+       */
+      fs_reg
+      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
+                            const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
+
+               /* Clamp to the minimum value. */
+               if (widths[c] < 16)
+                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
+                                  fs_reg(0.0f), BRW_CONDITIONAL_G);
+
+               /* Convert to 16-bit floating-point. */
+               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
+
+               /* Discard the least significant bits to get floating point
+                * numbers of the requested width.  This works because the
+                * 10-bit and 11-bit floating point formats have a 5-bit
+                * exponent just like the 16-bit format, and they have no sign
+                * bit.
+                */
+               if (widths[c] < 16)
+                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
+                          fs_reg(15 - widths[c]));
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Fill missing components of a vector with 0, 0, 0, 1.
+       */
+      fs_reg
+      emit_pad(const fs_builder &bld, const fs_reg &src,
+               const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(src.type, 4);
+         const unsigned pad[] = { 0, 0, 0, 1 };
+
+         for (unsigned c = 0; c < 4; ++c)
+            bld.MOV(offset(dst, bld, c),
+                    widths[c] ? offset(src, bld, c) : fs_reg(pad[c]));
+
+         return dst;
+      }
+   }
 }

From caae52561dabb2d20f2369c547e660d078974285 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 30 Jul 2015 15:46:40 +0300
Subject: [PATCH 1098/1208] i965/fs: Implement image load, store and atomic.

v2: Drop VEC4 suport.
v3: Rebase.
v4: Move array coordinate workaround into the surface builder.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp       | 244 ++++++++++++++++++
 .../drivers/dri/i965/brw_fs_surface_builder.h |  20 ++
 2 files changed, 264 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 441b2a0fd7f..50e0acd05f5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -850,3 +850,247 @@ namespace {
       }
    }
 }
+
+namespace brw {
+   namespace image_access {
+      /**
+       * Load a vector from a surface of the given format and dimensionality
+       * at the given coordinates.  \p surf_dims and \p arr_dims give the
+       * number of non-array and array coordinates of the image respectively.
+       */
+      fs_reg
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      mesa_format format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+         fs_reg tmp;
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (has_matching_typed_format(devinfo, format)) {
+            /* Hopefully we get here most of the time... */
+            tmp = emit_typed_read(bld, image, saddr, dims,
+                                  _mesa_format_num_components(lower_format));
+         } else {
+            /* Untyped surface reads return 32 bits of the surface per
+             * component, without any sort of unpacking or type conversion,
+             */
+            const unsigned size = _mesa_get_format_bytes(format) / 4;
+
+            /* they don't properly handle out of bounds access, so we have to
+             * check manually if the coordinates are valid and predicate the
+             * surface read on the result,
+             */
+            const brw_predicate pred =
+               emit_bounds_check(bld, image, saddr, dims);
+
+            /* and they don't know about surface coordinates, we need to
+             * convert them to a raw memory offset.
+             */
+            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
+
+            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
+
+            /* An out of bounds surface access should give zero as result. */
+            for (unsigned c = 0; c < 4; ++c)
+               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
+                                           offset(tmp, bld, c), fs_reg(0)));
+         }
+
+         /* Set the register type to D instead of UD if the data type is
+          * represented as a signed integer in memory so that sign extension
+          * is handled correctly by unpack.
+          */
+         if (needs_sign_extension(format))
+            tmp = retype(tmp, BRW_REGISTER_TYPE_D);
+
+         if (!has_supported_bit_layout(devinfo, format)) {
+            /* Unpack individual vector components from the bitfield if the
+             * hardware is unable to do it for us.
+             */
+            if (has_split_bit_layout(devinfo, format))
+               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
+                               get_bit_widths(lower_format));
+            else
+               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
+                                 get_bit_widths(format));
+
+         } else if ((needs_sign_extension(format) &&
+                     !is_conversion_trivial(devinfo, format)) ||
+                    has_undefined_high_bits(devinfo, format)) {
+            /* Perform a trivial unpack even though the bit layout matches in
+             * order to get the most significant bits of each component
+             * initialized properly.
+             */
+            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
+                              get_bit_widths(format));
+         }
+
+         if (!_mesa_is_format_integer(format)) {
+            if (is_conversion_trivial(devinfo, format)) {
+               /* Just need to cast the vector to the target type. */
+               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
+            } else {
+               /* Do the right sort of type conversion to float. */
+               if (_mesa_get_format_datatype(format) == GL_FLOAT)
+                  tmp = emit_convert_from_float(
+                     bld, tmp, get_bit_widths(format));
+               else
+                  tmp = emit_convert_from_scaled(
+                     bld, tmp, get_bit_widths(format),
+                     _mesa_is_format_signed(format));
+            }
+         }
+
+         /* Initialize missing components of the result. */
+         return emit_pad(bld, tmp, get_bit_widths(format));
+      }
+
+      /**
+       * Store a vector in a surface of the given format and dimensionality at
+       * the given coordinates.  \p surf_dims and \p arr_dims give the number
+       * of non-array and array coordinates of the image respectively.
+       */
+      void
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       mesa_format format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const brw_device_info *devinfo = bld.shader->devinfo;
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (format == MESA_FORMAT_NONE) {
+            /* We don't know what the format is, but that's fine because it
+             * implies write-only access, and typed surface writes are always
+             * able to take care of type conversion and packing for us.
+             */
+            emit_typed_write(bld, image, saddr, src, dims, 4);
+
+         } else {
+            const mesa_format lower_format =
+               brw_lower_mesa_image_format(devinfo, format);
+            fs_reg tmp = src;
+
+            if (!is_conversion_trivial(devinfo, format)) {
+               /* Do the right sort of type conversion. */
+               if (_mesa_get_format_datatype(format) == GL_FLOAT)
+                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
+
+               else if (_mesa_is_format_integer(format))
+                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
+                                                _mesa_is_format_signed(format));
+
+               else
+                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
+                                               _mesa_is_format_signed(format));
+            }
+
+            /* We're down to bit manipulation at this point. */
+            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
+
+            if (!has_supported_bit_layout(devinfo, format)) {
+               /* Pack the vector components into a bitfield if the hardware
+                * is unable to do it for us.
+                */
+               if (has_split_bit_layout(devinfo, format))
+                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
+                                    get_bit_widths(lower_format));
+
+               else
+                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
+                                  get_bit_widths(format));
+            }
+
+            if (has_matching_typed_format(devinfo, format)) {
+               /* Hopefully we get here most of the time... */
+               emit_typed_write(bld, image, saddr, tmp, dims,
+                                _mesa_format_num_components(lower_format));
+
+            } else {
+               /* Untyped surface writes store 32 bits of the surface per
+                * component, without any sort of packing or type conversion,
+                */
+               const unsigned size = _mesa_get_format_bytes(format) / 4;
+
+               /* they don't properly handle out of bounds access, so we have
+                * to check manually if the coordinates are valid and predicate
+                * the surface write on the result,
+                */
+               const brw_predicate pred =
+                  emit_bounds_check(bld, image, saddr, dims);
+
+               /* and, phew, they don't know about surface coordinates, we
+                * need to convert them to a raw memory offset.
+                */
+               const fs_reg laddr = emit_address_calculation(
+                  bld, image, saddr, dims);
+
+               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
+            }
+         }
+      }
+
+      /**
+       * Perform an atomic read-modify-write operation in a surface of the
+       * given dimensionality at the given coordinates.  \p surf_dims and \p
+       * arr_dims give the number of non-array and array coordinates of the
+       * image respectively.  Main building block of the imageAtomic GLSL
+       * built-ins.
+       */
+      fs_reg
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op)
+      {
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         /* Avoid performing an atomic operation on an unbound surface. */
+         const brw_predicate pred = emit_surface_check(bld, image);
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
+                                  MESA_FORMAT_R_UINT32);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims,
+                                  MESA_FORMAT_R_UINT32);
+
+         /* Thankfully we can do without untyped atomics here. */
+         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
+                                              dims, rsize, op, pred);
+
+         /* An unbound surface access should give zero as result. */
+         if (rsize)
+            set_predicate(pred, bld.SEL(tmp, tmp, fs_reg(0)));
+
+         return tmp;
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
index 264314b7724..a3dd839955b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
@@ -65,5 +65,25 @@ namespace brw {
                         unsigned dims, unsigned rsize, unsigned op,
                         brw_predicate pred = BRW_PREDICATE_NONE);
    }
+
+   namespace image_access {
+      fs_reg
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      mesa_format format);
+
+      void
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       mesa_format format);
+      fs_reg
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op);
+   }
 }
 #endif

From 84431c1f1d343c85f3b7fa265293a1d245ba9cf3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 5 May 2015 21:05:45 +0300
Subject: [PATCH 1099/1208] i965: Teach type_size() about the size of an image
 uniform.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp           | 1 +
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 6fe900aa63f..82cb499a326 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -484,6 +484,7 @@ fs_visitor::type_size(const struct glsl_type *type)
    case GLSL_TYPE_SUBROUTINE:
       return 1;
    case GLSL_TYPE_IMAGE:
+      return BRW_IMAGE_PARAM_SIZE;
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 34dd7159631..9062bcc444f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -638,6 +638,7 @@ vec4_visitor::type_size(const struct glsl_type *type)
    case GLSL_TYPE_ATOMIC_UINT:
       return 0;
    case GLSL_TYPE_IMAGE:
+      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_ERROR:

From 4af27145fe2fec6586ce95e80a76cdcbfe933db1 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Tue, 5 May 2015 21:07:15 +0300
Subject: [PATCH 1100/1208] i965: Implement logic to set up and upload an image
 uniform.

v2: Move the image_params array back to brw_stage_prog_data.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 31 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.h   |  1 +
 2 files changed, 32 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index bccf8d6d8d2..2a1e469fa07 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -1418,3 +1418,34 @@ backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_
 
    /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */
 }
+
+void
+backend_shader::setup_image_uniform_values(const gl_uniform_storage *storage)
+{
+   const unsigned stage = _mesa_program_enum_to_shader_stage(prog->Target);
+
+   for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) {
+      const unsigned image_idx = storage->image[stage].index + i;
+      const brw_image_param *param = &stage_prog_data->image_param[image_idx];
+
+      /* Upload the brw_image_param structure.  The order is expected to match
+       * the BRW_IMAGE_PARAM_*_OFFSET defines.
+       */
+      setup_vector_uniform_values(
+         (const gl_constant_value *)&param->surface_idx, 1);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->offset, 2);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->size, 3);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->stride, 4);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->tiling, 3);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->swizzling, 2);
+
+      brw_mark_surface_used(
+         stage_prog_data,
+         stage_prog_data->binding_table.image_start + image_idx);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 925072f1349..2cc97f24972 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -272,6 +272,7 @@ public:
 
    virtual void setup_vector_uniform_values(const gl_constant_value *values,
                                             unsigned n) = 0;
+   void setup_image_uniform_values(const gl_uniform_storage *storage);
 };
 
 uint32_t brw_texture_offset(int *offsets, unsigned num_components);

From 912ef52c29fdc373889594b963cc93c89fa9e3f7 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sun, 28 Jun 2015 21:16:31 +0300
Subject: [PATCH 1101/1208] i965/fs: Handle image uniforms in NIR programs.

v2: Move the image_params array back to brw_stage_prog_data.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h       |  1 +
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 51 ++++++++++++++++++++----
 2 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f7f01947d32..975183e990d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -264,6 +264,7 @@ public:
                       nir_jump_instr *instr);
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
+   fs_reg get_nir_image_deref(const nir_deref_var *deref);
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index aa6064e5e7a..943c66ee8af 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -237,17 +237,26 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
          continue;
       }
 
-      unsigned slots = storage->type->component_slots();
-      if (storage->array_elements)
-         slots *= storage->array_elements;
+      if (storage->type->is_image()) {
+         /* Images don't get a valid location assigned by nir_lower_io()
+          * because their size is driver-specific, so we need to allocate
+          * space for them here at the end of the parameter array.
+          */
+         var->data.driver_location = uniforms;
+         param_size[uniforms] =
+            BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
 
-      for (unsigned i = 0; i < slots; i++) {
-         stage_prog_data->param[index++] = &storage->storage[i];
+         setup_image_uniform_values(storage);
+      } else {
+         unsigned slots = storage->type->component_slots();
+         if (storage->array_elements)
+            slots *= storage->array_elements;
+
+         for (unsigned i = 0; i < slots; i++) {
+            stage_prog_data->param[index++] = &storage->storage[i];
+         }
       }
    }
-
-   /* Make sure we actually initialized the right amount of stuff here. */
-   assert(var->data.driver_location + var->type->component_slots() == index);
 }
 
 void
@@ -1149,6 +1158,32 @@ fs_visitor::get_nir_dest(nir_dest dest)
                              dest.reg.indirect);
 }
 
+fs_reg
+fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
+{
+   fs_reg image(UNIFORM, deref->var->data.driver_location,
+                BRW_REGISTER_TYPE_UD);
+
+   if (deref->deref.child) {
+      const nir_deref_array *deref_array =
+         nir_deref_as_array(deref->deref.child);
+      assert(deref->deref.child->deref_type == nir_deref_type_array &&
+             deref_array->deref.child == NULL);
+
+      image = offset(image, bld,
+                     deref_array->base_offset * BRW_IMAGE_PARAM_SIZE);
+
+      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+         fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+         bld.MUL(*tmp, get_nir_src(deref_array->indirect),
+                 fs_reg(BRW_IMAGE_PARAM_SIZE));
+         image.reladdr = tmp;
+      }
+   }
+
+   return image;
+}
+
 void
 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
                          unsigned wr_mask)

From a47ae8de2cf30fbe45318a18a2ea032f30ab7d10 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 16:26:52 +0300
Subject: [PATCH 1102/1208] i965/fs: Translate image load, store and atomic NIR
 intrinsics.

v2: Move array coordinate workaround into the surface builder.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 106 +++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 943c66ee8af..9d691f7eb9f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -24,6 +24,7 @@
 #include "glsl/ir.h"
 #include "glsl/ir_optimization.h"
 #include "glsl/nir/glsl_to_nir.h"
+#include "main/shaderimage.h"
 #include "program/prog_to_nir.h"
 #include "brw_fs.h"
 #include "brw_fs_surface_builder.h"
@@ -1202,6 +1203,55 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
    }
 }
 
+/**
+ * Get the matching channel register datatype for an image intrinsic of the
+ * specified GLSL image type.
+ */
+static brw_reg_type
+get_image_base_type(const glsl_type *type)
+{
+   switch ((glsl_base_type)type->sampler_type) {
+   case GLSL_TYPE_UINT:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_INT:
+      return BRW_REGISTER_TYPE_D;
+   case GLSL_TYPE_FLOAT:
+      return BRW_REGISTER_TYPE_F;
+   default:
+      unreachable("Not reached.");
+   }
+}
+
+/**
+ * Get the appropriate atomic op for an image atomic intrinsic.
+ */
+static unsigned
+get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
+{
+   switch (op) {
+   case nir_intrinsic_image_atomic_add:
+      return BRW_AOP_ADD;
+   case nir_intrinsic_image_atomic_min:
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMIN : BRW_AOP_UMIN);
+   case nir_intrinsic_image_atomic_max:
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMAX : BRW_AOP_UMAX);
+   case nir_intrinsic_image_atomic_and:
+      return BRW_AOP_AND;
+   case nir_intrinsic_image_atomic_or:
+      return BRW_AOP_OR;
+   case nir_intrinsic_image_atomic_xor:
+      return BRW_AOP_XOR;
+   case nir_intrinsic_image_atomic_exchange:
+      return BRW_AOP_MOV;
+   case nir_intrinsic_image_atomic_comp_swap:
+      return BRW_AOP_CMPWR;
+   default:
+      unreachable("Not reachable.");
+   }
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -1276,6 +1326,62 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_min:
+   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap: {
+      using namespace image_access;
+
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+      const brw_reg_type base_type = get_image_base_type(type);
+
+      /* Get some metadata from the image intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      const unsigned arr_dims = type->sampler_array ? 1 : 0;
+      const unsigned surf_dims = type->coordinate_components() - arr_dims;
+      const mesa_format format =
+         (var->data.image.write_only ? MESA_FORMAT_NONE :
+          _mesa_get_shader_image_format(var->data.image.format));
+
+      /* Get the arguments of the image intrinsic. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg addr = retype(get_nir_src(instr->src[0]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg src0 = (info->num_srcs >= 3 ?
+                           retype(get_nir_src(instr->src[2]), base_type) :
+                           fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 4 ?
+                           retype(get_nir_src(instr->src[3]), base_type) :
+                           fs_reg());
+      fs_reg tmp;
+
+      /* Emit an image load, store or atomic op. */
+      if (instr->intrinsic == nir_intrinsic_image_load)
+         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+
+      else if (instr->intrinsic == nir_intrinsic_image_store)
+         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
+
+      else
+         tmp = emit_image_atomic(bld, image, addr, src0, src1,
+                                 surf_dims, arr_dims, info->dest_components,
+                                 get_image_atomic_op(instr->intrinsic, type));
+
+      /* Assign the result. */
+      for (unsigned c = 0; c < info->dest_components; ++c)
+         bld.MOV(offset(retype(dest, base_type), bld, c),
+                 offset(tmp, bld, c));
+      break;
+   }
+
    case nir_intrinsic_memory_barrier: {
       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
       bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)

From 13a04abc277089275217dce119e18acf4d4ce52d Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Mon, 27 Jul 2015 14:33:06 +0300
Subject: [PATCH 1103/1208] i965/fs: Clamp image array indices to the array
 bounds on IVB.

This fixes the spec@arb_shader_image_load_store@invalid index bounds
piglit tests on IVB, which were causing a GPU hang and then a crash
due to the invalid binding table index result of the array index
calculation.  Other generations seem to behave sensibly when an
invalid surface is provided so it doesn't look like we need to care.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 25 ++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 9d691f7eb9f..ce4153df70e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1170,14 +1170,31 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
          nir_deref_as_array(deref->deref.child);
       assert(deref->deref.child->deref_type == nir_deref_type_array &&
              deref_array->deref.child == NULL);
+      const unsigned size = glsl_get_length(deref->var->type);
+      const unsigned base = MIN2(deref_array->base_offset, size - 1);
 
-      image = offset(image, bld,
-                     deref_array->base_offset * BRW_IMAGE_PARAM_SIZE);
+      image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE);
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type));
-         bld.MUL(*tmp, get_nir_src(deref_array->indirect),
-                 fs_reg(BRW_IMAGE_PARAM_SIZE));
+
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* IVB hangs when trying to access an invalid surface index with
+             * the dataport.  According to the spec "if the index used to
+             * select an individual element is negative or greater than or
+             * equal to the size of the array, the results of the operation
+             * are undefined but may not lead to termination" -- which is one
+             * of the possible outcomes of the hang.  Clamp the index to
+             * prevent access outside of the array bounds.
+             */
+            bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect),
+                                         BRW_REGISTER_TYPE_UD),
+                            fs_reg(size - base - 1), BRW_CONDITIONAL_L);
+         } else {
+            bld.MOV(*tmp, get_nir_src(deref_array->indirect));
+         }
+
+         bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE));
          image.reladdr = tmp;
       }
    }

From d03c65793a5ee31f1138cbd0fba6fac6cd942428 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 7 May 2015 18:56:01 +0300
Subject: [PATCH 1104/1208] i965: Expose ARB_shader_image_load_store.

Reviewed-by: Paul Berry <stereotype441@gmail.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/intel_extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 4a0ffffe59e..0da528b22a1 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -325,6 +325,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_framebuffer_no_attachments = true;
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
+      ctx->Extensions.ARB_shader_image_load_store = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
       ctx->Extensions.ARB_texture_view = true;
 

From 3c04a90e91a64a4a09d77c76c6ddcaca949e9b0e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 1 May 2015 17:00:02 +0300
Subject: [PATCH 1105/1208] docs: Mark ARB_shader_image_load_store as done on
 i965.

---
 docs/GL3.txt              | 2 +-
 docs/relnotes/11.0.0.html | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 8124383b9ea..54c0c5aa6a8 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -139,7 +139,7 @@ GL 4.2, GLSL 4.20:
   GL_ARB_texture_storage                               DONE (all drivers)
   GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                       in progress (curro)
+  GL_ARB_shader_image_load_store                       DONE (i965)
   GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_420pack                      DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_packing                      DONE (all drivers)
diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
index 1b633315c15..2d80198d420 100644
--- a/docs/relnotes/11.0.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -52,6 +52,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
 <li>GL_ARB_gpu_shader5 on radeonsi</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
+<li>GL_ARB_shader_image_load_store on i965</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_shader_subroutine on core profile all drivers</li>
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>

From 3206d4ed44e761186fee3c679801e57f8ce923cb Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Mon, 10 Aug 2015 00:42:32 +0300
Subject: [PATCH 1106/1208] gallium/radeon: use helper functions to mark atoms
 dirty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is analogous to r300_mark_atom_dirty() used by r300, and will
be used by later patches. For common radeon code, appropriate helper
is called through a function pointer.

No functional changes.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/evergreen_compute.c  |  2 +-
 src/gallium/drivers/r600/evergreen_state.c    | 28 +++----
 src/gallium/drivers/r600/r600_blit.c          | 16 ++--
 src/gallium/drivers/r600/r600_hw_context.c    | 54 ++++++-------
 src/gallium/drivers/r600/r600_pipe.c          |  1 +
 src/gallium/drivers/r600/r600_pipe.h          | 25 +++++--
 src/gallium/drivers/r600/r600_state.c         | 28 +++----
 src/gallium/drivers/r600/r600_state_common.c  | 75 ++++++++++---------
 src/gallium/drivers/radeon/r600_pipe_common.h |  3 +
 src/gallium/drivers/radeon/r600_streamout.c   | 14 ++--
 src/gallium/drivers/radeon/r600_texture.c     |  2 +-
 src/gallium/drivers/radeonsi/si_blit.c        | 14 ++--
 src/gallium/drivers/radeonsi/si_descriptors.c |  6 +-
 src/gallium/drivers/radeonsi/si_hw_context.c  | 12 +--
 src/gallium/drivers/radeonsi/si_pipe.c        |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h        | 14 ++++
 src/gallium/drivers/radeonsi/si_state.c       | 18 ++---
 src/gallium/drivers/radeonsi/si_state_draw.c  |  2 +-
 .../drivers/radeonsi/si_state_shaders.c       | 12 +--
 19 files changed, 182 insertions(+), 145 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index d89e3de9cc6..51003309654 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -163,7 +163,7 @@ static void evergreen_cs_set_vertex_buffer(
 	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 	state->enabled_mask |= 1 << vb_index;
 	state->dirty_mask |= 1 << vb_index;
-	state->atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &state->atom);
 }
 
 static void evergreen_cs_set_constant_buffer(
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 13ecc46959f..bb5338304ea 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -896,7 +896,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx,
 
 	for (i = start_slot; i < start_slot + num_scissors; i++) {
 		rctx->scissor[i].scissor = state[i - start_slot];
-		rctx->scissor[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom);
 	}
 }
 
@@ -1346,11 +1346,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (rctx->alphatest_state.bypass != alphatest_bypass) {
 			rctx->alphatest_state.bypass = alphatest_bypass;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 		if (rctx->alphatest_state.cb0_export_16bpc != export_16bpc) {
 			rctx->alphatest_state.cb0_export_16bpc = export_16bpc;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 	}
 
@@ -1366,28 +1366,28 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (state->zsbuf->format != rctx->poly_offset_state.zs_format) {
 			rctx->poly_offset_state.zs_format = state->zsbuf->format;
-			rctx->poly_offset_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 		}
 
 		if (rctx->db_state.rsurf != surf) {
 			rctx->db_state.rsurf = surf;
-			rctx->db_state.atom.dirty = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	} else if (rctx->db_state.rsurf) {
 		rctx->db_state.rsurf = NULL;
-		rctx->db_state.atom.dirty = true;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 	if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) {
 		rctx->cb_misc_state.nr_cbufs = state->nr_cbufs;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) {
 		rctx->alphatest_state.bypass = false;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 
 	log_samples = util_logbase2(rctx->framebuffer.nr_samples);
@@ -1396,7 +1396,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	     rctx->b.family == CHIP_RV770) &&
 	    rctx->db_misc_state.log_samples != log_samples) {
 		rctx->db_misc_state.log_samples = log_samples;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 
@@ -1424,7 +1424,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 		rctx->framebuffer.atom.num_dw += 4;
 	}
 
-	rctx->framebuffer.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
 }
@@ -1438,7 +1438,7 @@ static void evergreen_set_min_samples(struct pipe_context *ctx, unsigned min_sam
 
 	rctx->ps_iter_samples = min_samples;
 	if (rctx->framebuffer.nr_samples > 1) {
-		rctx->framebuffer.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 	}
 }
 
@@ -3180,7 +3180,7 @@ void evergreen_update_db_shader_control(struct r600_context * rctx)
 
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 8664e036338..b0002c3b50f 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -145,7 +145,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 	rctx->db_misc_state.copy_depth = util_format_has_depth(desc);
 	rctx->db_misc_state.copy_stencil = util_format_has_stencil(desc);
 	rctx->db_misc_state.copy_sample = first_sample;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 
 	for (level = first_level; level <= last_level; level++) {
 		if (!staging && !(texture->dirty_level_mask & (1 << level)))
@@ -162,7 +162,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 
 				if (sample != rctx->db_misc_state.copy_sample) {
 					rctx->db_misc_state.copy_sample = sample;
-					rctx->db_misc_state.atom.dirty = true;
+					r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 				}
 
 				surf_tmpl.format = texture->resource.b.b.format;
@@ -197,7 +197,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 
 	/* reenable compression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_through_cb = false;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 }
 
 static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
@@ -210,7 +210,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
 
 	/* Enable decompression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_in_place = true;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 
 	surf_tmpl.format = texture->resource.b.b.format;
 
@@ -248,7 +248,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
 
 	/* Disable decompression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_in_place = false;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 }
 
 void r600_decompress_depth_textures(struct r600_context *rctx,
@@ -437,10 +437,10 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
                    fb->zsbuf->u.tex.last_layer == util_max_layer(&rtex->resource.b.b, level)) {
 			if (rtex->depth_clear_value != depth) {
 				rtex->depth_clear_value = depth;
-				rctx->db_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
 			}
 			rctx->db_misc_state.htile_clear = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	}
 
@@ -453,7 +453,7 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	/* disable fast clear */
 	if (rctx->db_misc_state.htile_clear) {
 		rctx->db_misc_state.htile_clear = false;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index ad11e760781..d67fdfd4106 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -296,43 +296,43 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd);
 
 	/* Re-emit states. */
-	ctx->alphatest_state.atom.dirty = true;
-	ctx->blend_color.atom.dirty = true;
-	ctx->cb_misc_state.atom.dirty = true;
-	ctx->clip_misc_state.atom.dirty = true;
-	ctx->clip_state.atom.dirty = true;
-	ctx->db_misc_state.atom.dirty = true;
-	ctx->db_state.atom.dirty = true;
-	ctx->framebuffer.atom.dirty = true;
-	ctx->pixel_shader.atom.dirty = true;
-	ctx->poly_offset_state.atom.dirty = true;
-	ctx->vgt_state.atom.dirty = true;
-	ctx->sample_mask.atom.dirty = true;
+	r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->blend_color.atom);
+	r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->clip_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->db_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
+	r600_mark_atom_dirty(ctx, &ctx->pixel_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
 	for (i = 0; i < R600_MAX_VIEWPORTS; i++) {
-		ctx->scissor[i].atom.dirty = true;
-		ctx->viewport[i].atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->scissor[i].atom);
+		r600_mark_atom_dirty(ctx, &ctx->viewport[i].atom);
 	}
-	ctx->config_state.atom.dirty = true;
-	ctx->stencil_ref.atom.dirty = true;
-	ctx->vertex_fetch_shader.atom.dirty = true;
-	ctx->export_shader.atom.dirty = true;
-	ctx->shader_stages.atom.dirty = true;
+	r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
+	r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->export_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom);
 	if (ctx->gs_shader) {
-		ctx->geometry_shader.atom.dirty = true;
-		ctx->gs_rings.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->geometry_shader.atom);
+		r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom);
 	}
-	ctx->vertex_shader.atom.dirty = true;
-	ctx->b.streamout.enable_atom.dirty = true;
+	r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 
 	if (ctx->blend_state.cso)
-		ctx->blend_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
 	if (ctx->dsa_state.cso)
-		ctx->dsa_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom);
 	if (ctx->rasterizer_state.cso)
-		ctx->rasterizer_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom);
 
 	if (ctx->b.chip_class <= R700) {
-		ctx->seamless_cube_map.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom);
 	}
 
 	ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index f02014e17f0..c4f5c742b9a 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -120,6 +120,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 	rctx->b.b.screen = screen;
 	rctx->b.b.priv = priv;
 	rctx->b.b.destroy = r600_destroy_context;
+	rctx->b.set_atom_dirty = (void *)r600_set_atom_dirty;
 
 	if (!r600_common_context_init(&rctx->b, &rscreen->b))
 		goto fail;
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index bcb9390f331..3653e8fd0af 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -498,29 +498,44 @@ static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
 	cs->cdw += cb->num_dw;
 }
 
+static inline void r600_set_atom_dirty(struct r600_context *rctx,
+				       struct r600_atom *atom,
+				       bool dirty)
+{
+	atom->dirty = dirty;
+}
+
+static inline void r600_mark_atom_dirty(struct r600_context *rctx,
+					struct r600_atom *atom)
+{
+	r600_set_atom_dirty(rctx, atom, true);
+}
+
 void r600_trace_emit(struct r600_context *rctx);
 
 static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
 {
 	atom->emit(&rctx->b, atom);
-	atom->dirty = false;
+	r600_set_atom_dirty(rctx, atom, false);
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
 }
 
-static inline void r600_set_cso_state(struct r600_cso_state *state, void *cso)
+static inline void r600_set_cso_state(struct r600_context *rctx,
+				      struct r600_cso_state *state, void *cso)
 {
 	state->cso = cso;
-	state->atom.dirty = cso != NULL;
+	r600_set_atom_dirty(rctx, &state->atom, cso != NULL);
 }
 
-static inline void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso,
+static inline void r600_set_cso_state_with_cb(struct r600_context *rctx,
+					      struct r600_cso_state *state, void *cso,
 					      struct r600_command_buffer *cb)
 {
 	state->cb = cb;
 	state->atom.num_dw = cb ? cb->num_dw : 0;
-	r600_set_cso_state(state, cso);
+	r600_set_cso_state(rctx, state, cso);
 }
 
 /* compute_memory_pool.c */
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index c28a1e14f0f..7fb03b4cff6 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -802,7 +802,7 @@ static void r600_set_scissor_states(struct pipe_context *ctx,
 		return;
 
 	for (i = start_slot ; i < start_slot + num_scissors; i++) {
-		rctx->scissor[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom);
 	}
 }
 
@@ -1193,7 +1193,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (rctx->alphatest_state.bypass != alphatest_bypass) {
 			rctx->alphatest_state.bypass = alphatest_bypass;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 	}
 
@@ -1209,28 +1209,28 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (state->zsbuf->format != rctx->poly_offset_state.zs_format) {
 			rctx->poly_offset_state.zs_format = state->zsbuf->format;
-			rctx->poly_offset_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 		}
 
 		if (rctx->db_state.rsurf != surf) {
 			rctx->db_state.rsurf = surf;
-			rctx->db_state.atom.dirty = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	} else if (rctx->db_state.rsurf) {
 		rctx->db_state.rsurf = NULL;
-		rctx->db_state.atom.dirty = true;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 	if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) {
 		rctx->cb_misc_state.nr_cbufs = state->nr_cbufs;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) {
 		rctx->alphatest_state.bypass = false;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 
 	/* Calculate the CS size. */
@@ -1250,7 +1250,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 		rctx->framebuffer.atom.num_dw += 2;
 	}
 
-	rctx->framebuffer.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
 }
@@ -1541,9 +1541,9 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 
 	rctx->ps_iter_samples = min_samples;
 	if (rctx->framebuffer.nr_samples > 1) {
-		rctx->rasterizer_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->rasterizer_state.atom);
 		if (rctx->b.chip_class == R600)
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
@@ -2089,7 +2089,7 @@ bool r600_adjust_gprs(struct r600_context *rctx)
 	if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp || rctx->config_state.sq_gpr_resource_mgmt_2 != tmp2) {
 		rctx->config_state.sq_gpr_resource_mgmt_1 = tmp;
 		rctx->config_state.sq_gpr_resource_mgmt_2 = tmp2;
-		rctx->config_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->config_state.atom);
 		rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE;
 	}
 	return true;
@@ -2796,7 +2796,7 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 455e59aef6c..2d654dc5eb7 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -127,11 +127,11 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx,
 	rctx->dual_src_blend = blend->dual_src_blend;
 
 	if (!blend_disable) {
-		r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer);
 		color_control = blend->cb_color_control;
 	} else {
 		/* Blending is disabled. */
-		r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer_no_blend);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer_no_blend);
 		color_control = blend->cb_color_control_no_blend;
 	}
 
@@ -150,7 +150,7 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx,
 		update_cb = true;
 	}
 	if (update_cb) {
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 }
 
@@ -160,7 +160,7 @@ static void r600_bind_blend_state(struct pipe_context *ctx, void *state)
 	struct r600_blend_state *blend = (struct r600_blend_state *)state;
 
 	if (blend == NULL) {
-		r600_set_cso_state_with_cb(&rctx->blend_state, NULL, NULL);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, NULL, NULL);
 		return;
 	}
 
@@ -173,7 +173,7 @@ static void r600_set_blend_color(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->blend_color.state = *state;
-	rctx->blend_color.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->blend_color.atom);
 }
 
 void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
@@ -210,7 +210,7 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 	struct pipe_constant_buffer cb;
 
 	rctx->clip_state.state = *state;
-	rctx->clip_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->clip_state.atom);
 
 	cb.buffer = NULL;
 	cb.user_buffer = state->ucp;
@@ -226,7 +226,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->stencil_ref.state = *state;
-	rctx->stencil_ref.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->stencil_ref.atom);
 }
 
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom)
@@ -274,11 +274,11 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 	struct r600_stencil_ref ref;
 
 	if (state == NULL) {
-		r600_set_cso_state_with_cb(&rctx->dsa_state, NULL, NULL);
+		r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, NULL, NULL);
 		return;
 	}
 
-	r600_set_cso_state_with_cb(&rctx->dsa_state, dsa, &dsa->buffer);
+	r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, dsa, &dsa->buffer);
 
 	ref.ref_value[0] = rctx->stencil_ref.pipe_state.ref_value[0];
 	ref.ref_value[1] = rctx->stencil_ref.pipe_state.ref_value[1];
@@ -293,7 +293,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 			 * we are having lockup on evergreen so do not enable
 			 * hyperz when not writing zbuffer
 			 */
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	}
 
@@ -304,7 +304,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 	    rctx->alphatest_state.sx_alpha_ref != dsa->alpha_ref) {
 		rctx->alphatest_state.sx_alpha_test_control = dsa->sx_alpha_test_control;
 		rctx->alphatest_state.sx_alpha_ref = dsa->alpha_ref;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 }
 
@@ -318,14 +318,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 
 	rctx->rasterizer = rs;
 
-	r600_set_cso_state_with_cb(&rctx->rasterizer_state, rs, &rs->buffer);
+	r600_set_cso_state_with_cb(rctx, &rctx->rasterizer_state, rs, &rs->buffer);
 
 	if (rs->offset_enable &&
 	    (rs->offset_units != rctx->poly_offset_state.offset_units ||
 	     rs->offset_scale != rctx->poly_offset_state.offset_scale)) {
 		rctx->poly_offset_state.offset_units = rs->offset_units;
 		rctx->poly_offset_state.offset_scale = rs->offset_scale;
-		rctx->poly_offset_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 	}
 
 	/* Update clip_misc_state. */
@@ -333,14 +333,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 	    rctx->clip_misc_state.clip_plane_enable != rs->clip_plane_enable) {
 		rctx->clip_misc_state.pa_cl_clip_cntl = rs->pa_cl_clip_cntl;
 		rctx->clip_misc_state.clip_plane_enable = rs->clip_plane_enable;
-		rctx->clip_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 	}
 
 	/* Workaround for a missing scissor enable on r600. */
 	if (rctx->b.chip_class == R600 &&
 	    rs->scissor_enable != rctx->scissor[0].enable) {
 		rctx->scissor[0].enable = rs->scissor_enable;
-		rctx->scissor[0].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[0].atom);
 	}
 
 	/* Re-emit PA_SC_LINE_STIPPLE. */
@@ -378,7 +378,7 @@ void r600_sampler_states_dirty(struct r600_context *rctx,
 		state->atom.num_dw =
 			util_bitcount(state->dirty_mask & state->has_bordercolor_mask) * 11 +
 			util_bitcount(state->dirty_mask & ~state->has_bordercolor_mask) * 5;
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -443,7 +443,7 @@ static void r600_bind_sampler_states(struct pipe_context *pipe,
 		/* change in TA_CNTL_AUX need a pipeline flush */
 		rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE;
 		rctx->seamless_cube_map.enabled = seamless_cube_map;
-		rctx->seamless_cube_map.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->seamless_cube_map.atom);
 	}
 }
 
@@ -483,7 +483,7 @@ static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
-	r600_set_cso_state(&rctx->vertex_fetch_shader, state);
+	r600_set_cso_state(rctx, &rctx->vertex_fetch_shader, state);
 }
 
 static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state)
@@ -513,7 +513,7 @@ void r600_vertex_buffers_dirty(struct r600_context *rctx)
 		rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 		rctx->vertex_buffer_state.atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 12 : 11) *
 					       util_bitcount(rctx->vertex_buffer_state.dirty_mask);
-		rctx->vertex_buffer_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->vertex_buffer_state.atom);
 	}
 }
 
@@ -570,7 +570,7 @@ void r600_sampler_views_dirty(struct r600_context *rctx,
 		rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
 		state->atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 14 : 13) *
 				     util_bitcount(state->dirty_mask);
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -673,7 +673,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 
 	for (i = start_slot; i < start_slot + num_viewports; i++) {
 		rctx->viewport[i].state = state[i - start_slot];
-		rctx->viewport[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->viewport[i].atom);
 	}
 }
 
@@ -913,7 +913,7 @@ void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf
 		rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE;
 		state->atom.num_dw = rctx->b.chip_class >= EVERGREEN ? util_bitcount(state->dirty_mask)*20
 								   : util_bitcount(state->dirty_mask)*19;
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -982,7 +982,7 @@ static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask
 		return;
 
 	rctx->sample_mask.sample_mask = sample_mask;
-	rctx->sample_mask.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->sample_mask.atom);
 }
 
 /*
@@ -1107,27 +1107,28 @@ static void update_shader_atom(struct pipe_context *ctx,
 			       struct r600_shader_state *state,
 			       struct r600_pipe_shader *shader)
 {
+	struct r600_context *rctx = (struct r600_context *)ctx;
+
 	state->shader = shader;
 	if (shader) {
 		state->atom.num_dw = shader->command_buffer.num_dw;
-		state->atom.dirty = true;
 		r600_context_add_resource_size(ctx, (struct pipe_resource *)shader->bo);
 	} else {
 		state->atom.num_dw = 0;
-		state->atom.dirty = false;
 	}
+	r600_mark_atom_dirty(rctx, &state->atom);
 }
 
 static void update_gs_block_state(struct r600_context *rctx, unsigned enable)
 {
 	if (rctx->shader_stages.geom_enable != enable) {
 		rctx->shader_stages.geom_enable = enable;
-		rctx->shader_stages.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 	}
 
 	if (rctx->gs_rings.enable != enable) {
 		rctx->gs_rings.enable = enable;
-		rctx->gs_rings.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->gs_rings.atom);
 
 		if (enable && !rctx->gs_rings.esgs_ring.buffer) {
 			unsigned size = 0x1C000;
@@ -1192,7 +1193,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 		if (!rctx->shader_stages.geom_enable) {
 			rctx->shader_stages.geom_enable = true;
-			rctx->shader_stages.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		}
 
 		/* gs_shader provides GS and VS (copy shader) */
@@ -1206,7 +1207,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->gs_shader->current->gs_copy_shader->pa_cl_vs_out_cntl;
 				rctx->clip_misc_state.clip_dist_write = rctx->gs_shader->current->gs_copy_shader->shader.clip_dist_write;
 				rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space;
-				rctx->clip_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 			}
 			rctx->b.streamout.enabled_stream_buffers_mask = rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask;
 		}
@@ -1224,7 +1225,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 			update_shader_atom(ctx, &rctx->geometry_shader, NULL);
 			update_shader_atom(ctx, &rctx->export_shader, NULL);
 			rctx->shader_stages.geom_enable = false;
-			rctx->shader_stages.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		}
 
 		r600_shader_select(ctx, rctx->vs_shader, &vs_dirty);
@@ -1241,7 +1242,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->vs_shader->current->pa_cl_vs_out_cntl;
 				rctx->clip_misc_state.clip_dist_write = rctx->vs_shader->current->shader.clip_dist_write;
 				rctx->clip_misc_state.clip_disable = rctx->vs_shader->current->shader.vs_position_window_space;
-				rctx->clip_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 			}
 			rctx->b.streamout.enabled_stream_buffers_mask = rctx->vs_shader->current->enabled_stream_buffers_mask;
 		}
@@ -1254,7 +1255,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 		if (rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) {
 			rctx->cb_misc_state.nr_ps_color_outputs = rctx->ps_shader->current->nr_ps_color_outputs;
-			rctx->cb_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 		}
 
 		if (rctx->b.chip_class <= R700) {
@@ -1262,7 +1263,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 			if (rctx->cb_misc_state.multiwrite != multiwrite) {
 				rctx->cb_misc_state.multiwrite = multiwrite;
-				rctx->cb_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 			}
 		}
 
@@ -1276,7 +1277,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				r600_update_ps_state(ctx, rctx->ps_shader->current);
 		}
 
-		rctx->shader_stages.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		update_shader_atom(ctx, &rctx->pixel_shader, rctx->ps_shader->current);
 	}
 
@@ -1455,13 +1456,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		rctx->vgt_state.vgt_multi_prim_ib_reset_en = info.primitive_restart;
 		rctx->vgt_state.vgt_multi_prim_ib_reset_indx = info.restart_index;
 		rctx->vgt_state.vgt_indx_offset = info.index_bias;
-		rctx->vgt_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->vgt_state.atom);
 	}
 
 	/* Workaround for hardware deadlock on certain R600 ASICs: write into a CB register. */
 	if (rctx->b.chip_class == R600) {
 		rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	/* Emit states. */
@@ -2491,7 +2492,7 @@ static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable
 
 	if (rctx->db_misc_state.occlusion_query_enabled != enable) {
 		rctx->db_misc_state.occlusion_query_enabled = enable;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 85ac22f3c7e..765ee3f70ba 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -467,6 +467,9 @@ struct r600_common_context {
 	/* This ensures there is enough space in the command stream. */
 	void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw,
 				  bool include_draw_vbo);
+
+	void (*set_atom_dirty)(struct r600_common_context *ctx,
+			       struct r600_atom *atom, bool dirty);
 };
 
 /* r600_buffer.c */
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index 026fb30f749..0853f636a27 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -104,7 +104,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx)
 		(num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */
 		(rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */
 
-	begin->dirty = true;
+	rctx->set_atom_dirty(rctx, begin, true);
 
 	r600_set_streamout_enable(rctx, true);
 }
@@ -145,7 +145,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
 	if (num_targets) {
 		r600_streamout_buffers_dirty(rctx);
 	} else {
-		rctx->streamout.begin_atom.dirty = false;
+		rctx->set_atom_dirty(rctx, &rctx->streamout.begin_atom, false);
 		r600_set_streamout_enable(rctx, false);
 	}
 }
@@ -353,8 +353,9 @@ static void r600_set_streamout_enable(struct r600_common_context *rctx, bool ena
 					  (rctx->streamout.enabled_mask << 12);
 
 	if ((old_strmout_en != r600_get_strmout_en(rctx)) ||
-            (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask))
-		rctx->streamout.enable_atom.dirty = true;
+            (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask)) {
+		rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true);
+	}
 }
 
 void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
@@ -369,8 +370,9 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
 		rctx->streamout.prims_gen_query_enabled =
 			rctx->streamout.num_prims_gen_queries != 0;
 
-		if (old_strmout_en != r600_get_strmout_en(rctx))
-			rctx->streamout.enable_atom.dirty = true;
+		if (old_strmout_en != r600_get_strmout_en(rctx)) {
+			rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true);
+		}
 	}
 }
 
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index d05b0a102c0..57c40d96e2c 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -1276,7 +1276,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   tex->cmask.offset, tex->cmask.size, 0, true);
 
 		tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
-		fb_state->dirty = true;
+		rctx->set_atom_dirty(rctx, fb_state, true);
 		*buffers &= ~clear_bit;
 	}
 }
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 61ca2a82195..48972bd170c 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -148,7 +148,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx,
 				struct pipe_surface *zsurf, *cbsurf, surf_tmpl;
 
 				sctx->dbcb_copy_sample = sample;
-				sctx->db_render_state.dirty = true;
+				si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 				surf_tmpl.format = texture->resource.b.b.format;
 				surf_tmpl.u.tex.level = level;
@@ -182,7 +182,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx,
 
 	sctx->dbcb_depth_copy_enabled = false;
 	sctx->dbcb_stencil_copy_enabled = false;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 static void si_blit_decompress_depth_in_place(struct si_context *sctx,
@@ -194,7 +194,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx,
 	unsigned layer, max_layer, checked_last_layer, level;
 
 	sctx->db_inplace_flush_enabled = true;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	surf_tmpl.format = texture->resource.b.b.format;
 
@@ -232,7 +232,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx,
 	}
 
 	sctx->db_inplace_flush_enabled = false;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 void si_flush_depth_textures(struct si_context *sctx,
@@ -378,9 +378,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		}
 
 		zstex->depth_clear_value = depth;
-		sctx->framebuffer.atom.dirty = true; /* updates DB_DEPTH_CLEAR */
+		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
 		sctx->db_depth_clear = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
 	si_blitter_begin(ctx, SI_CLEAR);
@@ -393,7 +393,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		sctx->db_depth_clear = false;
 		sctx->db_depth_disable_expclear = false;
 		zstex->depth_cleared = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index fcf4dbfd989..8d9f8f71dc5 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -122,7 +122,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	desc->list_dirty = false;
 	desc->pointer_dirty = true;
-	sctx->shader_userdata.atom.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
 	return true;
 }
 
@@ -452,7 +452,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
 	desc->pointer_dirty = true;
-	sctx->shader_userdata.atom.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
 	sctx->vertex_buffers_dirty = false;
 	return true;
 }
@@ -869,7 +869,7 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx,
 	if (shader == PIPE_SHADER_VERTEX)
 		sctx->vertex_buffers.pointer_dirty = true;
 
-	sctx->shader_userdata.atom.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
 }
 
 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 8658056d15e..307dc391431 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -178,12 +178,12 @@ void si_begin_new_cs(struct si_context *ctx)
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
 
-	ctx->clip_regs.dirty = true;
-	ctx->framebuffer.atom.dirty = true;
-	ctx->msaa_sample_locs.dirty = true;
-	ctx->msaa_config.dirty = true;
-	ctx->db_render_state.dirty = true;
-	ctx->b.streamout.enable_atom.dirty = true;
+	si_mark_atom_dirty(ctx, &ctx->clip_regs);
+	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
+	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs);
+	si_mark_atom_dirty(ctx, &ctx->msaa_config);
+	si_mark_atom_dirty(ctx, &ctx->db_render_state);
+	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	r600_postflush_resume_features(&ctx->b);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index cacef9f0ae7..e29b1586107 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -101,6 +101,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	sctx->b.b.screen = screen; /* this must be set first */
 	sctx->b.b.priv = priv;
 	sctx->b.b.destroy = si_destroy_context;
+	sctx->b.set_atom_dirty = (void *)si_set_atom_dirty;
 	sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
 
 	if (!r600_common_context_init(&sctx->b, &sscreen->b))
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a249d317b07..553e1f32683 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -325,4 +325,18 @@ si_invalidate_draw_sh_constants(struct si_context *sctx)
 	sctx->last_sh_base_reg = -1; /* reset to an unknown value */
 }
 
+static inline void
+si_set_atom_dirty(struct si_context *sctx,
+		  struct r600_atom *atom, bool dirty)
+{
+	atom->dirty = dirty;
+}
+
+static inline void
+si_mark_atom_dirty(struct si_context *sctx,
+		   struct r600_atom *atom)
+{
+	si_set_atom_dirty(sctx, atom, true);
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index a7aa0624e02..1cac8041d61 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -734,12 +734,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 
 	if (sctx->framebuffer.nr_samples > 1 &&
 	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable))
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_fb_rs_state(sctx);
 
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 }
 
 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
@@ -904,7 +904,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
@@ -2038,7 +2038,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 				  util_format_is_pure_integer(state->cbufs[0]->format);
 
 	if (sctx->framebuffer.cb0_is_integer != old_cb0_is_integer)
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	for (i = 0; i < state->nr_cbufs; i++) {
 		if (!state->cbufs[i])
@@ -2083,11 +2083,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	sctx->framebuffer.atom.num_dw += state->zsbuf ? 26 : 4;
 	sctx->framebuffer.atom.num_dw += 3; /* WINDOW_SCISSOR_BR */
 	sctx->framebuffer.atom.num_dw += 18; /* MSAA sample locations */
-	sctx->framebuffer.atom.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
 
 	if (sctx->framebuffer.nr_samples != old_nr_samples) {
-		sctx->msaa_config.dirty = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 		/* Set sample locations as fragment shader constants. */
 		switch (sctx->framebuffer.nr_samples) {
@@ -2124,7 +2124,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		     old_nr_samples != SI_NUM_SMOOTH_AA_SAMPLES) &&
 		    (sctx->framebuffer.nr_samples != SI_NUM_SMOOTH_AA_SAMPLES ||
 		     old_nr_samples != 1))
-			sctx->msaa_sample_locs.dirty = true;
+			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs);
 	}
 }
 
@@ -2266,7 +2266,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 	sctx->ps_iter_samples = min_samples;
 
 	if (sctx->framebuffer.nr_samples > 1)
-		sctx->msaa_config.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 /*
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e8faf405afc..f136a1c94d8 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -796,7 +796,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Check flush flags. */
 	if (sctx->b.flags)
-		sctx->atoms.s.cache_flush->dirty = true;
+		si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush);
 
 	si_need_cs_space(sctx, 0, TRUE);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index fbcb0f4a1d8..475aea1f46d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -744,7 +744,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 		return;
 
 	sctx->vs_shader = sel;
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 }
 
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
@@ -757,7 +757,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 		return;
 
 	sctx->gs_shader = sel;
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed)
@@ -789,7 +789,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 		return;
 
 	sctx->tes_shader = sel;
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed) {
@@ -1402,15 +1402,15 @@ void si_update_shaders(struct si_context *sctx)
 
 	if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
 		sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
 	if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) {
 		sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing;
-		sctx->msaa_config.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
 		if (sctx->b.chip_class == SI)
-			sctx->db_render_state.dirty = true;
+			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 }
 

From 85adde30a4bb3e8e5ca44983308364559ff140ab Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Mon, 10 Aug 2015 00:42:33 +0300
Subject: [PATCH 1107/1208] r600g: use a helper to add an initialized atom
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of writing to rctx->atoms directly use a helper to take
advantage of assert checks.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/evergreen_state.c   |  4 ++--
 src/gallium/drivers/r600/r600_pipe.h         |  1 +
 src/gallium/drivers/r600/r600_state.c        |  4 ++--
 src/gallium/drivers/r600/r600_state_common.c | 15 +++++++++++----
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index bb5338304ea..f7a76f6cac7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3470,8 +3470,8 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	}
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5);
-	rctx->atoms[id++] = &rctx->b.streamout.begin_atom;
-	rctx->atoms[id++] = &rctx->b.streamout.enable_atom;
+	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
+	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
 	r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0);
 	r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 3653e8fd0af..8b612698b13 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -671,6 +671,7 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a);
+void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id);
 void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id,
 		    void (*emit)(struct r600_context *ctx, struct r600_atom *state),
 		    unsigned num_dw);
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 7fb03b4cff6..848818842aa 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -3074,8 +3074,8 @@ void r600_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
-	rctx->atoms[id++] = &rctx->b.streamout.begin_atom;
-	rctx->atoms[id++] = &rctx->b.streamout.enable_atom;
+	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
+	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
 	r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0);
 	r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 2d654dc5eb7..a4238a2fec9 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -47,18 +47,25 @@ void r600_release_command_buffer(struct r600_command_buffer *cb)
 	FREE(cb->buf);
 }
 
+void r600_add_atom(struct r600_context *rctx,
+		   struct r600_atom *atom,
+		   unsigned id)
+{
+	assert(id < R600_NUM_ATOMS);
+	assert(rctx->atoms[id] == NULL);
+	rctx->atoms[id] = atom;
+	atom->dirty = false;
+}
+
 void r600_init_atom(struct r600_context *rctx,
 		    struct r600_atom *atom,
 		    unsigned id,
 		    void (*emit)(struct r600_context *ctx, struct r600_atom *state),
 		    unsigned num_dw)
 {
-	assert(id < R600_NUM_ATOMS);
-	assert(rctx->atoms[id] == NULL);
-	rctx->atoms[id] = atom;
 	atom->emit = (void*)emit;
 	atom->num_dw = num_dw;
-	atom->dirty = false;
+	r600_add_atom(rctx, atom, id);
 }
 
 void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom)

From c58534c1384dc63bb1b13eb37c06bdb4652c13ff Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Mon, 10 Aug 2015 00:42:34 +0300
Subject: [PATCH 1108/1208] r600g: don't mark unused atom dirty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On evergreen config_state is not used, so don't mark it dirty.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_hw_context.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index d67fdfd4106..c048a713099 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -312,7 +312,9 @@ void r600_begin_new_cs(struct r600_context *ctx)
 		r600_mark_atom_dirty(ctx, &ctx->scissor[i].atom);
 		r600_mark_atom_dirty(ctx, &ctx->viewport[i].atom);
 	}
-	r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
+	if (ctx->b.chip_class < EVERGREEN) {
+		r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
+	}
 	r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
 	r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom);
 	r600_mark_atom_dirty(ctx, &ctx->export_shader.atom);

From 50545882113b389decc3f05771764f6c62213af3 Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Mon, 10 Aug 2015 00:42:35 +0300
Subject: [PATCH 1109/1208] r600g: use a bitfield to track dirty atoms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

r600 currently has 73 atoms and looping through their dirty flags has
become costly because checking each flag requires a pointer
dereference before the read. To avoid having to do that add additional
bitfield which can be checked really quickly thanks to tzcnt instruction.

id field was added to struct r600_atom but that doesn't affect memory
usage for both 32 and 64 bit CPUs because it was stuffed into padding.

The performance improvement is ~2% for benchmarks that can have FPS in
the thousands but is hardly measurable in "real" programs.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_hw_context.c    | 12 ++---
 src/gallium/drivers/r600/r600_pipe.h          | 45 +++++++++++++++++++
 src/gallium/drivers/r600/r600_state_common.c  |  8 ++--
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 4 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index c048a713099..64451516c23 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		unsigned i;
 
 		/* The number of dwords all the dirty states would take. */
-		for (i = 0; i < R600_NUM_ATOMS; i++) {
-			if (ctx->atoms[i] && ctx->atoms[i]->dirty) {
-				num_dw += ctx->atoms[i]->num_dw;
-				if (ctx->screen->b.trace_bo) {
-					num_dw += R600_TRACE_CS_DWORDS;
-				}
+		i = r600_next_dirty_atom(ctx, 0);
+		while (i < R600_NUM_ATOMS) {
+			num_dw += ctx->atoms[i]->num_dw;
+			if (ctx->screen->b.trace_bo) {
+				num_dw += R600_TRACE_CS_DWORDS;
 			}
+			i = r600_next_dirty_atom(ctx, i + 1);
 		}
 
 		/* The upper-bound of how much space a draw command would take. */
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 8b612698b13..5d10bb48157 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -85,6 +85,9 @@
 #define R600_BIG_ENDIAN 0
 #endif
 
+#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8)
+#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, R600_DIRTY_ATOM_WORD_BITS)
+
 struct r600_context;
 struct r600_bytecode;
 struct r600_shader_key;
@@ -426,6 +429,8 @@ struct r600_context {
 
 	/* State binding slots are here. */
 	struct r600_atom		*atoms[R600_NUM_ATOMS];
+	/* Dirty atom bitmask for fast tests */
+	unsigned long			dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN];
 	/* States for CS initialization. */
 	struct r600_command_buffer	start_cs_cmd; /* invariant state mostly */
 	/** Compute specific registers initializations.  The start_cs_cmd atom
@@ -502,7 +507,18 @@ static inline void r600_set_atom_dirty(struct r600_context *rctx,
 				       struct r600_atom *atom,
 				       bool dirty)
 {
+	unsigned long mask;
+	unsigned int w;
+
 	atom->dirty = dirty;
+
+	assert(atom->id != 0);
+	w = atom->id / R600_DIRTY_ATOM_WORD_BITS;
+	mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS);
+	if (dirty)
+		rctx->dirty_atoms[w] |= mask;
+	else
+		rctx->dirty_atoms[w] &= ~mask;
 }
 
 static inline void r600_mark_atom_dirty(struct r600_context *rctx,
@@ -511,6 +527,35 @@ static inline void r600_mark_atom_dirty(struct r600_context *rctx,
 	r600_set_atom_dirty(rctx, atom, true);
 }
 
+static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx,
+						unsigned int id)
+{
+#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ)
+	unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS;
+	unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS;
+	unsigned long bits, mask = (1ul << bit) - 1;
+
+	for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) {
+		bits = rctx->dirty_atoms[w] & ~mask;
+		if (bits == 0)
+			continue;
+		return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits);
+	}
+
+	return R600_NUM_ATOMS;
+#else
+	for (; id < R600_NUM_ATOMS; id++) {
+		bool dirty = !!(rctx->dirty_atoms[id / R600_DIRTY_ATOM_WORD_BITS] &
+			(1ul << (id % R600_DIRTY_ATOM_WORD_BITS)));
+		assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty));
+		if (dirty)
+			break;
+	}
+
+	return id;
+#endif
+}
+
 void r600_trace_emit(struct r600_context *rctx);
 
 static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index a4238a2fec9..8d0942ff898 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -54,6 +54,7 @@ void r600_add_atom(struct r600_context *rctx,
 	assert(id < R600_NUM_ATOMS);
 	assert(rctx->atoms[id] == NULL);
 	rctx->atoms[id] = atom;
+	atom->id = id;
 	atom->dirty = false;
 }
 
@@ -1476,11 +1477,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
 	r600_flush_emit(rctx);
 
-	for (i = 0; i < R600_NUM_ATOMS; i++) {
-		if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) {
-			continue;
-		}
+	i = r600_next_dirty_atom(rctx, 0);
+	while (i < R600_NUM_ATOMS) {
 		r600_emit_atom(rctx, rctx->atoms[i]);
+		i = r600_next_dirty_atom(rctx, i + 1);
 	}
 
 	if (rctx->b.chip_class == CAYMAN) {
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 765ee3f70ba..768fe282981 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -315,6 +315,7 @@ struct r600_common_screen {
 struct r600_atom {
 	void (*emit)(struct r600_common_context *ctx, struct r600_atom *state);
 	unsigned		num_dw;
+	unsigned short		id;	/* used by r600 only */
 	bool			dirty;
 };
 

From b88f14702d9c02a34d517f95fe840527961631cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 8 Aug 2015 14:03:54 +0200
Subject: [PATCH 1110/1208] gallium/radeon: fix r600g build if LLVM is disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MESA_LLVM_VERSION_PATCH is undefined.

Reviewed-by: Edward O'Callaghan <eocallaghan at alterapraxis.com>
Tested-by: Benjamin Bellec <b.bellec@gmail.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index c9b89295e83..9e68e5f80ac 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -882,10 +882,11 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 
 	ws->query_info(ws, &rscreen->info);
 
-	if (HAVE_LLVM)
-		snprintf(llvm_string, sizeof(llvm_string),
-			 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
+#if HAVE_LLVM
+	snprintf(llvm_string, sizeof(llvm_string),
+		 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
+#endif
 
 	snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
 		 "%s (DRM %i.%i.%i%s)",

From c4b4bad68a90510406c0bef97039f7d0b4f8f5fe Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 11 Aug 2015 16:39:10 +0100
Subject: [PATCH 1111/1208] docs: add release notes for 10.6.4

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 6b2fcee64edadbd4db2293f5f4fc1a70e80c7251)
---
 docs/relnotes/10.6.4.html | 136 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 docs/relnotes/10.6.4.html

diff --git a/docs/relnotes/10.6.4.html b/docs/relnotes/10.6.4.html
new file mode 100644
index 00000000000..b330b870bbf
--- /dev/null
+++ b/docs/relnotes/10.6.4.html
@@ -0,0 +1,136 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.4 Release Notes / August 11, 2015</h1>
+
+<p>
+Mesa 10.6.4 is a bug fix release which fixes bugs found since the 10.6.3 release.
+</p>
+<p>
+Mesa 10.6.4 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=73512">Bug 73512</a> - [clover] mesa.icd. should contain full path</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91290">Bug 91290</a> - SIGSEGV glcpp/glcpp-parse.y:1077</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (6):</p>
+<ul>
+  <li>mesa: Turn get_readpixels_transfer_ops() in to a global function</li>
+  <li>meta: Fix transfer operations check in meta pbo path for readpixels</li>
+  <li>meta: Abort meta pbo path if readpixels need signed-unsigned conversion</li>
+  <li>meta: Don't do fragment color clamping in _mesa_meta_pbo_GetTexSubImage</li>
+  <li>mesa: Add a helper function _mesa_need_luminance_to_rgb_conversion()</li>
+  <li>meta: Fix reading luminance texture as rgba in _mesa_meta_pbo_GetTexSubImage()</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/skl: Add production thread counts and URB size</li>
+</ul>
+
+<p>Eduardo Lima Mitev (3):</p>
+<ul>
+  <li>mesa: Fix errors values returned by glShaderBinary()</li>
+  <li>mesa: Validate target before resolving tex obj in glTex(ture)SubImageXD</li>
+  <li>mesa: Fix error returned by glCopyTexImage2D() upon an invalid internal format</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add checksums for mesa 10.6.3 tarballs</li>
+  <li>configure.ac: do not set HAVE_DRI(23) when libdrm is missing</li>
+  <li>egl/wayland: libdrm is a hard requirement, treat it as such</li>
+  <li>winsys/radeon: don't leak the fd when it is 0</li>
+  <li>bugzilla_mesa.sh: sort the bugs list by number</li>
+  <li>Update version to 10.6.4</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965/fs: Fix fs_inst::regs_read() for sources in the ATTR file.</li>
+</ul>
+
+<p>Frank Binns (2):</p>
+<ul>
+  <li>egl/dri: Add error info needed for EGL_EXT_image_dma_buf_import extension</li>
+  <li>egl: Add eglQuerySurface surface type check for EGL_LARGEST_PBUFFER attrib</li>
+</ul>
+
+<p>Igor Gnatenko (1):</p>
+<ul>
+  <li>opencl: use versioned .so in mesa.icd</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nvc0: fix geometry program revalidation of clipping params</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>glsl: Fix a bug where LHS swizzles of swizzles were too small.</li>
+</ul>
+
+<p>Marek Olšák (6):</p>
+<ul>
+  <li>st/mesa: don't call st_validate_state in BlitFramebuffer</li>
+  <li>radeonsi: upload shader rodata after updating scratch relocations</li>
+  <li>st/mesa: don't ignore texture buffer state changes</li>
+  <li>radeonsi: rework how shader pointers to descriptors are set</li>
+  <li>radeonsi: completely rework updating descriptors without CP DMA</li>
+  <li>r600g: fix the CB_SHADER_MASK setup</li>
+</ul>
+
+<p>Samuel Iglesias Gonsalvez (1):</p>
+<ul>
+  <li>glsl/glcpp: fix SIGSEGV when checking error condition for macro redefinition</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nv50: avoid segfault with enabled but unbound vertex attrib</li>
+</ul>
+
+
+</div>
+</body>
+</html>

From d32c45ca7bd5a81b312504ba99cdab3d748251f7 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 11 Aug 2015 18:54:18 +0100
Subject: [PATCH 1112/1208] docs: add sha256 checksums for 10.6.4

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 99793e2541510fe208d29e69fedf97a6fff006f8)
---
 docs/relnotes/10.6.4.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/relnotes/10.6.4.html b/docs/relnotes/10.6.4.html
index b330b870bbf..168182ec52e 100644
--- a/docs/relnotes/10.6.4.html
+++ b/docs/relnotes/10.6.4.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+4960bf17d8b5d6a6503c6954ec6cf480b5cd930797bac901c60bea192675f85e  mesa-10.6.4.tar.gz
+8f5ac103f0f503de2f7a985b0df349bd4ecdfe7f51c714be146fa5a9a3c07b77  mesa-10.6.4.tar.xz
 </pre>
 
 

From 1e53df70642a970fa1bdf5e6b7d64f2c0a4699c7 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Tue, 11 Aug 2015 19:00:03 +0100
Subject: [PATCH 1113/1208] docs: add news item and link release notes for
 10.6.4

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 docs/index.html    | 6 ++++++
 docs/relnotes.html | 1 +
 2 files changed, 7 insertions(+)

diff --git a/docs/index.html b/docs/index.html
index 5b097c73e0f..b9e6148914e 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>August 11 2015</h2>
+<p>
+<a href="relnotes/10.6.4.html">Mesa 10.6.4</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>July 26 2015</h2>
 <p>
 <a href="relnotes/10.6.3.html">Mesa 10.6.3</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 8355f1132b3..39e7f61e792 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.4.html">10.6.4 release notes</a>
 <li><a href="relnotes/10.6.3.html">10.6.3 release notes</a>
 <li><a href="relnotes/10.6.2.html">10.6.2 release notes</a>
 <li><a href="relnotes/10.5.9.html">10.5.9 release notes</a>

From 9fa70fef22bd458fbeb95a3b0ebb5f7919cba7f0 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 10 Aug 2015 16:57:58 -0700
Subject: [PATCH 1114/1208] i965: Optimize brw_inst_bits() and
 brw_compact_inst_bits().

Cuts about 1k of .text.

   text     data      bss      dec      hex  filename
5018165   197160    27672  5242997   500075  i965_dri.so before
5017141   197160    27672  5241973   4ffc75  i965_dri.so after

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 7a8c210118c..51082dc77a5 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -683,9 +683,9 @@ brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low)
    high %= 64;
    low %= 64;
 
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
-   return (inst->data[word] & mask) >> low;
+   return (inst->data[word] >> low) & mask;
 }
 
 /**
@@ -731,9 +731,9 @@ typedef struct {
 static inline unsigned
 brw_compact_inst_bits(brw_compact_inst *inst, unsigned high, unsigned low)
 {
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
-   return (inst->data & mask) >> low;
+   return (inst->data >> low) & mask;
 }
 
 /**

From 2265321834608c26b2989a5a1f65bb375826a779 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 10 Aug 2015 18:50:48 -0700
Subject: [PATCH 1115/1208] i965: Optimize brw_inst_set_bits() and
 brw_compact_inst_set_bits().

Cuts about 2k of .text.

   text     data      bss      dec      hex  filename
5017141   197160    27672  5241973   4ffc75  i965_dri.so before
5014981   197160    27672  5239813   4ff405  i965_dri.so after

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 51082dc77a5..46eff1dd381 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -702,12 +702,12 @@ brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value)
    high %= 64;
    low %= 64;
 
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low;
 
    /* Make sure the supplied value actually fits in the given bitfield. */
    assert((value & (mask >> low)) == value);
 
-   inst->data[word] = (inst->data[word] & ~mask) | ((value << low) & mask);
+   inst->data[word] = (inst->data[word] & ~mask) | (value << low);
 }
 
 #undef BRW_IA16_ADDR_IMM
@@ -745,12 +745,12 @@ static inline void
 brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
                           uint64_t value)
 {
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low;
 
    /* Make sure the supplied value actually fits in the given bitfield. */
    assert((value & (mask >> low)) == value);
 
-   inst->data = (inst->data & ~mask) | ((value << low) & mask);
+   inst->data = (inst->data & ~mask) | (value << low);
 }
 
 #define F(name, high, low)                                      \

From 02a4fe22b137d4bc8378bedd8319109fd23a50e3 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 11 Aug 2015 15:21:03 -0700
Subject: [PATCH 1116/1208] configure.ac: Always define __STDC_LIMIT_MACROS.

... which ensures that we get defines like LONG_MAX in C++.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91591
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 36197d3396a..4e751e36829 100644
--- a/configure.ac
+++ b/configure.ac
@@ -235,7 +235,7 @@ _SAVE_LDFLAGS="$LDFLAGS"
 _SAVE_CPPFLAGS="$CPPFLAGS"
 
 dnl Compiler macros
-DEFINES=""
+DEFINES="-D__STDC_LIMIT_MACROS"
 AC_SUBST([DEFINES])
 case "$host_os" in
 linux*|*-gnu*|gnu*)

From 3941539179b72fe25b6dffd1aacc0722d198a5ca Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 8 Aug 2015 09:00:21 -0700
Subject: [PATCH 1117/1208] mesa/formats: Only do byteswapping for packed
 formats

Reviewed-by: Iago Toral <itoral@igalia.com>
Cc: "10.6 10.5" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/formats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index baeb1bfe5de..d927073b0dc 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -372,10 +372,10 @@ uint32_t
 _mesa_format_to_array_format(mesa_format format)
 {
    const struct gl_format_info *info = _mesa_get_format_info(format);
-   if (_mesa_little_endian())
-      return info->ArrayFormat;
-   else
+   if (!_mesa_little_endian() && info->Layout == MESA_FORMAT_LAYOUT_PACKED)
       return _mesa_array_format_flip_channels(info->ArrayFormat);
+   else
+      return info->ArrayFormat;
 }
 
 static struct hash_table *format_array_format_table;

From 28d1a506c8d09fa66170978c85566c34cbf1cc0a Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sun, 9 Aug 2015 23:45:44 -0700
Subject: [PATCH 1118/1208] mesa/formats: Fix swizzle flipping for big-endian
 targets

The swizzle defines where in the format you should look for any given
channel.  When we flip the format around for BE targets, we need to change
the destinations of the swizzles, not the sources.  For example, say the
format is an RGBX format with a swizzle of xyz1 on LE.  Then it should be
wzy1 on BE;  however, the code as it was before, would have made it 1zyx on
BE which is clearly wrong.

Reviewed-by: Iago Toral <itoral@igalia.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Cc: "10.6 10.5" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/formats.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index d927073b0dc..27590ed4252 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -354,14 +354,22 @@ _mesa_array_format_flip_channels(mesa_array_format format)
       return format;
 
    if (num_channels == 2) {
-      _mesa_array_format_set_swizzle(&format, swizzle[1], swizzle[0],
-                                     swizzle[2], swizzle[3]);
+      /* Assert that the swizzle makes sense for 2 channels */
+      for (unsigned i = 0; i < 4; i++)
+         assert(swizzle[i] != 2 && swizzle[i] != 3);
+
+      static const uint8_t flip_xy[6] = { 1, 0, 2, 3, 4, 5 };
+      _mesa_array_format_set_swizzle(&format,
+                                     flip_xy[swizzle[0]], flip_xy[swizzle[1]],
+                                     flip_xy[swizzle[2]], flip_xy[swizzle[3]]);
       return format;
    }
 
    if (num_channels == 4) {
-      _mesa_array_format_set_swizzle(&format, swizzle[3], swizzle[2],
-                                     swizzle[1], swizzle[0]);
+      static const uint8_t flip[6] = { 3, 2, 1, 0, 4, 5 };
+      _mesa_array_format_set_swizzle(&format,
+                                     flip[swizzle[0]], flip[swizzle[1]],
+                                     flip[swizzle[2]], flip[swizzle[3]]);
       return format;
    }
 

From e3eb91af804f449005a2ff535c805eaa1d579d99 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 10 Aug 2015 01:32:23 -0700
Subject: [PATCH 1119/1208] mesa/formats: Don't flip channels of null array
 formats

Before, if we encountered an array format of 0 on a BE system, we would
flip all the channels even though it's an invalid format.  This would
result in a mostly invalid format with a swizzle of yyyy or wwww.  Instead,
we should just return 0 if the array format stashed in the format info is
invalid.

Cc: "10.6 10.5" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/formats.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 27590ed4252..d7b2bae59e7 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -380,7 +380,8 @@ uint32_t
 _mesa_format_to_array_format(mesa_format format)
 {
    const struct gl_format_info *info = _mesa_get_format_info(format);
-   if (!_mesa_little_endian() && info->Layout == MESA_FORMAT_LAYOUT_PACKED)
+   if (info->ArrayFormat && !_mesa_little_endian() &&
+       info->Layout == MESA_FORMAT_LAYOUT_PACKED)
       return _mesa_array_format_flip_channels(info->ArrayFormat);
    else
       return info->ArrayFormat;

From 5f1d5b1c7857f8680b47a7a450ee9e4530e22c6f Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Wed, 12 Aug 2015 18:22:53 +0300
Subject: [PATCH 1120/1208] mesa/formats: don't byteswap when building array
 formats

Because we build here an array format, we don't need to swap the
bytes for big endian.
If it isn't an array format, the bytes will be swapped in
_mesa_format_convert.

v2: remove temp variable

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Cc: "10.5 10.6" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/glformats.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index c3fd7341ebc..3eb66dab7f8 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -2649,8 +2649,6 @@ get_swizzle_from_gl_format(GLenum format, uint8_t *swizzle)
 uint32_t
 _mesa_format_from_format_and_type(GLenum format, GLenum type)
 {
-   mesa_array_format array_format;
-
    bool is_array_format = true;
    uint8_t swizzle[4];
    bool normalized = false, is_float = false, is_signed = false;
@@ -2706,15 +2704,9 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type)
       normalized = !_mesa_is_enum_format_integer(format);
       num_channels = _mesa_components_in_format(format);
 
-      array_format =
-         MESA_ARRAY_FORMAT(type_size, is_signed, is_float,
-                           normalized, num_channels,
-                           swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-
-      if (!_mesa_little_endian())
-         array_format = _mesa_array_format_flip_channels(array_format);
-
-      return array_format;
+      return MESA_ARRAY_FORMAT(type_size, is_signed, is_float,
+                               normalized, num_channels,
+                               swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
    }
 
    /* Otherwise this is not an array format, so return the mesa_format

From 91698d1206b86ef1710291213145275a2dd06dd7 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Fri, 7 Aug 2015 16:37:47 -0700
Subject: [PATCH 1121/1208] mesa/teximage: report the correct function which
 triggered the error

This function would always report that a dimension or size error occurred
in glTexImage even when it was called from glCompressedTexImage. Replace
the static string with the dynamically determined caller name.

Reviewed-by: Tapani Palli <tapani.palli@intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/teximage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index fc69387204f..d35dc12e2cb 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -3336,15 +3336,15 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
 
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glTexImage%uD(invalid width or height or depth)",
-                     dims);
+                     "%s%uD(invalid width or height or depth)",
+                     func, dims);
          return;
       }
 
       if (!sizeOK) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY,
-                     "glTexImage%uD(image too large: %d x %d x %d, %s format)",
-                     dims, width, height, depth,
+                     "%s%uD(image too large: %d x %d x %d, %s format)",
+                     func, dims, width, height, depth,
                      _mesa_enum_to_string(internalFormat));
          return;
       }

From 078aef0e97bf7e0cc8fae4d541d5035ff6c29ad7 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 6 Aug 2015 14:26:47 -0700
Subject: [PATCH 1122/1208] i965/shader: Don't use OptimizeForAOS for NIR vec4
 vertex shaders

Shader-db results for vec4 programs using NIR on HSW:

   total instructions in shared programs: 1838157 -> 1828469 (-0.53%)
   instructions in affected programs:     275978 -> 266290 (-3.51%)
   helped:                                2827
   HURT:                                  244
   GAINED:                                0
   LOST:                                  0

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 2a1e469fa07..6b928060ef0 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -129,8 +129,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
           */
          compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
          compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
       }
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
 
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
    }

From 81d2fd91a90e5b2fd9fd74792a7a7c329f0e4d29 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 06:58:37 -0400
Subject: [PATCH 1123/1208] mesa: add NV_read_{depth,stencil,depth_stencil}
 extensions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These extensions allow reading depth/stencil for GLES contexts, which is
useful for tools like apitrace.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/extensions.c |  3 +++
 src/mesa/main/readpix.c    | 48 +++++++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 2dbfabdc7b5..d934d19c3e7 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -385,6 +385,9 @@ static const struct extension extension_table[] = {
    { "GL_NV_point_sprite",                         o(NV_point_sprite),                         GL,             2001 },
    { "GL_NV_primitive_restart",                    o(NV_primitive_restart),                    GLL,            2002 },
    { "GL_NV_read_buffer",                          o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_depth",                           o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_depth_stencil",                   o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_stencil",                         o(dummy_true),                              ES2,            2011 },
    { "GL_NV_texgen_reflection",                    o(dummy_true),                              GLL,            1999 },
    { "GL_NV_texture_barrier",                      o(NV_texture_barrier),                      GL,             2009 },
    { "GL_NV_texture_env_combine4",                 o(NV_texture_env_combine4),                 GLL,            1999 },
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 2744232f906..d826ecfc3d5 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -916,10 +916,8 @@ read_pixels_es3_error_check(GLenum format, GLenum type,
    const GLenum data_type = _mesa_get_format_datatype(rb->Format);
    GLboolean is_unsigned_int = GL_FALSE;
    GLboolean is_signed_int = GL_FALSE;
-
-   if (!_mesa_is_color_format(internalFormat)) {
-      return GL_INVALID_OPERATION;
-   }
+   GLboolean is_float_depth = (internalFormat == GL_DEPTH_COMPONENT32F) ||
+         (internalFormat == GL_DEPTH32F_STENCIL8);
 
    is_unsigned_int = _mesa_is_enum_format_unsigned_int(internalFormat);
    if (!is_unsigned_int) {
@@ -950,6 +948,43 @@ read_pixels_es3_error_check(GLenum format, GLenum type,
           (is_unsigned_int && type == GL_UNSIGNED_INT))
          return GL_NO_ERROR;
       break;
+   case GL_DEPTH_STENCIL:
+      switch (type) {
+      case GL_FLOAT_32_UNSIGNED_INT_24_8_REV:
+         if (is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      case GL_UNSIGNED_INT_24_8:
+         if (!is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
+   case GL_DEPTH_COMPONENT:
+      switch (type) {
+      case GL_FLOAT:
+         if (is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      case GL_UNSIGNED_SHORT:
+      case GL_UNSIGNED_INT_24_8:
+         if (!is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
+   case GL_STENCIL_INDEX:
+      switch (type) {
+      case GL_UNSIGNED_BYTE:
+         return GL_NO_ERROR;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
    }
 
    return GL_INVALID_OPERATION;
@@ -1023,11 +1058,6 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
          err = read_pixels_es3_error_check(format, type, rb);
       }
 
-      if (err == GL_NO_ERROR && (format == GL_DEPTH_COMPONENT
-          || format == GL_DEPTH_STENCIL)) {
-         err = GL_INVALID_ENUM;
-      }
-
       if (err != GL_NO_ERROR) {
          _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)",
                      _mesa_enum_to_string(format),

From f72fead4a28d5d8a16bbc20781218ea7df0b9c9a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 20:41:45 -0400
Subject: [PATCH 1124/1208] freedreno: cap cleanups

Move a few things around to group stuff that is common to a3xx/a4xx
together.  Also, introduce is_ir3() for things that are more specific to
the compiler / shader-ISA than to the gpu generation.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_screen.c      | 24 +++++++------------
 .../drivers/freedreno/freedreno_screen.h      |  8 +++++++
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 417d7c65ad4..26802b0188a 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -176,22 +176,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_PRIMITIVE_RESTART:
 	case PIPE_CAP_TGSI_INSTANCEID:
 	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+	case PIPE_CAP_INDEP_BLEND_FUNC:
 		return is_a3xx(screen) || is_a4xx(screen);
 
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 		return is_a3xx(screen);
 
-	case PIPE_CAP_INDEP_BLEND_ENABLE:
-	case PIPE_CAP_INDEP_BLEND_FUNC:
-		return is_a3xx(screen) || is_a4xx(screen);
-
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
 		return 256;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
 		if (glsl120)
 			return 120;
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 130 : 120;
+		return is_ir3(screen) ? 130 : 120;
 
 	/* Unsupported features. */
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
@@ -229,19 +227,16 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
 	/* Stream output. */
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-		if (is_a3xx(screen) || is_a4xx(screen))
+		if (is_ir3(screen))
 			return PIPE_MAX_SO_BUFFERS;
 		return 0;
 	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
-		if (is_a3xx(screen) || is_a4xx(screen))
+		if (is_ir3(screen))
 			return 1;
 		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-		if (is_a3xx(screen) || is_a4xx(screen))
-			return 16 * 4;   /* should only be shader out limit? */
-		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-		if (is_a3xx(screen) || is_a4xx(screen))
+		if (is_ir3(screen))
 			return 16 * 4;   /* should only be shader out limit? */
 		return 0;
 
@@ -273,9 +268,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_QUERY_TIMESTAMP:
 		return 0;
 	case PIPE_CAP_OCCLUSION_QUERY:
-		/* TODO still missing on a4xx, but we lie to get gl2..
-		 * it's not a feature, it's a bug!
-		 */
 		return is_a3xx(screen) || is_a4xx(screen);
 
 	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
@@ -372,7 +364,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		 */
 		return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]);
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 16 : 1;
+		return is_ir3(screen) ? 16 : 1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
 		return 0; /* nothing uses this */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
@@ -394,7 +386,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 	case PIPE_SHADER_CAP_INTEGERS:
 		if (glsl120)
 			return 0;
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 1 : 0;
+		return is_ir3(screen) ? 1 : 0;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return 16;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 4b32d2d7d97..4e5c3a61958 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -73,6 +73,7 @@ struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen,
 struct pipe_screen * fd_screen_create(struct fd_device *dev);
 
 /* is a3xx patch revision 0? */
+/* TODO a306.0 probably doesn't need this.. be more clever?? */
 static inline boolean
 is_a3xx_p0(struct fd_screen *screen)
 {
@@ -91,4 +92,11 @@ is_a4xx(struct fd_screen *screen)
 	return (screen->gpu_id >= 400) && (screen->gpu_id < 500);
 }
 
+/* is it using the ir3 compiler (shader isa introduced with a3xx)? */
+static inline boolean
+is_ir3(struct fd_screen *screen)
+{
+	return is_a3xx(screen) || is_a4xx(screen);
+}
+
 #endif /* FREEDRENO_SCREEN_H_ */

From 8885f2befaea68ce7f9d550c9b9ff5ae77524406 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 08:48:34 -0400
Subject: [PATCH 1125/1208] freedreno/a4xx: point-size and spritelist fixes

a4xx needs similar treatment as 995f55a6

Also fixup a few point-size and vpsrepl issues and drop fix_blit_fp()
hack previously needed for mem2gmem.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_draw.c |  3 +
 src/gallium/drivers/freedreno/a4xx/fd4_draw.h |  8 ++-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.h |  4 ++
 src/gallium/drivers/freedreno/a4xx/fd4_gmem.c |  1 +
 .../drivers/freedreno/a4xx/fd4_program.c      | 67 ++++++++-----------
 .../drivers/freedreno/a4xx/fd4_rasterizer.c   |  8 +--
 .../drivers/freedreno/ir3/ir3_shader.h        |  5 --
 7 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 0927b0d7682..2bd2ca23d54 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -131,6 +131,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
+		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
+		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
 	};
 	unsigned dirty;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index 1bd376ca6ec..b89a30a7c4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -106,6 +106,7 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	struct pipe_index_buffer *idx = &ctx->indexbuf;
 	struct fd_bo *idx_bo = NULL;
+	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
@@ -126,7 +127,12 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		src_sel = DI_SRC_SEL_AUTO_INDEX;
 	}
 
-	fd4_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+	/* points + psize -> spritelist: */
+	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
+
+	fd4_draw(ctx, ring, primtype, vismode, src_sel,
 			info->count, info->instance_count,
 			idx_type, idx_size, idx_offset, idx_bo);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 99c7596fb82..ab7850e50b0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -53,6 +53,10 @@ struct fd4_emit {
 	struct ir3_shader_key key;
 	uint32_t dirty;
 
+	uint32_t sprite_coord_enable;  /* bitmask */
+	bool sprite_coord_mode;
+	bool rasterflat;
+
 	/* cached to avoid repeated lookups of same variants: */
 	struct ir3_shader_variant *vp, *fp;
 	/* TODO: other shader stages.. */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 6541da54161..81c37f72565 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -309,6 +309,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 			.vtx = &fd4_ctx->blit_vbuf_state,
+			.sprite_coord_enable = 1,
 			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
 			.key = {
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index fdfdee55123..1a6d0142132 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -465,10 +465,10 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
 		OUT_RING(ring, 0x00000000);
 	} else {
-		uint32_t vinterp[8], flatshade[2];
+		uint32_t vinterp[8], vpsrepl[8];
 
 		memset(vinterp, 0, sizeof(vinterp));
-		memset(flatshade, 0, sizeof(flatshade));
+		memset(vpsrepl, 0, sizeof(vpsrepl));
 
 		/* looks like we need to do int varyings in the frag
 		 * shader on a4xx (no flatshad reg?  or a420.0 bug?):
@@ -485,29 +485,40 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 		 * something like the code below instead of workaround
 		 * in the shader:
 		 */
-#if 0
-		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
+		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
 		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
 			uint32_t interp = s[FS].v->inputs[j].interpolate;
+
+			/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
+			 * instead.. rather than -8 everywhere else..
+			 */
+			uint32_t inloc = s[FS].v->inputs[j].inloc - 8;
+
+			/* currently assuming varyings aligned to 4 (not
+			 * packed):
+			 */
+			debug_assert((inloc % 4) == 0);
+
 			if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
 					((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
-				/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
-				 * instead.. rather than -8 everywhere else..
-				 */
-				uint32_t loc = s[FS].v->inputs[j].inloc - 8;
-
-				/* currently assuming varyings aligned to 4 (not
-				 * packed):
-				 */
-				debug_assert((loc % 4) == 0);
+				uint32_t loc = inloc;
 
 				for (i = 0; i < 4; i++, loc++) {
 					vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
-					flatshade[loc / 32] |= 1 << (loc % 32);
+					//flatshade[loc / 32] |= 1 << (loc % 32);
 				}
 			}
+
+			/* Replace the .xy coordinates with S/T from the point sprite. Set
+			 * interpolation bits for .zw such that they become .01
+			 */
+			if (emit->sprite_coord_enable & (1 << sem2idx(s[FS].v->inputs[j].semantic))) {
+				vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
+					<< ((inloc % 16) * 2);
+				vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
+				vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
+			}
 		}
-#endif
 
 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
 		OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
@@ -524,7 +535,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 
 		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
 		for (i = 0; i < 8; i++)
-			OUT_RING(ring, s[FS].v->shader->vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
+			OUT_RING(ring, vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
 	}
 
 	if (s[VS].instrlen)
@@ -535,28 +546,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 			emit_shader(ring, s[FS].v);
 }
 
-/* hack.. until we figure out how to deal w/ vpsrepl properly.. */
-static void
-fix_blit_fp(struct fd4_shader_stateobj *so)
-{
-	so->shader->vpsrepl[0] = 0x99999999;
-	so->shader->vpsrepl[1] = 0x99999999;
-	so->shader->vpsrepl[2] = 0x99999999;
-	so->shader->vpsrepl[3] = 0x99999999;
-}
-static void
-fix_blit_fps(struct pipe_context *pctx)
-{
-	struct fd_context *ctx = fd_context(pctx);
-	int i;
-
-	for (i = 0; i < ctx->screen->max_rts; i++)
-		fix_blit_fp(ctx->blit_prog[i].fp);
-
-	fix_blit_fp(ctx->blit_z.fp);
-	fix_blit_fp(ctx->blit_zs.fp);
-}
-
 void
 fd4_prog_init(struct pipe_context *pctx)
 {
@@ -567,6 +556,4 @@ fd4_prog_init(struct pipe_context *pctx)
 	pctx->delete_vs_state = fd4_vp_state_delete;
 
 	fd_prog_init(pctx);
-
-	fix_blit_fps(pctx);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
index e54b606a285..dc7e98b149d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
@@ -50,7 +50,7 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
 
 	if (cso->point_size_per_vertex) {
 		psize_min = util_get_min_point_size(cso);
-		psize_max = 8192;
+		psize_max = 4092;
 	} else {
 		/* Force the point size to be as if the vertex output was disabled. */
 		psize_min = cso->point_size;
@@ -67,9 +67,9 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
 */
 	so->gras_cl_clip_cntl = 0x80000; /* ??? */
 	so->gras_su_point_minmax =
-			A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min/2) |
-			A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max/2);
-	so->gras_su_point_size   = A4XX_GRAS_SU_POINT_SIZE(cso->point_size/2);
+			A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) |
+			A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max);
+	so->gras_su_point_size   = A4XX_GRAS_SU_POINT_SIZE(cso->point_size);
 	so->gras_su_poly_offset_scale =
 			A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale);
 	so->gras_su_poly_offset_offset =
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index c0fd44d4ed1..1bbbdbd224d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -219,11 +219,6 @@ struct ir3_shader {
 	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
-
-	/* so far, only used for blit_prog shader.. values for
-	 * VPC_VARYING_PS_REPL[i].MODE
-	 */
-	uint32_t vpsrepl[8];
 };
 
 void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);

From 0667962103034d7426c763a7793ce22baab46c8e Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 11:47:46 -0400
Subject: [PATCH 1126/1208] freedreno/ir3: use nir pass to lower const to
 scalar

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 53faf16ae30..0ab33455ed1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -149,6 +149,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 	nir_opt_global_to_local(s);
 	nir_convert_to_ssa(s);
 	nir_lower_idiv(s);
+	nir_lower_load_const_to_scalar(s);
 
 	do {
 		progress = false;

From 6e04020dd7784bb44d5e04b41efce342f80840cf Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 16:09:48 -0400
Subject: [PATCH 1127/1208] freedreno/ir3/print: print left/right neighbors too

When debugging compiler, this is useful to see.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_print.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index f377982dd5e..07e03d26908 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -175,6 +175,20 @@ print_instr(struct ir3_instruction *instr, int lvl)
 		printf("]");
 	}
 
+	if (instr->cp.left) {
+		printf(", left=_");
+		printf("[");
+		print_instr_name(instr->cp.left);
+		printf("]");
+	}
+
+	if (instr->cp.right) {
+		printf(", right=_");
+		printf("[");
+		print_instr_name(instr->cp.right);
+		printf("]");
+	}
+
 	if (is_meta(instr)) {
 		if (instr->opc == OPC_META_FO) {
 			printf(", off=%d", instr->fo.off);

From aab3912f21508b0681962c68fdaca1435c06b2ea Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 16:11:04 -0400
Subject: [PATCH 1128/1208] freedreno/ir3: 'keeps' need neighbors found too

This shows up with a glamor shader, which does a TXF and uses the result
for conditional kill.  Before we wouldn't group the fanin (collect)
neighbors which need to be allocated adjacently at RA, resulting in
badness.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_group.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 70d9b08e019..ca28aefd502 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -236,6 +236,11 @@ find_neighbors(struct ir3 *ir)
 			instr_find_neighbors(instr);
 		}
 	}
+
+	for (i = 0; i < ir->keeps_count; i++) {
+		struct ir3_instruction *instr = ir->keeps[i];
+		instr_find_neighbors(instr);
+	}
 }
 
 void

From fb07c49f4883b12cef37748271d99e2fcf217a72 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 16:33:14 -0400
Subject: [PATCH 1129/1208] ttn: add buffer texture type

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 4130697e2a7..93dfb803389 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -999,6 +999,9 @@ static void
 setup_texture_info(nir_tex_instr *instr, unsigned texture)
 {
    switch (texture) {
+   case TGSI_TEXTURE_BUFFER:
+      instr->sampler_dim = GLSL_SAMPLER_DIM_BUF;
+      break;
    case TGSI_TEXTURE_1D:
       instr->sampler_dim = GLSL_SAMPLER_DIM_1D;
       break;

From 500025a23784877c8a61d8b3c7a8eab6fddf242a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 11 Aug 2015 16:47:16 -0400
Subject: [PATCH 1130/1208] freedreno/a3xx+a4xx: add texture buffer object
 support

Basic texture buffer support.  Should be straightforward to add first/
last_element support.  And with a bit of work in ir3 emulate larger
texture buffer sizes.  But this seems to be enough for stk gl31 render
paths.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c    |  6 ++++--
 src/gallium/drivers/freedreno/a3xx/fd3_texture.c |  4 ++--
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c    |  3 ++-
 src/gallium/drivers/freedreno/a4xx/fd4_texture.c |  4 ++--
 src/gallium/drivers/freedreno/freedreno_screen.c | 16 +++++++++++++---
 .../drivers/freedreno/freedreno_surface.c        |  3 ++-
 src/gallium/drivers/freedreno/freedreno_util.h   | 16 ++++++++++++++++
 7 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 49b3d97cce8..752e7f88cb9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -251,14 +251,15 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd3_pipe_sampler_view dummy_view = {
+					.base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */
 					.base.u.tex.first_level = 1,
 			};
 			const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
 					fd3_pipe_sampler_view(tex->textures[i]) :
 					&dummy_view;
 			struct fd_resource *rsc = fd_resource(view->base.texture);
-			unsigned start = view->base.u.tex.first_level;
-			unsigned end   = view->base.u.tex.last_level;
+			unsigned start = fd_sampler_first_level(&view->base);
+			unsigned end   = fd_sampler_last_level(&view->base);;
 
 			for (j = 0; j < (end - start + 1); j++) {
 				struct fd_resource_slice *slice =
@@ -341,6 +342,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
 			format = fd3_gmem_restore_format(rsc->base.b.format);
 		}
 
+		/* note: PIPE_BUFFER disallowed for surfaces */
 		unsigned lvl = psurf[i]->u.tex.level;
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index a278bf5c603..c30658d0e7b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -210,8 +210,8 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 	uint32_t sz2 = 0;
 
 	if (!so)
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index c3226d5121f..b75be29e523 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -172,7 +172,7 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
 					fd4_pipe_sampler_view(tex->textures[i]) :
 					&dummy_view;
-			unsigned start = view->base.u.tex.first_level;
+			unsigned start = fd_sampler_first_level(&view->base);
 
 			OUT_RING(ring, view->texconst0);
 			OUT_RING(ring, view->texconst1);
@@ -235,6 +235,7 @@ fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
 	for (i = 0; i < nr_bufs; i++) {
 		if (bufs[i]) {
 			struct fd_resource *rsc = fd_resource(bufs[i]->texture);
+			/* note: PIPE_BUFFER disallowed for surfaces */
 			unsigned lvl = bufs[i]->u.tex.level;
 			struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
 			uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index 6ba25d0816d..d2bc5fee6c0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -150,8 +150,8 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 
 	if (!so)
 		return NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 26802b0188a..3aeed57b043 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -164,9 +164,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_CUBE_MAP_ARRAY:
-	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
-	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
-	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
 	case PIPE_CAP_START_INSTANCE:
 	case PIPE_CAP_COMPUTE:
@@ -178,8 +175,21 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_INDEP_BLEND_FUNC:
+	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
 		return is_a3xx(screen) || is_a4xx(screen);
 
+	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+		/* ignoring first/last_element.. but I guess that should be
+		 * easy to add..
+		 */
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+		/* I think 32k on a4xx.. and we could possibly emulate more
+		 * by pretending 2d/rect textures and splitting high bits
+		 * of index into 2nd dimension..
+		 */
+		return 16383;
+
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 		return is_a3xx(screen);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_surface.c b/src/gallium/drivers/freedreno/freedreno_surface.c
index 250fe4bc0f5..70c44eb79c3 100644
--- a/src/gallium/drivers/freedreno/freedreno_surface.c
+++ b/src/gallium/drivers/freedreno/freedreno_surface.c
@@ -41,7 +41,8 @@ fd_create_surface(struct pipe_context *pctx,
 //	struct fd_resource* tex = fd_resource(ptex);
 	struct fd_surface* surface = CALLOC_STRUCT(fd_surface);
 
-	assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
+	debug_assert(ptex->target != PIPE_BUFFER);
+	debug_assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
 
 	if (surface) {
 		struct pipe_surface *psurf = &surface->base;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index d6a08b5fa13..7129a1bddd1 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -140,6 +140,22 @@ fd_surface_half_precision(const struct pipe_surface *psurf)
 	return true;
 }
 
+static inline unsigned
+fd_sampler_first_level(const struct pipe_sampler_view *view)
+{
+	if (view->target == PIPE_BUFFER)
+		return 0;
+	return view->u.tex.first_level;
+}
+
+static inline unsigned
+fd_sampler_last_level(const struct pipe_sampler_view *view)
+{
+	if (view->target == PIPE_BUFFER)
+		return 0;
+	return view->u.tex.last_level;
+}
+
 static inline bool
 fd_half_precision(struct pipe_framebuffer_state *pfb)
 {

From bdc564b942ba292a897ea0d7d37f4bcafc236129 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 12 Aug 2015 11:39:24 -0400
Subject: [PATCH 1131/1208] freedreno/a4xx: format updates

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_format.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index f906a6781b5..3e0045449eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -104,7 +104,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R16_SINT,    16_SINT,  R16_SINT, WZYX),
 	V_(R16_USCALED, 16_UINT,  NONE,     WZYX),
 	V_(R16_SSCALED, 16_UINT,  NONE,     WZYX),
-	VT(R16_FLOAT,   16_FLOAT, NONE,     WZYX),
+	VT(R16_FLOAT,   16_FLOAT, R16_FLOAT,WZYX),
 
 	_T(A16_UINT,    16_UINT,  NONE,     WZYX),
 	_T(A16_SINT,    16_SINT,  NONE,     WZYX),
@@ -140,7 +140,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R32_SINT,    32_SINT,  R32_SINT, WZYX),
 	V_(R32_USCALED, 32_UINT,  NONE,     WZYX),
 	V_(R32_SSCALED, 32_UINT,  NONE,     WZYX),
-	VT(R32_FLOAT,   32_FLOAT, NONE,     WZYX),
+	VT(R32_FLOAT,   32_FLOAT, R32_FLOAT,WZYX),
 	V_(R32_FIXED,   32_FIXED, NONE,     WZYX),
 
 	_T(A32_UINT,    32_UINT,  NONE,     WZYX),
@@ -156,7 +156,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R16G16_SINT,    16_16_SINT,  R16G16_SINT, WZYX),
 	V_(R16G16_USCALED, 16_16_UINT,  NONE,        WZYX),
 	V_(R16G16_SSCALED, 16_16_SINT,  NONE,        WZYX),
-	VT(R16G16_FLOAT,   16_16_FLOAT, NONE,        WZYX),
+	VT(R16G16_FLOAT,   16_16_FLOAT, R16G16_FLOAT,WZYX),
 
 	_T(L16A16_UINT,    16_16_UINT,  NONE,        WZYX),
 	_T(L16A16_SINT,    16_16_SINT,  NONE,        WZYX),
@@ -227,7 +227,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R32G32_SINT,    32_32_SINT,  R32G32_SINT, WZYX),
 	V_(R32G32_USCALED, 32_32_UINT,  NONE,        WZYX),
 	V_(R32G32_SSCALED, 32_32_SINT,  NONE,        WZYX),
-	VT(R32G32_FLOAT,   32_32_FLOAT, NONE,        WZYX),
+	VT(R32G32_FLOAT,   32_32_FLOAT, R32G32_FLOAT,WZYX),
 	V_(R32G32_FIXED,   32_32_FIXED, NONE,        WZYX),
 
 	_T(L32A32_UINT,    32_32_UINT,  NONE,        WZYX),

From 8e11be0ddb0920633c5fab8d6a6460b7591a2627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 00:52:21 +0200
Subject: [PATCH 1132/1208] radeonsi: move VGT_GS_MODE to the VS state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The VS will want to select GS scenario A here (VS with PrimitiveID).

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 475aea1f46d..a6480b330cd 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -314,6 +314,12 @@ static void si_shader_vs(struct si_shader *shader)
 	if (pm4 == NULL)
 		return;
 
+	/* If this is the GS copy shader, the GS state writes this register.
+	 * Otherwise, the VS state writes it.
+	 */
+	if (!shader->is_gs_copy_shader)
+		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, 0);
+
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
@@ -1287,8 +1293,6 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 		}
 
 		si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
-		if (!sctx->gs_shader)
-			si_pm4_set_reg(*pm4, R_028A40_VGT_GS_MODE, 0);
 	}
 	si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
 }

From e7a52a5cb810de49a8282cb9f9caea5d554c3348 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 01:50:11 +0200
Subject: [PATCH 1133/1208] radeonsi: add support for gl_PrimitiveID in the
 fragment shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It must be obtained from the VS.

The GS scenario A must be enabled for PrimID to be generated for the VS.

+ 4 piglits

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 22 +++++++++++++++---
 src/gallium/drivers/radeonsi/si_shader.h      | 12 ++++++++++
 src/gallium/drivers/radeonsi/si_state.c       |  1 -
 .../drivers/radeonsi/si_state_shaders.c       | 23 +++++++++++++++----
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 61d36430bcf..ac1c1be7a79 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -73,6 +73,7 @@ struct si_shader_context
 	int param_streamout_offset[4];
 	int param_vertex_id;
 	int param_rel_auto_id;
+	int param_vs_prim_id;
 	int param_instance_id;
 	int param_tes_u;
 	int param_tes_v;
@@ -486,6 +487,9 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 		return bld_base->uint_bld.zero;
 
 	switch (si_shader_ctx->type) {
+	case TGSI_PROCESSOR_VERTEX:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_vs_prim_id);
 	case TGSI_PROCESSOR_TESS_CTRL:
 		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 				    SI_PARAM_PATCH_ID);
@@ -2027,7 +2031,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct si_shader_output_values *outputs = NULL;
 	int i,j;
 
-	outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));
+	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
 	for (i = 0; i < info->num_outputs; i++) {
 		outputs[i].name = info->output_semantic_name[i];
@@ -2040,7 +2044,19 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 					      "");
 	}
 
-	si_llvm_export_vs(bld_base, outputs, info->num_outputs);
+	/* Export PrimitiveID when PS needs it. */
+	if (si_vs_exports_prim_id(si_shader_ctx->shader)) {
+		outputs[i].name = TGSI_SEMANTIC_PRIMID;
+		outputs[i].sid = 0;
+		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+					       get_primitive_id(bld_base, 0));
+		outputs[i].values[1] = bld_base->base.undef;
+		outputs[i].values[2] = bld_base->base.undef;
+		outputs[i].values[3] = bld_base->base.undef;
+		i++;
+	}
+
+	si_llvm_export_vs(bld_base, outputs, i);
 	FREE(outputs);
 }
 
@@ -3415,7 +3431,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 		/* VGPRs */
 		params[si_shader_ctx->param_vertex_id = num_params++] = i32;
 		params[si_shader_ctx->param_rel_auto_id = num_params++] = i32;
-		params[num_params++] = i32; /* unused */
+		params[si_shader_ctx->param_vs_prim_id = num_params++] = i32;
 		params[si_shader_ctx->param_instance_id = num_params++] = i32;
 		break;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 82e9c915965..cd845c12e64 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -221,6 +221,7 @@ union si_shader_key {
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
+		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
 	} vs;
 	struct {
 		unsigned	prim_mode:3;
@@ -231,6 +232,7 @@ union si_shader_key {
 		 * This describes how outputs are laid out in memory. */
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
+		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
 };
 
@@ -289,6 +291,16 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 		return sctx->vs_shader->current;
 }
 
+static inline bool si_vs_exports_prim_id(struct si_shader *shader)
+{
+	if (shader->selector->type == PIPE_SHADER_VERTEX)
+		return shader->key.vs.export_prim_id;
+	else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+		return shader->key.tes.export_prim_id;
+	else
+		return false;
+}
+
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1cac8041d61..701671b8d99 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3136,7 +3136,6 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
 	si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
 
-	si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0x0);
 	si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0);
 	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index a6480b330cd..044acd8fcea 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -308,6 +308,7 @@ static void si_shader_vs(struct si_shader *shader)
 	uint64_t va;
 	unsigned window_space =
 	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+	bool enable_prim_id = si_vs_exports_prim_id(shader);
 
 	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 
@@ -317,8 +318,12 @@ static void si_shader_vs(struct si_shader *shader)
 	/* If this is the GS copy shader, the GS state writes this register.
 	 * Otherwise, the VS state writes it.
 	 */
-	if (!shader->is_gs_copy_shader)
-		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, 0);
+	if (!shader->is_gs_copy_shader) {
+		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
+			       S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
+		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
+	} else
+		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
@@ -327,7 +332,7 @@ static void si_shader_vs(struct si_shader *shader)
 		vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
 		num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+		vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 		vgpr_comp_cnt = 3; /* all components are needed for TES */
@@ -534,6 +539,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			key->vs.as_es = 1;
 			key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read;
 		}
+
+		if (!sctx->gs_shader && sctx->ps_shader &&
+		    sctx->ps_shader->info.uses_primid)
+			key->vs.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		key->tcs.prim_mode =
@@ -543,7 +552,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		if (sctx->gs_shader) {
 			key->tes.as_es = 1;
 			key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read;
-		}
+		} else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid)
+			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		break;
@@ -977,7 +987,10 @@ bcolor:
 			}
 		}
 
-		if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
+		if (name == TGSI_SEMANTIC_PRIMID)
+			/* PrimID is written after the last output. */
+			tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
+		else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
 			/* No corresponding output found, load defaults into input.
 			 * Don't set any other bits.
 			 * (FLAT_SHADE=1 completely changes behavior) */

From 8ae88105b60be613126ea07492ffd9712e5e71eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 02:28:01 +0200
Subject: [PATCH 1134/1208] radeonsi: enable VS_OUT_MISC_SIDE_BUS_ENA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is recommended for better performance.
Diag tests always enable this.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 701671b8d99..7b452737c86 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -496,6 +496,7 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 					    info->writes_edgeflag ||
 					    info->writes_layer ||
 					     info->writes_viewport_index) |
+		S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) |
 		(sctx->queued.named.rasterizer->clip_plane_enable &
 		 clipdist_mask));
 	r600_write_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,

From bfac8ba9d32be351277c7ea814ac9848bdcb1f16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 11 Aug 2015 22:36:51 +0200
Subject: [PATCH 1135/1208] radeonsi: fix polygon offset scale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The value was copied from r300g, which uses 1/12 subpixels, but this hw
uses 1/16 subpixels.

Fixes piglit: gl-1.4-polygon-offset (formerly a glean test)

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/radeonsi/si_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7b452737c86..32609439659 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -666,7 +666,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 
 	si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
 		S_0286D4_FLAT_SHADE_ENA(1) |

From d335aad11b208bcdcc75a99d4b6c5fc8b69ce368 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 11 Aug 2015 22:36:51 +0200
Subject: [PATCH 1136/1208] r600g: fix polygon offset scale

The value was copied from r300g, which uses 1/12 subpixels, but this hw
uses 1/16 subpixels.

Should fix piglit: gl-1.4-polygon-offset (formerly a glean test)
(untested, ported from radeonsi)

Reviewed-by: Edward O'Callaghan <eocallaghan at alterapraxis.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/r600/evergreen_state.c | 2 +-
 src/gallium/drivers/r600/r600_state.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index f7a76f6cac7..95987ee2f70 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -485,7 +485,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 	rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri;
 
 	if (state->point_size_per_vertex) {
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 848818842aa..5cc2283792d 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -473,7 +473,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 	rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri;
 
 	if (state->point_size_per_vertex) {

From 8c0b943e87b48e7359230825cc06fbdd059a9e58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 11 Aug 2015 21:37:59 +0200
Subject: [PATCH 1137/1208] r600g: allow setting geometry shader sampler states

We were ignoring them. This is both hilarious and sad.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Edward O'Callaghan <eocallaghan at alterapraxis.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/r600/r600_state_common.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 8d0942ff898..ee477919b02 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -407,11 +407,6 @@ static void r600_bind_sampler_states(struct pipe_context *pipe,
 
 	assert(start == 0); /* XXX fix below */
 
-	if (shader != PIPE_SHADER_VERTEX &&
-	    shader != PIPE_SHADER_FRAGMENT) {
-		return;
-	}
-
 	for (i = 0; i < count; i++) {
 		struct r600_pipe_sampler_state *rstate = rstates[i];
 

From 86a72ee48eb371566765566fc778d790bc9ce201 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Wed, 29 Jul 2015 21:49:45 +1000
Subject: [PATCH 1138/1208] mesa: Fix printf format specifier warn of the
 ptrdiff_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See §7.19.6.1, paragraph 7 of the ISO C specification.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/bufferobj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 6ddcc5cad4e..1cdea937f91 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -2373,7 +2373,7 @@ _mesa_map_buffer_range(struct gl_context *ctx,
 
    if (offset + length > bufObj->Size) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(offset %ld + length %ld > buffer_size %ld)", func,
+                  "%s(offset %td + length %td > buffer_size %td)", func,
                   offset, length, bufObj->Size);
       return NULL;
    }

From 24695f4b2738d930a2bc71b4ebc9e5d993980cae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 3 Aug 2015 10:46:33 +0300
Subject: [PATCH 1139/1208] mesa: fix name returned for XFB varyings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_mesa_get_program_resource_name has logic to append '[0]' in name
if variable is an array, this should be skipped for XFB varyings
that have array index already appended.

v2: fix comment, change also GL_NAME_LENGTH query to match
    the behaviour

Fixes:
   ES31-CTS.program_interface_query.transform-feedback-types

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Martin Peres <martin.peres@linux.intel.com>
---
 src/mesa/main/shader_query.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index a85e4c42ae3..ee7320221e2 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -719,6 +719,12 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg,
    bool add_index = !(((programInterface == GL_PROGRAM_INPUT) &&
                        res->StageReferences & (1 << MESA_SHADER_GEOMETRY)));
 
+   /* Transform feedback varyings have array index already appended
+    * in their names.
+    */
+   if (programInterface == GL_TRANSFORM_FEEDBACK_VARYING)
+      add_index = false;
+
    if (add_index && _mesa_program_resource_array_size(res)) {
       int i;
 
@@ -963,11 +969,17 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
    switch(prop) {
    case GL_NAME_LENGTH:
-      if (res->Type == GL_ATOMIC_COUNTER_BUFFER)
+      switch (res->Type) {
+      case GL_ATOMIC_COUNTER_BUFFER:
          goto invalid_operation;
-      /* Base name +3 if array '[0]' + terminator. */
-      *val = strlen(_mesa_program_resource_name(res)) +
-         (_mesa_program_resource_array_size(res) > 0 ? 3 : 0) + 1;
+      case GL_TRANSFORM_FEEDBACK_VARYING:
+         *val = strlen(_mesa_program_resource_name(res)) + 1;
+         break;
+      default:
+         /* Base name +3 if array '[0]' + terminator. */
+         *val = strlen(_mesa_program_resource_name(res)) +
+            (_mesa_program_resource_array_size(res) > 0 ? 3 : 0) + 1;
+      }
       return 1;
    case GL_TYPE:
       switch (res->Type) {

From 853853b2ac527698215b4290629ec242333e264a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Wed, 12 Aug 2015 11:13:40 +0300
Subject: [PATCH 1140/1208] mesa: update MaxShaderStorageBlockSize to 2^27
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extension spec originally required 2^24 but 2^27 is the minimum value
required by OpenGL 4.5 and OpenGL ES 3.1 specifications.

Fixes:
   ES31-CTS.shader_storage_buffer_object.basic-max

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 7451e5a2bdd..888c461d1c2 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -604,7 +604,7 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api)
    /** GL_ARB_shader_storage_buffer_object */
    consts->MaxCombinedShaderStorageBlocks = 8;
    consts->MaxShaderStorageBufferBindings = 8;
-   consts->MaxShaderStorageBlockSize = 16 * 1024 * 1024;
+   consts->MaxShaderStorageBlockSize = 128 * 1024 * 1024; /* 2^27 */
    consts->ShaderStorageBufferOffsetAlignment = 256;
 
    /* GL_ARB_explicit_uniform_location, GL_MAX_UNIFORM_LOCATIONS */

From b4897eb70a994c4630b0fde4a66dd6ace833c33a Mon Sep 17 00:00:00 2001
From: Topi Pohjolainen <topi.pohjolainen@intel.com>
Date: Thu, 25 Jun 2015 14:31:03 +0300
Subject: [PATCH 1141/1208] i965: Rename brw_upload_item_data to
 brw_alloc_item_data

and simplify the interface to take directly the size and to return
the offset. The routine does nothing more than allocate, it doesn't
upload anything.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_state_cache.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 693441c6f49..50c02432e81 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -248,18 +248,17 @@ brw_try_upload_using_copy(struct brw_cache *cache,
    return false;
 }
 
-static void
-brw_upload_item_data(struct brw_cache *cache,
-		     struct brw_cache_item *item,
-		     const void *data)
+static uint32_t
+brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
+   uint32_t offset;
    struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
-   if (cache->next_offset + item->size > cache->bo->size) {
+   if (cache->next_offset + size > cache->bo->size) {
       uint32_t new_size = cache->bo->size * 2;
 
-      while (cache->next_offset + item->size > new_size)
+      while (cache->next_offset + size > new_size)
 	 new_size *= 2;
 
       brw_cache_new_bo(cache, new_size);
@@ -273,10 +272,12 @@ brw_upload_item_data(struct brw_cache *cache,
       brw_cache_new_bo(cache, cache->bo->size);
    }
 
-   item->offset = cache->next_offset;
+   offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
-   cache->next_offset = ALIGN(item->offset + item->size, 64);
+   cache->next_offset = ALIGN(offset + size, 64);
+
+   return offset;
 }
 
 void
@@ -312,7 +313,7 @@ brw_upload_cache(struct brw_cache *cache,
     * compile to the thing in our backend.
     */
    if (!brw_try_upload_using_copy(cache, item, data, aux)) {
-      brw_upload_item_data(cache, item, data);
+      item->offset = brw_alloc_item_data(cache, data_size);
    }
 
    /* Set up the memory containing the key and aux_data */

From 12a66d91f6b0beff123fb6fd8a4f3c3796379532 Mon Sep 17 00:00:00 2001
From: Topi Pohjolainen <topi.pohjolainen@intel.com>
Date: Thu, 25 Jun 2015 14:35:26 +0300
Subject: [PATCH 1142/1208] i965: Only write program to cache when it doesn't
 exist yet

Current logic re-writes the same data when existing data is found.
Not that this actually matters at the moment in practice, the
contraint for finding matching data is too severe to ever allow
data to be shared between two items in the cache.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_state_cache.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 50c02432e81..61439a89f8c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -314,6 +314,13 @@ brw_upload_cache(struct brw_cache *cache,
     */
    if (!brw_try_upload_using_copy(cache, item, data, aux)) {
       item->offset = brw_alloc_item_data(cache, data_size);
+
+      /* Copy data to the buffer */
+      if (brw->has_llc) {
+         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
+      } else {
+         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
+      }
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -332,13 +339,6 @@ brw_upload_cache(struct brw_cache *cache,
    cache->items[hash] = item;
    cache->n_items++;
 
-   /* Copy data to the buffer */
-   if (brw->has_llc) {
-      memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
-   } else {
-      drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-   }
-
    *out_offset = item->offset;
    *(void **)out_aux = (void *)((char *)item->key + item->key_size);
    cache->brw->ctx.NewDriverState |= 1 << cache_id;

From 1bba29ed403e735ba0bf04ed8aa2e571884fcaaf Mon Sep 17 00:00:00 2001
From: Topi Pohjolainen <topi.pohjolainen@intel.com>
Date: Thu, 25 Jun 2015 14:00:41 +0300
Subject: [PATCH 1143/1208] i965: Stop aux data compare preventing program
 binary re-use

Items in the program cache consist of three things: key, the data
representing the instructions and auxiliary data representing
uniform storage. The data consisting of instructions is stored into
a drm buffer object while the key and the auxiliary data reside in
malloced section. Now the cache uploading is equipped with a check
that iterates over existing items and seeks to find a another item
using identical instruction data than the one being just uploaded.
If such is found there is no need to add another section into the
drm buffer object holding identical copy of the existing one. The
item just being uploaded should instead simply point to the same
offset in the underlying drm buffer object.

Unfortunately the check for the matching instruction data is
coupled with a check for matching auxiliary data also. This
effectively prevents the cache from ever containing two items
that could share a section in the drm buffer object.

The constraint for the instruction data and auxiliary data to
match is, fortunately, unnecessary strong. When items are stored
into the cache they will anyway contain their own copy of the
auxiliary data (even if they matched - which they in real world
never will). The only thing the items would be sharing is the
instruction data and hence we should only check for that to match
and nothing else.

No piglit regression in jenkins.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_state_cache.c | 52 ++++++++-------------
 1 file changed, 20 insertions(+), 32 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 61439a89f8c..9e00c837407 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -200,36 +200,23 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 }
 
 /**
- * Attempts to find an item in the cache with identical data and aux
- * data to use
+ * Attempts to find an item in the cache with identical data.
  */
-static bool
-brw_try_upload_using_copy(struct brw_cache *cache,
-			  struct brw_cache_item *result_item,
-			  const void *data,
-			  const void *aux)
+static const struct brw_cache_item *
+brw_lookup_prog(const struct brw_cache *cache,
+                enum brw_cache_id cache_id,
+                const void *data, unsigned data_size)
 {
-   struct brw_context *brw = cache->brw;
+   const struct brw_context *brw = cache->brw;
    int i;
-   struct brw_cache_item *item;
+   const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-	 const void *item_aux = item->key + item->key_size;
 	 int ret;
 
-	 if (item->cache_id != result_item->cache_id ||
-	     item->size != result_item->size ||
-	     item->aux_size != result_item->aux_size) {
+	 if (item->cache_id != cache_id || item->size != data_size)
 	    continue;
-	 }
-
-         if (cache->aux_compare[result_item->cache_id]) {
-            if (!cache->aux_compare[result_item->cache_id](item_aux, aux))
-               continue;
-         } else if (memcmp(item_aux, aux, item->aux_size) != 0) {
-	    continue;
-	 }
 
          if (!brw->has_llc)
             drm_intel_bo_map(cache->bo, false);
@@ -239,13 +226,11 @@ brw_try_upload_using_copy(struct brw_cache *cache,
 	 if (ret)
 	    continue;
 
-	 result_item->offset = item->offset;
-
-	 return true;
+	 return item;
       }
    }
 
-   return false;
+   return NULL;
 }
 
 static uint32_t
@@ -294,6 +279,8 @@ brw_upload_cache(struct brw_cache *cache,
 {
    struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
+   const struct brw_cache_item *matching_data =
+      brw_lookup_prog(cache, cache_id, data, data_size);
    GLuint hash;
    void *tmp;
 
@@ -305,14 +292,15 @@ brw_upload_cache(struct brw_cache *cache,
    hash = hash_key(item);
    item->hash = hash;
 
-   /* If we can find a matching prog/prog_data combo in the cache
-    * already, then reuse the existing stuff.  This will mean not
-    * flagging CACHE_NEW_* when transitioning between the two
-    * equivalent hash keys.  This is notably useful for programs
-    * generating shaders at runtime, where multiple shaders may
-    * compile to the thing in our backend.
+   /* If we can find a matching prog in the cache already, then reuse the
+    * existing stuff without creating new copy into the underlying buffer
+    * object. This is notably useful for programs generating shaders at
+    * runtime, where multiple shaders may compile to the same thing in our
+    * backend.
     */
-   if (!brw_try_upload_using_copy(cache, item, data, aux)) {
+   if (matching_data) {
+      item->offset = matching_data->offset;
+   } else {
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */

From b8f63b3c1002eea9cc6d54191bd41ea43c467e96 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 12 Aug 2015 17:01:52 +1000
Subject: [PATCH 1144/1208] glsl: make linker error message more informative

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/linker.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index d7efea5e05d..a7cd82049bd 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2880,8 +2880,9 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
       if (sh) {
          if (sh->NumImages > ctx->Const.Program[i].MaxImageUniforms)
-            linker_error(prog, "Too many %s shader image uniforms\n",
-                         _mesa_shader_stage_to_string(i));
+            linker_error(prog, "Too many %s shader image uniforms (%u > %u)\n",
+                         _mesa_shader_stage_to_string(i), sh->NumImages,
+                         ctx->Const.Program[i].MaxImageUniforms);
 
          total_image_units += sh->NumImages;
 

From 8dffa89e013b611cdafbb2cc5216450fa248cb7c Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Thu, 13 Aug 2015 18:42:54 +1000
Subject: [PATCH 1145/1208] mesa: remove extern from texture function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/teximage.c | 2 +-
 src/mesa/main/teximage.h | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index d35dc12e2cb..a144f950d44 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -5595,7 +5595,7 @@ check_multisample_target(GLuint dims, GLenum target, bool dsa)
 }
 
 
-void
+static void
 _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
                                 struct gl_texture_object *texObj,
                                 GLenum target, GLsizei samples,
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index 1eebaa8b631..bf729daf534 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -199,15 +199,6 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
                              GLsizei width, GLsizei height,
                              const char *caller);
 
-extern void
-_mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
-                                struct gl_texture_object *texObj,
-                                GLenum target, GLsizei samples,
-                                GLint internalformat, GLsizei width,
-                                GLsizei height, GLsizei depth,
-                                GLboolean fixedsamplelocations,
-                                GLboolean immutable, const char *func);
-
 extern void
 _mesa_texture_buffer_range(struct gl_context *ctx,
                            struct gl_texture_object *texObj,

From 2900e8ca9077d20c5b29bb5a4171ac59ea9d1767 Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Tue, 4 Aug 2015 14:32:43 +0100
Subject: [PATCH 1146/1208] egl/x11: fix use of EGL_BAD_NATIVE_WINDOW

Commit 4ed23fd590 introduced some calls to _eglError inappropriately
passing it EGL_BAD_NATIVE_WINDOW. This was actually harmless in two of the
cases as _eglError gets called later on with a more appropriate error code
but (just to be safe) switch these to _eglLog calls instead.

The final case is a little trickier as it actually needs to set an error
of which the following are available (according to the EGL spec):
EGL_BAD_MATCH, EGL_BAD_CONFIG, EGL_BAD_NATIVE_(PIXMAP|WINDOW) and
EGL_BAD_ALLOC.

Of these, EGL_BAD_ALLOC seems to be the most appropriate given that
failure can occur either as a result of xcb_get_setup failing due to an
earlier error on the connection (where the most commonly occurring error
code is XCB_CONN_CLOSED_MEM_INSUFFICIENT) or as a result of the
xcb_screen_iterator_t 'rem' field being 0.

In addition to this, commit af2aea40d2 unconditionally set the error to
EGL_BAD_NATIVE_WINDOW when creating a window or pixmap surface with a NULL
native handle. Change this to correctly set the error based on surface
type.

v2: Updated patch description (Emil Velikov)
    Return EGL_BAD_NATIVE_PIXMAP when eglCreatePixmapSurface is called
    with a NULL native pixmap handle

Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 9b040f7d491..3b5277738ae 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -226,7 +226,7 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
       s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
       screen = get_xcb_screen(s, dri2_dpy->screen);
       if (!screen) {
-         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         _eglError(EGL_BAD_ALLOC, "failed to get xcb screen");
          goto cleanup_surf;
       }
 
@@ -236,7 +236,10 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
 			dri2_surf->base.Width, dri2_surf->base.Height);
    } else {
       if (!drawable) {
-         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP, "dri2_create_surface");
          goto cleanup_surf;
       }
       dri2_surf->drawable = drawable;
@@ -544,7 +547,7 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
    s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
    screen = get_xcb_screen(s, dri2_dpy->screen);
    if (!screen) {
-      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_x11_connect");
+      _eglLog(_EGL_WARNING, "DRI2: failed to get xcb screen");
       return EGL_FALSE;
    }
    connect_cookie = xcb_dri2_connect_unchecked(dri2_dpy->conn, screen->root,
@@ -635,7 +638,7 @@ dri2_x11_authenticate(_EGLDisplay *disp, uint32_t id)
 
    screen = get_xcb_screen(s, dri2_dpy->screen);
    if (!screen) {
-      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_x11_authenticate");
+      _eglLog(_EGL_WARNING, "DRI2: failed to get xcb screen");
       return -1;
    }
 

From 3b491cbc42f6cfad2e750957f720b15b95278acf Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Tue, 4 Aug 2015 14:32:44 +0100
Subject: [PATCH 1147/1208] egl/x11: set EGL_BAD_NATIVE_(PIXMAP|WINDOW) for
 invalid pixmaps/windows

Both eglCreatePixmapSurface and eglCreateWindowSurface were incorrectly
setting the EGL error to be EGL_BAD_ALLOC when an invalid native drawable
handle was being passed in. The EGL spec states the following for
eglCreatePixmapSurface:

	"If pixmap is not a valid native pixmap handle, then an EGL_BAD_-
	 NATIVE_PIXMAP error should be generated."

(eglCreateWindowSurface has similar text)

Correctly set the EGL error value based on xcb_get_geometry_reply returning
an error structure containing something other than BadAlloc.

v2: Check for BadAlloc error and update commit message to reflect this

Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 3b5277738ae..459c3914ad6 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -268,10 +268,18 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    if (type != EGL_PBUFFER_BIT) {
       cookie = xcb_get_geometry (dri2_dpy->conn, dri2_surf->drawable);
       reply = xcb_get_geometry_reply (dri2_dpy->conn, cookie, &error);
-      if (reply == NULL || error != NULL) {
-	 _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
-	 free(error);
-	 goto cleanup_dri_drawable;
+      if (error != NULL) {
+         if (error->error_code == BadAlloc)
+            _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
+         else if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW, "xcb_get_geometry");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP, "xcb_get_geometry");
+         free(error);
+         goto cleanup_dri_drawable;
+      } else if (reply == NULL) {
+         _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
+         goto cleanup_dri_drawable;
       }
 
       dri2_surf->base.Width = reply->width;

From 9a4eae61c24858d69d731d63b141d2acaed40d69 Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Tue, 4 Aug 2015 14:32:45 +0100
Subject: [PATCH 1148/1208] egl/x11: don't abort when creating a DRI2 drawable
 fails

When calling either eglCreateWindowSurface or eglCreatePixmapSurface it
was possible for an application to be aborted as a result of it failing
to create a DRI2 drawable on the server. This could happen due to an
application passing in an invalid native drawable handle, for example.

v2: Handle the case where an error has been set on the connection

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/drivers/dri2/platform_x11.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 459c3914ad6..bf7d2bea4c1 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -289,7 +289,25 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    }
 
    if (dri2_dpy->dri2) {
-      xcb_dri2_create_drawable (dri2_dpy->conn, dri2_surf->drawable);
+      xcb_void_cookie_t cookie;
+      int conn_error;
+
+      cookie = xcb_dri2_create_drawable_checked(dri2_dpy->conn,
+                                                dri2_surf->drawable);
+      error = xcb_request_check(dri2_dpy->conn, cookie);
+      conn_error = xcb_connection_has_error(dri2_dpy->conn);
+      if (conn_error || error != NULL) {
+         if (type == EGL_PBUFFER_BIT || conn_error || error->error_code == BadAlloc)
+            _eglError(EGL_BAD_ALLOC, "xcb_dri2_create_drawable_checked");
+         else if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW,
+                      "xcb_dri2_create_drawable_checked");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP,
+                      "xcb_dri2_create_drawable_checked");
+         free(error);
+         goto cleanup_dri_drawable;
+      }
    } else {
       if (type == EGL_PBUFFER_BIT) {
          dri2_surf->depth = _eglGetConfigKey(conf, EGL_BUFFER_SIZE);

From 21b2c6fd5ea5ec2a810945c3c61b14d93a53991d Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Wed, 12 Aug 2015 16:35:59 +0100
Subject: [PATCH 1149/1208] egl: don't allow eglGetConfigs to set num_configs
 param to a negative value

When a buffer is provided to eglGetConfigs it's supposed to set the value
of the num_config parameter to the total number of configs that have been
copied into this buffer. For some reason the EGL spec doesn't consider it
to be an error to pass this function a buffer while specifying its size to
be less than 0. Given this, one would expect this combination to result in
the num_config parameter being set to 0 but this wasn't the case. This was
due to the buffer size being copied straight into num_configs without being
clamped to 0.

This was causing the following dEQP EGL test to fail:
dEQP-EGL.functional.query_config.get_configs.get_configs_bounds

Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/main/eglarray.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/egl/main/eglarray.c b/src/egl/main/eglarray.c
index 3ccc8a649f0..d2f39af49a6 100644
--- a/src/egl/main/eglarray.c
+++ b/src/egl/main/eglarray.c
@@ -197,6 +197,9 @@ _eglFlattenArray(_EGLArray *array, void *buffer, EGLint elem_size, EGLint size,
 
    count = array->Size;
    if (buffer) {
+      /* clamp size to 0 */
+      if (size < 0)
+         size = 0;
       /* do not exceed buffer size */
       if (count > size)
          count = size;

From d9603be038b6d30f17ca7c05e60cc78100a625ac Mon Sep 17 00:00:00 2001
From: Frank Binns <frank.binns@imgtec.com>
Date: Wed, 12 Aug 2015 16:36:00 +0100
Subject: [PATCH 1150/1208] egl: improve attribute checking for
 eglCreateContext

The EGL 1.4 spec states for eglCreateContext:

	"attribute EGL_CONTEXT_CLIENT_VERSION is only valid when the current
	 rendering API is EGL_OPENGL_ES_API"

Additionally, if the EGL_KHR_create_context EGL extension is supported
(this is mandatory in EGL 1.5) then the EGL_CONTEXT_MAJOR_VERSION_KHR,
which is an alias for EGL_CONTEXT_CLIENT_VERSION, and
EGL_CONTEXT_MINOR_VERSION_KHR attributes are also accepted by
eglCreateContext with the extension spec stating:

	"The values for attributes EGL_CONTEXT_MAJOR_VERSION_KHR and
	 EGL_CONTEXT_MINOR_VERSION_KHR specify the requested client API
	 version. They are only meaningful for OpenGL and OpenGL ES
	 contexts, and specifying them for other types of contexts will
	 generate an error."

Add the necessary checks against the extension and rendering APIs when
validating these attributes as part of eglCreateContext.

Signed-off-by: Frank Binns <frank.binns@imgtec.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
[Emil Velikov: Add newline before the spec quote (Matt)]
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 src/egl/main/eglcontext.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c
index e767f4b1abe..588f48921f2 100644
--- a/src/egl/main/eglcontext.c
+++ b/src/egl/main/eglcontext.c
@@ -101,11 +101,42 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy,
 
       switch (attr) {
       case EGL_CONTEXT_CLIENT_VERSION:
+         /* The EGL 1.4 spec says:
+          *
+          *     "attribute EGL_CONTEXT_CLIENT_VERSION is only valid when the
+          *      current rendering API is EGL_OPENGL_ES_API"
+          *
+          * The EGL_KHR_create_context spec says:
+          *
+          *     "EGL_CONTEXT_MAJOR_VERSION_KHR           0x3098
+          *      (this token is an alias for EGL_CONTEXT_CLIENT_VERSION)"
+          *
+          *     "The values for attributes EGL_CONTEXT_MAJOR_VERSION_KHR and
+          *      EGL_CONTEXT_MINOR_VERSION_KHR specify the requested client API
+          *      version. They are only meaningful for OpenGL and OpenGL ES
+          *      contexts, and specifying them for other types of contexts will
+          *      generate an error."
+          */
+         if ((api != EGL_OPENGL_ES_API &&
+             (!dpy->Extensions.KHR_create_context || api != EGL_OPENGL_API))) {
+               err = EGL_BAD_ATTRIBUTE;
+               break;
+         }
+
          ctx->ClientMajorVersion = val;
          break;
 
       case EGL_CONTEXT_MINOR_VERSION_KHR:
-         if (!dpy->Extensions.KHR_create_context) {
+         /* The EGL_KHR_create_context spec says:
+          *
+          *     "The values for attributes EGL_CONTEXT_MAJOR_VERSION_KHR and
+          *      EGL_CONTEXT_MINOR_VERSION_KHR specify the requested client API
+          *      version. They are only meaningful for OpenGL and OpenGL ES
+          *      contexts, and specifying them for other types of contexts will
+          *      generate an error."
+          */
+         if (!dpy->Extensions.KHR_create_context ||
+             (api != EGL_OPENGL_ES_API && api != EGL_OPENGL_API)) {
             err = EGL_BAD_ATTRIBUTE;
             break;
          }

From 28ed1e08e8ba98ebd4ff0b56326372f0df9c73ad Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Fri, 7 Aug 2015 13:58:37 -0700
Subject: [PATCH 1151/1208] i965/skl: Remove early platform support

We do not want bug reports from this early stepping of SKL. Few if any were ever
shipped outside of Intel to early enabling partners, and none will be sold.

There is a functional change here. If you're using new mesa on an old
kernel/libdrm, the revid will be -1, and we'll use new SKL values instead of
early ones (a hopefully irrelevant improvement IMO).

v2: Remove hunk which warned before dying. Instead, default to normal SKL
support (Ken)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Neil Roberts <neil@linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index be517e826c4..7ad3a2fb7b4 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -322,11 +322,6 @@ static const struct brw_device_info brw_device_info_chv = {
       .max_gs_entries = 640,                        \
    }
 
-static const struct brw_device_info brw_device_info_skl_early = {
-   GEN9_FEATURES, .gt = 1,
-   .supports_simd16_3src = false,
-};
-
 static const struct brw_device_info brw_device_info_skl_gt1 = {
    GEN9_FEATURES, .gt = 1,
 };
@@ -376,10 +371,5 @@ brw_get_device_info(int devid, int revision)
       return NULL;
    }
 
-   if (devinfo->gen == 9 &&
-       !devinfo->is_broxton &&
-       (revision == 2 || revision == 3 || revision == -1))
-      return &brw_device_info_skl_early;
-
    return devinfo;
 }

From 7a144aaf64e5cfff5aa53d7fd340c91762e51aa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Thu, 13 Aug 2015 09:30:35 +0300
Subject: [PATCH 1152/1208] mesa: set correct error for non-renderable
 multisample textures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: same common error on gles31 and desktop OpenGL
    (spotted by Erik Faye-Lund)

Signed-off-by: Marta Lofstedt <marta.lofstedt@linux.intel.com>
Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/mesa/main/teximage.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index a144f950d44..3a556a6ad6e 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -5639,9 +5639,16 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    }
 
    if (!is_renderable_texture_format(ctx, internalformat)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-            "%s(internalformat=%s)",
-            func, _mesa_enum_to_string(internalformat));
+      /* Page 172 of OpenGL ES 3.1 spec says:
+       *   "An INVALID_ENUM error is generated if sizedinternalformat is not
+       *   color-renderable, depth-renderable, or stencil-renderable (as
+       *   defined in section 9.4).
+       *
+       *  (Same error is also defined for desktop OpenGL for multisample
+       *  teximage/texstorage functions.)
+       */
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalformat=%s)", func,
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 

From 7f4ad692a10bf0f247dedd4968b7ffe9b07d2af2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Fri, 14 Aug 2015 15:16:12 +0900
Subject: [PATCH 1153/1208] st/clover: Fix build against LLVM 3.8 SVN r244928

raw_svector_ostream::flush() is now unnecessary and forbidden:

  CXX      llvm/libclllvm_la-invocation.lo
../../../../../src/gallium/state_trackers/clover/llvm/invocation.cpp: In function 'clover::module {anonymous}::build_module_llvm(llvm::Module*, unsigned int (&)[7])':
../../../../../src/gallium/state_trackers/clover/llvm/invocation.cpp:574:29: error: use of deleted function 'void llvm::raw_svector_ostream::flush()'
       bitcode_ostream.flush();
                             ^
In file included from /home/daenzer/src/llvm-git/llvm/include/clang/Basic/VirtualFileSystem.h:22:0,
                 from /home/daenzer/src/llvm-git/llvm/include/clang/Basic/FileManager.h:20,
                 from /home/daenzer/src/llvm-git/llvm/include/clang/Basic/SourceManager.h:38,
                 from /home/daenzer/src/llvm-git/llvm/include/clang/Frontend/CompilerInstance.h:16,
                 from ../../../../../src/gallium/state_trackers/clover/llvm/invocation.cpp:25:
/home/daenzer/src/llvm-git/llvm/include/llvm/Support/raw_ostream.h:512:8: note: declared here
   void flush() = delete;
        ^
Makefile:862: recipe for target 'llvm/libclllvm_la-invocation.lo' failed

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/gallium/state_trackers/clover/llvm/invocation.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 86859af3530..63c3f8ee49b 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -571,7 +571,9 @@ namespace {
       llvm::raw_svector_ostream bitcode_ostream(llvm_bitcode);
       llvm::BitstreamWriter writer(llvm_bitcode);
       llvm::WriteBitcodeToFile(mod, bitcode_ostream);
+#if HAVE_LLVM < 0x0308
       bitcode_ostream.flush();
+#endif
 
       const std::vector<llvm::Function *> kernels = find_kernels(mod);
       for (unsigned i = 0; i < kernels.size(); ++i) {

From 78493c33183bf2a4b8be0e58963162ef2e3aa54a Mon Sep 17 00:00:00 2001
From: Zoltan Gilian <zoltan.gilian@gmail.com>
Date: Tue, 7 Jul 2015 23:38:27 +0200
Subject: [PATCH 1154/1208] r600,compute: setup compute sampler states and
 views
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: Add compute mode flag to sampler state setup (Marek).
    Drop branches which avoid reference counting (Marek).
    Simplify unset branch condition (Marek).

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 25 +++--------
 src/gallium/drivers/r600/evergreen_state.c   | 46 +++++++++++++++-----
 src/gallium/drivers/r600/evergreend.h        |  5 +++
 src/gallium/drivers/r600/r600_pipe.h         |  7 +--
 src/gallium/drivers/r600/r600_state_common.c | 11 +++--
 5 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 51003309654..c52e43e9c2a 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -487,6 +487,12 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 	/* Emit constant buffer state */
 	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 
+	/* Emit sampler state */
+	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
+
+	/* Emit sampler view (texture resource) state */
+	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
+
 	/* Emit compute shader state */
 	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
 
@@ -655,25 +661,6 @@ static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 	}
 }
 
-void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
-		unsigned start_slot, unsigned count,
-		struct pipe_sampler_view **views)
-{
-	struct r600_pipe_sampler_view **resource =
-		(struct r600_pipe_sampler_view **)views;
-
-	for (unsigned i = 0; i < count; i++)	{
-		if (resource[i]) {
-			assert(i+1 < 12);
-			/* XXX: Implement */
-			assert(!"Compute samplers not implemented.");
-			///FETCH0 = VTX0 (param buffer),
-			//FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
-		}
-	}
-}
-
-
 static void evergreen_set_global_binding(
 	struct pipe_context *ctx_, unsigned first, unsigned n,
 	struct pipe_resource **resources,
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 95987ee2f70..6a91d4709f4 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1984,7 +1984,7 @@ static void evergreen_emit_cs_constant_buffers(struct r600_context *rctx, struct
 
 static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					 struct r600_samplerview_state *state,
-					 unsigned resource_id_base)
+					 unsigned resource_id_base, unsigned pkt_flags)
 {
 	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
@@ -1997,7 +1997,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 		rview = state->views[resource_index];
 		assert(rview);
 
-		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0));
+		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags);
 		radeon_emit(cs, (resource_id_base + resource_index) * 8);
 		radeon_emit_array(cs, rview->tex_resource_words, 8);
 
@@ -2006,11 +2006,11 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					      rview->tex_resource->b.b.nr_samples > 1 ?
 						      RADEON_PRIO_SHADER_TEXTURE_MSAA :
 						      RADEON_PRIO_SHADER_TEXTURE_RO);
-		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
 		radeon_emit(cs, reloc);
 
 		if (!rview->skip_mip_address_reloc) {
-			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
 			radeon_emit(cs, reloc);
 		}
 	}
@@ -2019,23 +2019,33 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 
 static void evergreen_emit_vs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, 176 + R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views,
+	                             176 + R600_MAX_CONST_BUFFERS, 0);
 }
 
 static void evergreen_emit_gs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, 336 + R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views,
+	                             336 + R600_MAX_CONST_BUFFERS, 0);
 }
 
 static void evergreen_emit_ps_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views,
+	                             R600_MAX_CONST_BUFFERS, 0);
+}
+
+static void evergreen_emit_cs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
+{
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views,
+	                             816 + 2, RADEON_CP_PACKET3_COMPUTE_MODE);
 }
 
 static void evergreen_emit_sampler_states(struct r600_context *rctx,
 				struct r600_textures_info *texinfo,
 				unsigned resource_id_base,
-				unsigned border_index_reg)
+				unsigned border_index_reg,
+				unsigned pkt_flags)
 {
 	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
@@ -2047,7 +2057,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 		rstate = texinfo->states.states[i];
 		assert(rstate);
 
-		radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0));
+		radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0) | pkt_flags);
 		radeon_emit(cs, (resource_id_base + i) * 3);
 		radeon_emit_array(cs, rstate->tex_sampler_words, 3);
 
@@ -2062,17 +2072,27 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 
 static void evergreen_emit_vs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, R_00A414_TD_VS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18,
+	                              R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0);
 }
 
 static void evergreen_emit_gs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, R_00A428_TD_GS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36,
+	                              R_00A428_TD_GS_SAMPLER0_BORDER_INDEX, 0);
 }
 
 static void evergreen_emit_ps_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, R_00A400_TD_PS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0,
+	                              R_00A400_TD_PS_SAMPLER0_BORDER_INDEX, 0);
+}
+
+static void evergreen_emit_cs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
+{
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE], 90,
+	                              R_00A464_TD_CS_SAMPLER0_BORDER_INDEX,
+	                              RADEON_CP_PACKET3_COMPUTE_MODE);
 }
 
 static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a)
@@ -3435,12 +3455,14 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].states.atom, id++, evergreen_emit_vs_sampler_states, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].states.atom, id++, evergreen_emit_gs_sampler_states, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].states.atom, id++, evergreen_emit_ps_sampler_states, 0);
+	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom, id++, evergreen_emit_cs_sampler_states, 0);
 	/* resources */
 	r600_init_atom(rctx, &rctx->vertex_buffer_state.atom, id++, evergreen_fs_emit_vertex_buffers, 0);
 	r600_init_atom(rctx, &rctx->cs_vertex_buffer_state.atom, id++, evergreen_cs_emit_vertex_buffers, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views.atom, id++, evergreen_emit_vs_sampler_views, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views.atom, id++, evergreen_emit_gs_sampler_views, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views.atom, id++, evergreen_emit_ps_sampler_views, 0);
+	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom, id++, evergreen_emit_cs_sampler_views, 0);
 
 	r600_init_atom(rctx, &rctx->vgt_state.atom, id++, r600_emit_vgt_state, 10);
 
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index cd4ff46b103..ad6ad434b78 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -1253,6 +1253,11 @@
 #define R_00A430_TD_GS_SAMPLER0_BORDER_GREEN         0x00A430
 #define R_00A434_TD_GS_SAMPLER0_BORDER_BLUE          0x00A434
 #define R_00A438_TD_GS_SAMPLER0_BORDER_ALPHA         0x00A438
+#define R_00A464_TD_CS_SAMPLER0_BORDER_INDEX         0x00A464
+#define R_00A468_TD_CS_SAMPLER0_BORDER_RED           0x00A468
+#define R_00A46C_TD_CS_SAMPLER0_BORDER_GREEN         0x00A46C
+#define R_00A470_TD_CS_SAMPLER0_BORDER_BLUE          0x00A470
+#define R_00A474_TD_CS_SAMPLER0_BORDER_ALPHA         0x00A474
 
 #define R_03C000_SQ_TEX_SAMPLER_WORD0_0              0x03C000
 #define   S_03C000_CLAMP_X(x)                          (((x) & 0x7) << 0)
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 5d10bb48157..9b66105641a 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -36,7 +36,7 @@
 #include "util/list.h"
 #include "util/u_transfer.h"
 
-#define R600_NUM_ATOMS 73
+#define R600_NUM_ATOMS 75
 
 #define R600_MAX_VIEWPORTS 16
 
@@ -589,11 +589,6 @@ void compute_memory_pool_delete(struct compute_memory_pool* pool);
 struct compute_memory_pool* compute_memory_pool_new(
 	struct r600_screen *rscreen);
 
-/* evergreen_compute.c */
-void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
-                                   unsigned start_slot, unsigned count,
-                                   struct pipe_sampler_view **views);
-
 /* evergreen_state.c */
 struct pipe_sampler_view *
 evergreen_create_sampler_view_custom(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ee477919b02..aa4a8d0240f 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -407,6 +407,11 @@ static void r600_bind_sampler_states(struct pipe_context *pipe,
 
 	assert(start == 0); /* XXX fix below */
 
+	if (!states) {
+		disable_mask = ~0u;
+		count = 0;
+	}
+
 	for (i = 0; i < count; i++) {
 		struct r600_pipe_sampler_state *rstate = rstates[i];
 
@@ -596,9 +601,9 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,
 
 	assert(start == 0); /* XXX fix below */
 
-	if (shader == PIPE_SHADER_COMPUTE) {
-		evergreen_set_cs_sampler_view(pipe, start, count, views);
-		return;
+	if (!views) {
+		disable_mask = ~0u;
+		count = 0;
 	}
 
 	remaining_mask = dst->views.enabled_mask & disable_mask;

From 44dc1d307d7eacef0d6f1618ba0fb7f62e08f896 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 19:37:01 +0200
Subject: [PATCH 1155/1208] gallium: add support for GLES texture float
 extensions (v3)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=74329

v2: add a CAP for half floats
    drivers should not expose the CAPs if they don't support the formats

v3: update relnotes

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 docs/relnotes/11.0.0.html                        | 4 ++++
 src/gallium/docs/source/screen.rst               | 6 ++++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 2 ++
 src/gallium/drivers/i915/i915_screen.c           | 2 ++
 src/gallium/drivers/ilo/ilo_screen.c             | 2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c         | 2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 2 ++
 src/gallium/drivers/r300/r300_screen.c           | 2 ++
 src/gallium/drivers/r600/r600_pipe.c             | 2 ++
 src/gallium/drivers/radeonsi/si_pipe.c           | 2 ++
 src/gallium/drivers/softpipe/sp_screen.c         | 2 ++
 src/gallium/drivers/svga/svga_screen.c           | 2 ++
 src/gallium/drivers/vc4/vc4_screen.c             | 2 ++
 src/gallium/include/pipe/p_defines.h             | 2 ++
 src/mesa/state_tracker/st_extensions.c           | 8 ++++++++
 17 files changed, 46 insertions(+)

diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
index 2d80198d420..b5077e7c6a0 100644
--- a/docs/relnotes/11.0.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -58,6 +58,10 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
+<li>GL_OES_texture_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_half_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_half_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
 <li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
 <li>EGL_KHR_gl_colorspace on r600, radeonsi, nv50, nvc0</li>
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index dbdccc7fcef..c0b5eb31d05 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -258,6 +258,12 @@ The integer capabilities:
   How many per-patch outputs and inputs are supported between tessellation
   control and tessellation evaluation shaders, not counting in TESSINNER and
   TESSOUTER. The minimum allowed value for OpenGL is 30.
+* ``PIPE_CAP_TEXTURE_FLOAT_LINEAR``: Whether the linear minification and
+  magnification filters are supported with single-precision floating-point
+  textures.
+* ``PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR``: Whether the linear minification and
+  magnification filters are supported with half-precision floating-point
+  textures.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 3aeed57b043..295ce7e8187 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -230,6 +230,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 6083687962f..7608ca5e1a9 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -244,6 +244,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 338643e65bb..9f150bbd7b2 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -451,6 +451,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_SM5:
       return 0;
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return true;
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 1c6c82ea115..539bb448bc6 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -288,6 +288,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
       return 0;
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 97cf058ce29..73bcd5b0d3f 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -164,6 +164,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index d869544380a..1e198774593 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -176,6 +176,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLIP_HALFZ:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 5c382b8bdae..0c25ec43120 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -175,6 +175,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLIP_HALFZ:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index f1df79ccb21..74aae379c09 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -192,6 +192,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
         case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index c4f5c742b9a..a0fe1d355c9 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -270,6 +270,8 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CLIP_HALFZ:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return 1;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e29b1586107..d60c258709d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -257,6 +257,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index ed8e5457f27..49a21d9c06b 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -234,6 +234,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
       return 1;
    case PIPE_CAP_CLIP_HALFZ:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_VERTEXID_NOBASE:
       return 0;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index ac5abe8f89d..7b70272a828 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -310,6 +310,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index a20818da4d2..72f08262b66 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -177,6 +177,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 85328264cc3..83a3e2bb377 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -607,6 +607,8 @@ enum pipe_cap
    PIPE_CAP_RESOURCE_FROM_USER_MEMORY,
    PIPE_CAP_DEVICE_RESET_STATUS_QUERY,
    PIPE_CAP_MAX_SHADER_PATCH_VARYINGS,
+   PIPE_CAP_TEXTURE_FLOAT_LINEAR,
+   PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 13d636fede0..ee14019c526 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -474,6 +474,8 @@ void st_init_extensions(struct pipe_screen *screen,
        * support the GL_POINT_SPRITE_R_MODE_NV option. */
 
       { o(OES_standard_derivatives),         PIPE_CAP_SM3                              },
+      { o(OES_texture_float_linear),         PIPE_CAP_TEXTURE_FLOAT_LINEAR             },
+      { o(OES_texture_half_float_linear),    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR        },
       { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
       { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
       { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
@@ -492,6 +494,12 @@ void st_init_extensions(struct pipe_screen *screen,
         { PIPE_FORMAT_R32G32B32A32_FLOAT,
           PIPE_FORMAT_R16G16B16A16_FLOAT } },
 
+      { { o(OES_texture_float) },
+        { PIPE_FORMAT_R32G32B32A32_FLOAT } },
+
+      { { o(OES_texture_half_float) },
+        { PIPE_FORMAT_R16G16B16A16_FLOAT } },
+
       { { o(ARB_texture_rgb10_a2ui) },
         { PIPE_FORMAT_R10G10B10A2_UINT,
           PIPE_FORMAT_B10G10R10A2_UINT },

From 2ebb8efa08b4ea290b8a2bb9aa2e3784b8272d87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 19:53:22 +0200
Subject: [PATCH 1156/1208] st/mesa: small cleanup in st_extensions.c

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan at alterapraxis.com>
---
 src/mesa/state_tracker/st_extensions.c | 40 ++++++--------------------
 1 file changed, 9 insertions(+), 31 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index ee14019c526..7864cf8cb42 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -434,12 +434,14 @@ void st_init_extensions(struct pipe_screen *screen,
 
    static const struct st_extension_cap_mapping cap_mapping[] = {
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
-      { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT },
+      { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
+      { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
       { o(ARB_instanced_arrays),             PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR  },
       { o(ARB_occlusion_query),              PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
@@ -449,6 +451,8 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
       { o(ARB_shadow),                       PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_texture_buffer_object),        PIPE_CAP_TEXTURE_BUFFER_OBJECTS           },
+      { o(ARB_texture_gather),               PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS    },
       { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
       { o(ARB_texture_non_power_of_two),     PIPE_CAP_NPOT_TEXTURES                    },
       { o(ARB_timer_query),                  PIPE_CAP_QUERY_TIMESTAMP                  },
@@ -469,6 +473,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ATI_separate_stencil),             PIPE_CAP_TWO_SIDED_STENCIL                },
       { o(ATI_texture_mirror_once),          PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
       { o(NV_conditional_render),            PIPE_CAP_CONDITIONAL_RENDER               },
+      { o(NV_primitive_restart),             PIPE_CAP_PRIMITIVE_RESTART                },
       { o(NV_texture_barrier),               PIPE_CAP_TEXTURE_BARRIER                  },
       /* GL_NV_point_sprite is not supported by gallium because we don't
        * support the GL_POINT_SPRITE_R_MODE_NV option. */
@@ -581,7 +586,8 @@ void st_init_extensions(struct pipe_screen *screen,
           PIPE_FORMAT_R8G8B8A8_UNORM },
         GL_TRUE }, /* at least one format must be supported */
 
-      { { o(ARB_stencil_texturing) },
+      { { o(ARB_stencil_texturing),
+          o(ARB_texture_stencil8) },
         { PIPE_FORMAT_X24S8_UINT,
           PIPE_FORMAT_S8X24_UINT },
         GL_TRUE }, /* at least one format must be supported */
@@ -675,9 +681,6 @@ void st_init_extensions(struct pipe_screen *screen,
                           ARRAY_SIZE(vertex_mapping), PIPE_BUFFER,
                           PIPE_BIND_VERTEX_BUFFER);
 
-   if (extensions->ARB_stencil_texturing)
-      extensions->ARB_texture_stencil8 = GL_TRUE;
-
    /* Figure out GLSL support. */
    glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
 
@@ -749,27 +752,11 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->ANGLE_texture_compression_dxt = GL_FALSE;
    }
 
-   if (screen->get_shader_param(screen, PIPE_SHADER_GEOMETRY,
-                                PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
-#if 0 /* XXX re-enable when GLSL compiler again supports geometry shaders */
-      extensions->ARB_geometry_shader4 = GL_TRUE;
-#endif
-   }
-
    if (screen->get_shader_param(screen, PIPE_SHADER_TESS_CTRL,
                                 PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
       extensions->ARB_tessellation_shader = GL_TRUE;
    }
 
-   if (screen->get_param(screen, PIPE_CAP_PRIMITIVE_RESTART)) {
-      extensions->NV_primitive_restart = GL_TRUE;
-   }
-
-   /* ARB_color_buffer_float. */
-   if (screen->get_param(screen, PIPE_CAP_VERTEX_COLOR_UNCLAMPED)) {
-      extensions->ARB_color_buffer_float = GL_TRUE;
-   }
-
    if (screen->fence_finish) {
       extensions->ARB_sync = GL_TRUE;
    }
@@ -854,9 +841,7 @@ void st_init_extensions(struct pipe_screen *screen,
    consts->MinMapBufferAlignment =
       screen->get_param(screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
 
-   if (screen->get_param(screen, PIPE_CAP_TEXTURE_BUFFER_OBJECTS)) {
-      extensions->ARB_texture_buffer_object = GL_TRUE;
-
+   if (extensions->ARB_texture_buffer_object) {
       consts->MaxTextureBufferSize =
          _min(screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE),
               (1u << 31) - 1);
@@ -871,10 +856,6 @@ void st_init_extensions(struct pipe_screen *screen,
                              PIPE_BIND_SAMPLER_VIEW);
    }
 
-   if (screen->get_param(screen, PIPE_CAP_MIXED_FRAMEBUFFER_SIZES)) {
-      extensions->ARB_framebuffer_object = GL_TRUE;
-   }
-
    /* Unpacking a varying in the fragment shader costs 1 texture indirection.
     * If the number of available texture indirections is very limited, then we
     * prefer to disable varying packing rather than run the risk of varying
@@ -899,9 +880,6 @@ void st_init_extensions(struct pipe_screen *screen,
          extensions->AMD_vertex_shader_viewport_index = GL_TRUE;
    }
 
-   if (consts->MaxProgramTextureGatherComponents > 0)
-      extensions->ARB_texture_gather = GL_TRUE;
-
    /* GL_ARB_ES3_compatibility.
     *
     * Assume that ES3 is supported if GLSL 3.30 is supported.

From 3b7800e75089d4dc8ed9b2a0ce994760c167b93a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 02:11:48 +0200
Subject: [PATCH 1157/1208] gallium: add an interface for EXT_depth_bounds_test

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/docs/source/screen.rst               | 3 +++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 src/gallium/include/pipe/p_state.h               | 3 +++
 16 files changed, 20 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index c0b5eb31d05..2c0da016d08 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -264,6 +264,9 @@ The integer capabilities:
 * ``PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR``: Whether the linear minification and
   magnification filters are supported with half-precision floating-point
   textures.
+* ``PIPE_CAP_DEPTH_BOUNDS_TEST``: Whether bounds_test, bounds_min, and
+  bounds_max states of pipe_depth_stencil_alpha_state behave according
+  to the GL_EXT_depth_bounds_test specification.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 295ce7e8187..b55f5b36ca9 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -232,6 +232,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 7608ca5e1a9..19a94a8e019 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -246,6 +246,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 9f150bbd7b2..ab4d1377c9f 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -468,6 +468,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 539bb448bc6..14eeab03387 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -295,6 +295,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 73bcd5b0d3f..efa766d4a69 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -166,6 +166,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 1e198774593..2479cbd664e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -213,6 +213,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 0c25ec43120..54b469bc03e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -199,6 +199,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 74aae379c09..4ca0b268bde 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -194,6 +194,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
         case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
         case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index a0fe1d355c9..6ffe5615fbf 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -339,6 +339,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index d60c258709d..f0a377b78b0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -301,6 +301,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 49a21d9c06b..0bfd9c3578c 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -245,6 +245,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 7b70272a828..66c3deaa9e7 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -312,6 +312,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 72f08262b66..2dee1d40e5f 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -179,6 +179,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 83a3e2bb377..2ba56eac793 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -609,6 +609,7 @@ enum pipe_cap
    PIPE_CAP_MAX_SHADER_PATCH_VARYINGS,
    PIPE_CAP_TEXTURE_FLOAT_LINEAR,
    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR,
+   PIPE_CAP_DEPTH_BOUNDS_TEST,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index a233610ead6..1e493f47ccf 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -223,6 +223,9 @@ struct pipe_depth_state
    unsigned enabled:1;         /**< depth test enabled? */
    unsigned writemask:1;       /**< allow depth buffer writes? */
    unsigned func:3;            /**< depth test func (PIPE_FUNC_x) */
+   unsigned bounds_test:1;     /**< depth bounds test enabled? */
+   float bounds_min;           /**< minimum depth bound */
+   float bounds_max;           /**< maximum depth bound */
 };
 
 

From 36a6f848bb03828aa9c4dc28774acf09055f2831 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 02:18:43 +0200
Subject: [PATCH 1158/1208] st/mesa: add EXT_depth_bounds_test

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_atom_depth.c | 15 +++++++++++----
 src/mesa/state_tracker/st_extensions.c |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_depth.c b/src/mesa/state_tracker/st_atom_depth.c
index c4bca8d09b5..d9cc97029fb 100644
--- a/src/mesa/state_tracker/st_atom_depth.c
+++ b/src/mesa/state_tracker/st_atom_depth.c
@@ -105,10 +105,17 @@ update_depth_stencil_alpha(struct st_context *st)
    memset(dsa, 0, sizeof(*dsa));
    memset(&sr, 0, sizeof(sr));
 
-   if (ctx->Depth.Test && ctx->DrawBuffer->Visual.depthBits > 0) {
-      dsa->depth.enabled = 1;
-      dsa->depth.writemask = ctx->Depth.Mask;
-      dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func);
+   if (ctx->DrawBuffer->Visual.depthBits > 0) {
+      if (ctx->Depth.Test) {
+         dsa->depth.enabled = 1;
+         dsa->depth.writemask = ctx->Depth.Mask;
+         dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func);
+      }
+      if (ctx->Depth.BoundsTest) {
+         dsa->depth.bounds_test = 1;
+         dsa->depth.bounds_min = ctx->Depth.BoundsMin;
+         dsa->depth.bounds_max = ctx->Depth.BoundsMax;
+      }
    }
 
    if (ctx->Stencil.Enabled && ctx->DrawBuffer->Visual.stencilBits > 0) {
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 7864cf8cb42..17f572f80fb 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -491,6 +491,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
       { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
       { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
+      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
    };
 
    /* Required: render target and sampler support */

From 97f58fb59a45f04c9d03709063a081f572509f51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 02:23:21 +0200
Subject: [PATCH 1159/1208] radeonsi: add support for EXT_depth_bounds_test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 docs/relnotes/11.0.0.html               |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c  |  2 +-
 src/gallium/drivers/radeonsi/si_state.c | 10 ++++++----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
index b5077e7c6a0..ae3a2be2d36 100644
--- a/docs/relnotes/11.0.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -58,6 +58,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
+<li>GL_EXT_depth_bounds_test on radeonsi, nv30, nv50, nvc0</li>
 <li>GL_OES_texture_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_half_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f0a377b78b0..22efc6837eb 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -259,6 +259,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -301,7 +302,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 32609439659..b9f512d17b6 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -838,7 +838,8 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
 
 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
-		S_028800_ZFUNC(state->depth.func);
+		S_028800_ZFUNC(state->depth.func) |
+		S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
 
 	/* stencil */
 	if (state->stencil[0].enabled) {
@@ -867,9 +868,12 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
 		dsa->alpha_func = PIPE_FUNC_ALWAYS;
 	}
 
-	/* misc */
 	si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
+	if (state->depth.bounds_test) {
+		si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
+		si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
+	}
 
 	return dsa;
 }
@@ -3230,8 +3234,6 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, 0);
-	si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 0);
 	si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0);
 	si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
 	si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);

From 973988ab8dd4d04b925a5859d1da0801e858a6fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 13 Aug 2015 01:51:37 +0200
Subject: [PATCH 1160/1208] swrast: fix EXT_depth_bounds_test

zMin and zMax can't use _DepthMaxF, because the test is done in Z32_UNORM.

Probably a useless patch given how popular swrast is nowadays, but it helped
create and validate the piglit test.

v2: add an explicit cast to GLuint

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/swrast/s_depth.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index 134f897c039..ffadc05a732 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -419,8 +419,8 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span )
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    struct gl_renderbuffer *rb = fb->Attachment[BUFFER_DEPTH].Renderbuffer;
    GLubyte *zStart;
-   GLuint zMin = (GLuint) (ctx->Depth.BoundsMin * fb->_DepthMaxF + 0.5F);
-   GLuint zMax = (GLuint) (ctx->Depth.BoundsMax * fb->_DepthMaxF + 0.5F);
+   GLuint zMin = (GLuint)((double)ctx->Depth.BoundsMin * 0xffffffff);
+   GLuint zMax = (GLuint)((double)ctx->Depth.BoundsMax * 0xffffffff);
    GLubyte *mask = span->array->mask;
    const GLuint count = span->end;
    GLuint i;
@@ -444,6 +444,16 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span )
       zBufferVals = (const GLuint *) zStart;
    }
    else {
+      /* Round the bounds to the precision of the zbuffer. */
+      if (rb->Format == MESA_FORMAT_Z_UNORM16) {
+         zMin = (zMin & 0xffff0000) | (zMin >> 16);
+         zMax = (zMax & 0xffff0000) | (zMax >> 16);
+      } else {
+         /* 24 bits */
+         zMin = (zMin & 0xffffff00) | (zMin >> 24);
+         zMax = (zMax & 0xffffff00) | (zMax >> 24);
+      }
+
       /* unpack Z values into a temporary array */
       if (span->arrayMask & SPAN_XY) {
          get_z32_values(ctx, rb, count, span->array->x, span->array->y,

From 716a67da12be0656a6dae2a448175946aaf57377 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 25 Jun 2015 12:09:11 -0400
Subject: [PATCH 1161/1208] vl: add cap for stacking frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/include/pipe/p_video_enums.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index e28d57dd3b0..feaf1ee9c28 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -68,7 +68,8 @@ enum pipe_video_cap
    PIPE_VIDEO_CAP_PREFERS_INTERLACED = 5,
    PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE = 6,
    PIPE_VIDEO_CAP_SUPPORTS_INTERLACED = 7,
-   PIPE_VIDEO_CAP_MAX_LEVEL = 8
+   PIPE_VIDEO_CAP_MAX_LEVEL = 8,
+   PIPE_VIDEO_CAP_STACKED_FRAMES = 9
 };
 
 enum pipe_video_entrypoint

From facba49d839b01da139261e587a05c744cc9a1fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 28 Apr 2015 15:31:37 +0200
Subject: [PATCH 1162/1208] vl: add HEVC profiles and defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Christian KÃ¶nig <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/gallium/auxiliary/util/u_video.h     |   7 ++
 src/gallium/include/pipe/p_video_enums.h |  10 ++-
 src/gallium/include/pipe/p_video_state.h | 105 +++++++++++++++++++++++
 3 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_video.h b/src/gallium/auxiliary/util/u_video.h
index 99d0f6e775a..ddc00216105 100644
--- a/src/gallium/auxiliary/util/u_video.h
+++ b/src/gallium/auxiliary/util/u_video.h
@@ -68,6 +68,13 @@ u_reduce_video_profile(enum pipe_video_profile profile)
       case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444:
          return PIPE_VIDEO_FORMAT_MPEG4_AVC;
 
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_12:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_444:
+         return PIPE_VIDEO_FORMAT_HEVC;
+
       default:
          return PIPE_VIDEO_FORMAT_UNKNOWN;
    }
diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index feaf1ee9c28..9a20146f43e 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -34,7 +34,8 @@ enum pipe_video_format
    PIPE_VIDEO_FORMAT_MPEG12,   /**< MPEG1, MPEG2 */
    PIPE_VIDEO_FORMAT_MPEG4,    /**< DIVX, XVID */
    PIPE_VIDEO_FORMAT_VC1,      /**< WMV */
-   PIPE_VIDEO_FORMAT_MPEG4_AVC /**< H.264 */
+   PIPE_VIDEO_FORMAT_MPEG4_AVC,/**< H.264 */
+   PIPE_VIDEO_FORMAT_HEVC      /**< H.265 */
 };
 
 enum pipe_video_profile
@@ -54,7 +55,12 @@ enum pipe_video_profile
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH10,
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH422,
-   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_10,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_12,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_444
 };
 
 /* Video caps, can be different for each codec/profile */
diff --git a/src/gallium/include/pipe/p_video_state.h b/src/gallium/include/pipe/p_video_state.h
index 3713cd91b09..7d13151e643 100644
--- a/src/gallium/include/pipe/p_video_state.h
+++ b/src/gallium/include/pipe/p_video_state.h
@@ -376,6 +376,111 @@ struct pipe_h264_enc_picture_desc
    bool not_referenced;
 };
 
+struct pipe_h265_sps
+{
+   uint8_t chroma_format_idc;
+   uint8_t separate_colour_plane_flag;
+   uint32_t pic_width_in_luma_samples;
+   uint32_t pic_height_in_luma_samples;
+   uint8_t bit_depth_luma_minus8;
+   uint8_t bit_depth_chroma_minus8;
+   uint8_t log2_max_pic_order_cnt_lsb_minus4;
+   uint8_t sps_max_dec_pic_buffering_minus1;
+   uint8_t log2_min_luma_coding_block_size_minus3;
+   uint8_t log2_diff_max_min_luma_coding_block_size;
+   uint8_t log2_min_transform_block_size_minus2;
+   uint8_t log2_diff_max_min_transform_block_size;
+   uint8_t max_transform_hierarchy_depth_inter;
+   uint8_t max_transform_hierarchy_depth_intra;
+   uint8_t scaling_list_enabled_flag;
+   uint8_t ScalingList4x4[6][16];
+   uint8_t ScalingList8x8[6][64];
+   uint8_t ScalingList16x16[6][64];
+   uint8_t ScalingList32x32[2][64];
+   uint8_t ScalingListDCCoeff16x16[6];
+   uint8_t ScalingListDCCoeff32x32[2];
+   uint8_t amp_enabled_flag;
+   uint8_t sample_adaptive_offset_enabled_flag;
+   uint8_t pcm_enabled_flag;
+   uint8_t pcm_sample_bit_depth_luma_minus1;
+   uint8_t pcm_sample_bit_depth_chroma_minus1;
+   uint8_t log2_min_pcm_luma_coding_block_size_minus3;
+   uint8_t log2_diff_max_min_pcm_luma_coding_block_size;
+   uint8_t pcm_loop_filter_disabled_flag;
+   uint8_t num_short_term_ref_pic_sets;
+   uint8_t long_term_ref_pics_present_flag;
+   uint8_t num_long_term_ref_pics_sps;
+   uint8_t sps_temporal_mvp_enabled_flag;
+   uint8_t strong_intra_smoothing_enabled_flag;
+};
+
+struct pipe_h265_pps
+{
+   struct pipe_h265_sps *sps;
+
+   uint8_t dependent_slice_segments_enabled_flag;
+   uint8_t output_flag_present_flag;
+   uint8_t num_extra_slice_header_bits;
+   uint8_t sign_data_hiding_enabled_flag;
+   uint8_t cabac_init_present_flag;
+   uint8_t num_ref_idx_l0_default_active_minus1;
+   uint8_t num_ref_idx_l1_default_active_minus1;
+   int8_t init_qp_minus26;
+   uint8_t constrained_intra_pred_flag;
+   uint8_t transform_skip_enabled_flag;
+   uint8_t cu_qp_delta_enabled_flag;
+   uint8_t diff_cu_qp_delta_depth;
+   int8_t pps_cb_qp_offset;
+   int8_t pps_cr_qp_offset;
+   uint8_t pps_slice_chroma_qp_offsets_present_flag;
+   uint8_t weighted_pred_flag;
+   uint8_t weighted_bipred_flag;
+   uint8_t transquant_bypass_enabled_flag;
+   uint8_t tiles_enabled_flag;
+   uint8_t entropy_coding_sync_enabled_flag;
+   uint8_t num_tile_columns_minus1;
+   uint8_t num_tile_rows_minus1;
+   uint8_t uniform_spacing_flag;
+   uint16_t column_width_minus1[20];
+   uint16_t row_height_minus1[22];
+   uint8_t loop_filter_across_tiles_enabled_flag;
+   uint8_t pps_loop_filter_across_slices_enabled_flag;
+   uint8_t deblocking_filter_control_present_flag;
+   uint8_t deblocking_filter_override_enabled_flag;
+   uint8_t pps_deblocking_filter_disabled_flag;
+   int8_t pps_beta_offset_div2;
+   int8_t pps_tc_offset_div2;
+   uint8_t lists_modification_present_flag;
+   uint8_t log2_parallel_merge_level_minus2;
+   uint8_t slice_segment_header_extension_present_flag;
+};
+
+struct pipe_h265_picture_desc
+{
+   struct pipe_picture_desc base;
+
+   struct pipe_h265_pps *pps;
+
+   uint8_t IDRPicFlag;
+   uint8_t RAPPicFlag;
+   uint8_t CurrRpsIdx;
+   uint32_t NumPocTotalCurr;
+   uint32_t NumDeltaPocsOfRefRpsIdx;
+   uint32_t NumShortTermPictureSliceHeaderBits;
+   uint32_t NumLongTermPictureSliceHeaderBits;
+
+   int32_t CurrPicOrderCntVal;
+   struct pipe_video_buffer *ref[16];
+   int32_t PicOrderCntVal[16];
+   uint8_t IsLongTerm[16];
+   uint8_t NumPocStCurrBefore;
+   uint8_t NumPocStCurrAfter;
+   uint8_t NumPocLtCurr;
+   uint8_t RefPicSetStCurrBefore[8];
+   uint8_t RefPicSetStCurrAfter[8];
+   uint8_t RefPicSetLtCurr[8];
+};
+
 #ifdef __cplusplus
 }
 #endif

From 0729c251bbff8375ab5d24b80cfc2f8becd6afff Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Fri, 29 May 2015 14:50:44 -0400
Subject: [PATCH 1163/1208] st/omx/enc: flush after eos handling v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2 (chk): reorder the flush

Signed-off-by: Leo Liu <leo.liu@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/state_trackers/omx/vid_enc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index ae1a98f5be3..bb2c80e7b7f 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -1127,6 +1127,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       if (buf->nFlags & OMX_BUFFERFLAG_EOS) {
          buf->nFilledLen = buf->nAllocLen;
          enc_ClearBframes(port, inp);
+         priv->codec->flush(priv->codec);
       }
       return base_port_SendBufferFunction(port, buf);
    }

From 5581f9f28aeaef63bc1495febb402435ddfde556 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 25 Jun 2015 13:19:56 -0400
Subject: [PATCH 1164/1208] st/omx/enc: stack frame tasks for the gathering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Put tasks to the FIFO queue for results

Signed-off-by: Leo Liu <leo.liu@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/state_trackers/omx/vid_enc.c | 20 +++++++++++++++++++-
 src/gallium/state_trackers/omx/vid_enc.h |  4 +++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index bb2c80e7b7f..2bd0194189f 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -180,6 +180,11 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
       return OMX_ErrorBadParameter;
  
+   priv->stacked_frames_num = screen->get_video_param(screen,
+                                PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
+                                PIPE_VIDEO_ENTRYPOINT_ENCODE,
+                                PIPE_VIDEO_CAP_STACKED_FRAMES);
+
    priv->s_pipe = screen->context_create(screen, priv->screen);
    if (!priv->s_pipe)
       return OMX_ErrorInsufficientResources;
@@ -259,6 +264,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    LIST_INITHEAD(&priv->free_tasks);
    LIST_INITHEAD(&priv->used_tasks);
    LIST_INITHEAD(&priv->b_frames);
+   LIST_INITHEAD(&priv->stacked_tasks);
 
    return OMX_ErrorNone;
 }
@@ -271,6 +277,7 @@ static OMX_ERRORTYPE vid_enc_Destructor(OMX_COMPONENTTYPE *comp)
    enc_ReleaseTasks(&priv->free_tasks);
    enc_ReleaseTasks(&priv->used_tasks);
    enc_ReleaseTasks(&priv->b_frames);
+   enc_ReleaseTasks(&priv->stacked_tasks);
 
    if (priv->ports) {
       for (i = 0; i < priv->sPortTypesParam[OMX_PortDomainVideo].nPorts; ++i) {
@@ -1116,6 +1123,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
    struct input_buf_private *inp = buf->pInputPortPrivate;
    enum pipe_h264_enc_picture_type picture_type;
    struct encode_task *task;
+   unsigned stacked_num = 0;
    OMX_ERRORTYPE err;
 
    enc_MoveTasks(&inp->tasks, &priv->free_tasks);
@@ -1127,6 +1135,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       if (buf->nFlags & OMX_BUFFERFLAG_EOS) {
          buf->nFilledLen = buf->nAllocLen;
          enc_ClearBframes(port, inp);
+         enc_MoveTasks(&priv->stacked_tasks, &inp->tasks);
          priv->codec->flush(priv->codec);
       }
       return base_port_SendBufferFunction(port, buf);
@@ -1167,7 +1176,16 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       /* handle I or P frame */
       priv->ref_idx_l0 = priv->ref_idx_l1;
       enc_HandleTask(port, task, picture_type);
-      LIST_ADDTAIL(&task->list, &inp->tasks);
+      LIST_ADDTAIL(&task->list, &priv->stacked_tasks);
+      LIST_FOR_EACH_ENTRY(task, &priv->stacked_tasks, list) {
+         ++stacked_num;
+      }
+      if (stacked_num == priv->stacked_frames_num) {
+         struct encode_task *t;
+         t = LIST_ENTRY(struct encode_task, priv->stacked_tasks.next, list);
+         LIST_DEL(&t->list);
+         LIST_ADDTAIL(&t->list, &inp->tasks);
+      }
       priv->ref_idx_l1 = priv->frame_num++;
 
       /* handle B frames */
diff --git a/src/gallium/state_trackers/omx/vid_enc.h b/src/gallium/state_trackers/omx/vid_enc.h
index c8d192b9c60..a83374450b5 100644
--- a/src/gallium/state_trackers/omx/vid_enc.h
+++ b/src/gallium/state_trackers/omx/vid_enc.h
@@ -73,6 +73,7 @@ DERIVEDCLASS(vid_enc_PrivateType, omx_base_filter_PrivateType)
 	struct list_head free_tasks; \
 	struct list_head used_tasks; \
 	struct list_head b_frames; \
+	struct list_head stacked_tasks; \
 	OMX_U32 frame_rate; \
 	OMX_U32 frame_num; \
 	OMX_U32 pic_order_cnt; \
@@ -86,7 +87,8 @@ DERIVEDCLASS(vid_enc_PrivateType, omx_base_filter_PrivateType)
 	struct vl_compositor_state cstate; \
 	struct pipe_video_buffer *scale_buffer[OMX_VID_ENC_NUM_SCALING_BUFFERS]; \
 	OMX_CONFIG_SCALEFACTORTYPE scale; \
-	OMX_U32 current_scale_buffer;
+	OMX_U32 current_scale_buffer; \
+	OMX_U32 stacked_frames_num;
 ENDCLASS(vid_enc_PrivateType)
 
 OMX_ERRORTYPE vid_enc_LoaderComponent(stLoaderComponentType *comp);

From 5609a6986f3eb3c452d66d373b6081df5c6fb34c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 29 Apr 2015 15:35:02 +0200
Subject: [PATCH 1165/1208] st/vdpau: add HEVC support v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix return code

Signed-off-by: Christian KÃ¶nig <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 configure.ac                                  |   2 +-
 src/gallium/state_trackers/vdpau/decode.c     | 122 +++++++++++++++++-
 .../state_trackers/vdpau/vdpau_private.h      |  20 +++
 3 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 4e751e36829..7ddb47a2bdc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,7 +81,7 @@ LIBUDEV_REQUIRED=151
 GLPROTO_REQUIRED=1.4.14
 LIBOMXIL_BELLAGIO_REQUIRED=0.0
 LIBVA_REQUIRED=0.35.0
-VDPAU_REQUIRED=0.4.1
+VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.2.0
 XCB_REQUIRED=1.9.3
 XCBDRI2_REQUIRED=1.8
diff --git a/src/gallium/state_trackers/vdpau/decode.c b/src/gallium/state_trackers/vdpau/decode.c
index 0634ba72fda..3233799d650 100644
--- a/src/gallium/state_trackers/vdpau/decode.c
+++ b/src/gallium/state_trackers/vdpau/decode.c
@@ -413,6 +413,115 @@ vlVdpDecoderRenderH264(struct pipe_h264_picture_desc *picture,
    return VDP_STATUS_OK;
 }
 
+static VdpStatus
+vlVdpDecoderRenderH265(struct pipe_h265_picture_desc *picture,
+                       VdpPictureInfoHEVC *picture_info)
+{
+   unsigned i;
+
+   picture->pps->sps->chroma_format_idc = picture_info->chroma_format_idc;
+   picture->pps->sps->separate_colour_plane_flag = picture_info->separate_colour_plane_flag;
+   picture->pps->sps->pic_width_in_luma_samples = picture_info->pic_width_in_luma_samples;
+   picture->pps->sps->pic_height_in_luma_samples = picture_info->pic_height_in_luma_samples;
+   picture->pps->sps->bit_depth_luma_minus8 = picture_info->bit_depth_luma_minus8;
+   picture->pps->sps->bit_depth_chroma_minus8 = picture_info->bit_depth_chroma_minus8;
+   picture->pps->sps->log2_max_pic_order_cnt_lsb_minus4 = picture_info->log2_max_pic_order_cnt_lsb_minus4;
+   picture->pps->sps->sps_max_dec_pic_buffering_minus1 = picture_info->sps_max_dec_pic_buffering_minus1;
+   picture->pps->sps->log2_min_luma_coding_block_size_minus3 = picture_info->log2_min_luma_coding_block_size_minus3;
+   picture->pps->sps->log2_diff_max_min_luma_coding_block_size = picture_info->log2_diff_max_min_luma_coding_block_size;
+   picture->pps->sps->log2_min_transform_block_size_minus2 = picture_info->log2_min_transform_block_size_minus2;
+   picture->pps->sps->log2_diff_max_min_transform_block_size = picture_info->log2_diff_max_min_transform_block_size;
+   picture->pps->sps->max_transform_hierarchy_depth_inter = picture_info->max_transform_hierarchy_depth_inter;
+   picture->pps->sps->max_transform_hierarchy_depth_intra = picture_info->max_transform_hierarchy_depth_intra;
+   picture->pps->sps->scaling_list_enabled_flag = picture_info->scaling_list_enabled_flag;
+   memcpy(picture->pps->sps->ScalingList4x4, picture_info->ScalingList4x4, 6*16);
+   memcpy(picture->pps->sps->ScalingList8x8, picture_info->ScalingList8x8, 6*64);
+   memcpy(picture->pps->sps->ScalingList16x16, picture_info->ScalingList16x16, 6*64);
+   memcpy(picture->pps->sps->ScalingList32x32, picture_info->ScalingList32x32, 2*64);
+   memcpy(picture->pps->sps->ScalingListDCCoeff16x16, picture_info->ScalingListDCCoeff16x16, 6);
+   memcpy(picture->pps->sps->ScalingListDCCoeff32x32, picture_info->ScalingListDCCoeff32x32, 2);
+   picture->pps->sps->amp_enabled_flag = picture_info->amp_enabled_flag;
+   picture->pps->sps->sample_adaptive_offset_enabled_flag = picture_info->sample_adaptive_offset_enabled_flag;
+   picture->pps->sps->pcm_enabled_flag = picture_info->pcm_enabled_flag;
+   picture->pps->sps->pcm_sample_bit_depth_luma_minus1 = picture_info->pcm_sample_bit_depth_luma_minus1;
+   picture->pps->sps->pcm_sample_bit_depth_chroma_minus1 = picture_info->pcm_sample_bit_depth_chroma_minus1;
+   picture->pps->sps->log2_min_pcm_luma_coding_block_size_minus3 = picture_info->log2_min_pcm_luma_coding_block_size_minus3;
+   picture->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size = picture_info->log2_diff_max_min_pcm_luma_coding_block_size;
+   picture->pps->sps->pcm_loop_filter_disabled_flag = picture_info->pcm_loop_filter_disabled_flag;
+   picture->pps->sps->num_short_term_ref_pic_sets = picture_info->num_short_term_ref_pic_sets;
+   picture->pps->sps->long_term_ref_pics_present_flag = picture_info->long_term_ref_pics_present_flag;
+   picture->pps->sps->num_long_term_ref_pics_sps = picture_info->num_long_term_ref_pics_sps;
+   picture->pps->sps->sps_temporal_mvp_enabled_flag = picture_info->sps_temporal_mvp_enabled_flag;
+   picture->pps->sps->strong_intra_smoothing_enabled_flag = picture_info->strong_intra_smoothing_enabled_flag;
+
+   picture->pps->dependent_slice_segments_enabled_flag = picture_info->dependent_slice_segments_enabled_flag;
+   picture->pps->output_flag_present_flag = picture_info->output_flag_present_flag;
+   picture->pps->num_extra_slice_header_bits = picture_info->num_extra_slice_header_bits;
+   picture->pps->sign_data_hiding_enabled_flag = picture_info->sign_data_hiding_enabled_flag;
+   picture->pps->cabac_init_present_flag = picture_info->cabac_init_present_flag;
+   picture->pps->num_ref_idx_l0_default_active_minus1 = picture_info->num_ref_idx_l0_default_active_minus1;
+   picture->pps->num_ref_idx_l1_default_active_minus1 = picture_info->num_ref_idx_l1_default_active_minus1;
+   picture->pps->init_qp_minus26 = picture_info->init_qp_minus26;
+   picture->pps->constrained_intra_pred_flag = picture_info->constrained_intra_pred_flag;
+   picture->pps->transform_skip_enabled_flag = picture_info->transform_skip_enabled_flag;
+   picture->pps->cu_qp_delta_enabled_flag = picture_info->cu_qp_delta_enabled_flag;
+   picture->pps->diff_cu_qp_delta_depth = picture_info->diff_cu_qp_delta_depth;
+   picture->pps->pps_cb_qp_offset = picture_info->pps_cb_qp_offset;
+   picture->pps->pps_cr_qp_offset = picture_info->pps_cr_qp_offset;
+   picture->pps->pps_slice_chroma_qp_offsets_present_flag = picture_info->pps_slice_chroma_qp_offsets_present_flag;
+   picture->pps->weighted_pred_flag = picture_info->weighted_pred_flag;
+   picture->pps->weighted_bipred_flag = picture_info->weighted_bipred_flag;
+   picture->pps->transquant_bypass_enabled_flag = picture_info->transquant_bypass_enabled_flag;
+   picture->pps->tiles_enabled_flag = picture_info->tiles_enabled_flag;
+   picture->pps->entropy_coding_sync_enabled_flag = picture_info->entropy_coding_sync_enabled_flag;
+   picture->pps->num_tile_columns_minus1 = picture_info->num_tile_columns_minus1;
+   picture->pps->num_tile_rows_minus1 = picture_info->num_tile_rows_minus1;
+   picture->pps->uniform_spacing_flag = picture_info->uniform_spacing_flag;
+   memcpy(picture->pps->column_width_minus1, picture_info->column_width_minus1, 20 * 2);
+   memcpy(picture->pps->row_height_minus1, picture_info->row_height_minus1, 22 * 2);
+   picture->pps->loop_filter_across_tiles_enabled_flag = picture_info->loop_filter_across_tiles_enabled_flag;
+   picture->pps->pps_loop_filter_across_slices_enabled_flag = picture_info->pps_loop_filter_across_slices_enabled_flag;
+   picture->pps->deblocking_filter_control_present_flag = picture_info->deblocking_filter_control_present_flag;
+   picture->pps->deblocking_filter_override_enabled_flag = picture_info->deblocking_filter_override_enabled_flag;
+   picture->pps->pps_deblocking_filter_disabled_flag = picture_info->pps_deblocking_filter_disabled_flag;
+   picture->pps->pps_beta_offset_div2 = picture_info->pps_beta_offset_div2;
+   picture->pps->pps_tc_offset_div2 = picture_info->pps_tc_offset_div2;
+   picture->pps->lists_modification_present_flag = picture_info->lists_modification_present_flag;
+   picture->pps->log2_parallel_merge_level_minus2 = picture_info->log2_parallel_merge_level_minus2;
+   picture->pps->slice_segment_header_extension_present_flag = picture_info->slice_segment_header_extension_present_flag;
+
+   picture->IDRPicFlag = picture_info->IDRPicFlag;
+   picture->RAPPicFlag = picture_info->RAPPicFlag;
+   picture->CurrRpsIdx = picture_info->CurrRpsIdx;
+   picture->NumPocTotalCurr = picture_info->NumPocTotalCurr;
+   picture->NumDeltaPocsOfRefRpsIdx = picture_info->NumDeltaPocsOfRefRpsIdx;
+   picture->NumShortTermPictureSliceHeaderBits = picture_info->NumShortTermPictureSliceHeaderBits;
+   picture->NumLongTermPictureSliceHeaderBits = picture_info->NumLongTermPictureSliceHeaderBits;
+   picture->CurrPicOrderCntVal = picture_info->CurrPicOrderCntVal;
+
+   for (i = 0; i < 16; ++i) {
+      VdpStatus ret = vlVdpGetReferenceFrame
+      (
+         picture_info->RefPics[i],
+         &picture->ref[i]
+      );
+      if (ret != VDP_STATUS_OK)
+         return ret;
+
+      picture->PicOrderCntVal[i] = picture_info->PicOrderCntVal[i];
+      picture->IsLongTerm[i] = picture_info->IsLongTerm[i];
+   }
+
+   picture->NumPocStCurrBefore = picture_info->NumPocStCurrBefore;
+   picture->NumPocStCurrAfter = picture_info->NumPocStCurrAfter;
+   picture->NumPocLtCurr = picture_info->NumPocLtCurr;
+   memcpy(picture->RefPicSetStCurrBefore, picture_info->RefPicSetStCurrBefore, 8);
+   memcpy(picture->RefPicSetStCurrAfter, picture_info->RefPicSetStCurrAfter, 8);
+   memcpy(picture->RefPicSetLtCurr, picture_info->RefPicSetLtCurr, 8);
+
+   return VDP_STATUS_OK;
+}
+
 static void
 vlVdpDecoderFixVC1Startcode(uint32_t *num_buffers, const void *buffers[], unsigned sizes[])
 {
@@ -461,14 +570,17 @@ vlVdpDecoderRender(VdpDecoder decoder,
    struct pipe_video_codec *dec;
    bool buffer_support[2];
    unsigned i;
-   struct pipe_h264_sps sps = {};
-   struct pipe_h264_pps pps = { &sps };
+   struct pipe_h264_sps sps_h264 = {};
+   struct pipe_h264_pps pps_h264 = { &sps_h264 };
+   struct pipe_h265_sps sps_h265 = {};
+   struct pipe_h265_pps pps_h265 = { &sps_h265 };
    union {
       struct pipe_picture_desc base;
       struct pipe_mpeg12_picture_desc mpeg12;
       struct pipe_mpeg4_picture_desc mpeg4;
       struct pipe_vc1_picture_desc vc1;
       struct pipe_h264_picture_desc h264;
+      struct pipe_h265_picture_desc h265;
    } desc;
 
    if (!(picture_info && bitstream_buffers))
@@ -547,9 +659,13 @@ vlVdpDecoderRender(VdpDecoder decoder,
       ret = vlVdpDecoderRenderVC1(&desc.vc1, (VdpPictureInfoVC1 *)picture_info);
       break;
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-      desc.h264.pps = &pps;
+      desc.h264.pps = &pps_h264;
       ret = vlVdpDecoderRenderH264(&desc.h264, (VdpPictureInfoH264 *)picture_info);
       break;
+   case PIPE_VIDEO_FORMAT_HEVC:
+      desc.h265.pps = &pps_h265;
+      ret = vlVdpDecoderRenderH265(&desc.h265, (VdpPictureInfoHEVC *)picture_info);
+      break;
    default:
       return VDP_STATUS_INVALID_DECODER_PROFILE;
    }
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index e14ce041947..27ac44cd9c1 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -261,6 +261,16 @@ ProfileToPipe(VdpDecoderProfile vdpau_profile)
          return PIPE_VIDEO_PROFILE_VC1_MAIN;
       case VDP_DECODER_PROFILE_VC1_ADVANCED:
          return PIPE_VIDEO_PROFILE_VC1_ADVANCED;
+      case VDP_DECODER_PROFILE_HEVC_MAIN:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_10:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_10;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_STILL:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_12:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_12;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_444:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_444;
       default:
          return PIPE_VIDEO_PROFILE_UNKNOWN;
    }
@@ -292,6 +302,16 @@ PipeToProfile(enum pipe_video_profile p_profile)
          return VDP_DECODER_PROFILE_VC1_MAIN;
       case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
          return VDP_DECODER_PROFILE_VC1_ADVANCED;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+         return VDP_DECODER_PROFILE_HEVC_MAIN;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_10;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_STILL;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_12:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_12;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_444:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_444;
       default:
          assert(0);
          return -1;

From 2eb067db0febcd71b4182153155e3e43f215624c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 22:43:23 +0200
Subject: [PATCH 1166/1208] winsys/amdgpu: add a new winsys for the new kernel
 driver

v2: - lots of changes according to Emil Velikov's comments
    - implemented radeon_winsys::read_registers

v3: - a lot of new work, many of them adapt to libdrm interface changes
Squashed patches:
winsys/amdgpu: implement radeon_winsys context support
winsys/amdgpu: add reference counting for contexts
winsys/amdgpu: add userptr support
winsys/amdgpu: allocate IBs like normal buffers
winsys/amdgpu: add IBs to the buffer list, adapt to interface changes
winsys/amdgpu: don't use KMS handles as reloc hash keys
winsys/amdgpu: sync buffer accesses to different rings
winsys/amdgpu: use dependencies instead of waiting for last fence v2
gallium/radeon: unify buffer_wait and buffer_is_busy in the winsys interface (amdgpu part)
winsys/amdgpu: track fences per ring and be thread-safe
winsys/amdgpu: simplify waiting on a variable in amdgpu_fence_wait
gallium/radeon: allow the winsys to choose the IB size (amdgpu part)
winsys/amdgpu: switch to new amdgpu_cs_query_fence_status interface
winsys/amdgpu: handle fence and dependencies merge
winsys/amdgpu follow libdrm change to move user fence into UMD
winsys/amdgpu: use amdgpu_bo_va_op for va map/unmap v2
winsys/amdgpu: use the new tiling flags
winsys/amdgpu: switch to new GTT_USWC definition
winsys/amdgpu: expose amdgpu_cs_query_reset_state to drivers
winsys/amdgpu: fix valgrind warnings
winsys/amdgpu: don't use VRAM with APUs that don't have much of it
winsys/amdgpu: require LLVM 3.6.1 for VI because of bug fixes there
winsys/amdgpu: remove amdgpu_winsys::num_cpus
winsys/amdgpu: align BO size to page size
winsys/amdgpu: reduce BO cache timeout
winsys/amdgpu: remove useless flushing and waiting in amdgpu_bo_set_tiling
winsys/amdgpu: use amdgpu_device_handle as a unique device ID instead of fd
winsys/amdgpu: use safer access to amdgpu_fence_wait::signalled
winsys/amdgpu: allow maximum IB size of 4 MB
winsys/amdgpu: add ip_instance into amdgpu_fence
gallium/radeon: add RING_COMPUTE instead of RADEON_FLUSH_COMPUTE
winsys/amdgpu: set the ring type at CS initilization
winsys/amdgpu: query the GART page size from the kernel
winsys/amdgpu: correctly wait for shared buffers to become idle
winsys/amdgpu: set the amdgpu_cs_fence structure only once at fence creation
winsys/amdgpu: add a specific error message for cs_submit -> -ENOMEM
winsys/amdgpu: check num_active_ioctls before calling amdgpu_bo_wait_for_idle
winsys/amdgpu: clear user fence BO after allocating it
winsys/amdgpu: fix user fences
winsys/amdgpu: make amdgpu_winsys_create public
winsys/amdgpu: remove thread offloading
winsys/amdgpu: flatten the amdgpu_cs_context structure and simplify more

v4: require libdrm 2.4.63
---
 configure.ac                                  |   3 +
 src/gallium/Android.mk                        |   1 +
 src/gallium/Makefile.am                       |   1 +
 .../target-helpers/inline_drm_helper.h        |   8 +-
 src/gallium/drivers/radeon/radeon_winsys.h    |  13 +-
 src/gallium/drivers/radeonsi/Automake.inc     |   6 +-
 src/gallium/targets/dri-vdpau.dyn             |   1 +
 src/gallium/targets/dri/dri.sym               |   1 +
 src/gallium/targets/pipe-loader/Makefile.am   |   4 +-
 .../targets/pipe-loader/pipe_radeonsi.c       |   8 +-
 src/gallium/targets/vdpau/vdpau.sym           |   1 +
 src/gallium/winsys/amdgpu/drm/Android.mk      |  37 +
 src/gallium/winsys/amdgpu/drm/Makefile.am     |  12 +
 .../winsys/amdgpu/drm/Makefile.sources        |   8 +
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     | 778 ++++++++++++++++++
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h     |  80 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 704 ++++++++++++++++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h     | 162 ++++
 src/gallium/winsys/amdgpu/drm/amdgpu_public.h |  40 +
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 448 ++++++++++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |  71 ++
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |   3 +-
 22 files changed, 2382 insertions(+), 8 deletions(-)
 create mode 100644 src/gallium/winsys/amdgpu/drm/Android.mk
 create mode 100644 src/gallium/winsys/amdgpu/drm/Makefile.am
 create mode 100644 src/gallium/winsys/amdgpu/drm/Makefile.sources
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_public.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h

diff --git a/configure.ac b/configure.ac
index 7ddb47a2bdc..4b762467756 100644
--- a/configure.ac
+++ b/configure.ac
@@ -70,6 +70,7 @@ AC_SUBST([OPENCL_VERSION])
 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.60
 LIBDRM_RADEON_REQUIRED=2.4.56
+LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED=2.4.62
@@ -2105,6 +2106,7 @@ if test -n "$with_gallium_drivers"; then
         xradeonsi)
             HAVE_GALLIUM_RADEONSI=yes
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
+            PKG_CHECK_MODULES([AMDGPU], [libdrm_amdgpu >= $LIBDRM_AMDGPU_REQUIRED])
             gallium_require_drm "radeonsi"
             gallium_require_drm_loader
             radeon_llvm_check "radeonsi"
@@ -2357,6 +2359,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/winsys/intel/drm/Makefile
 		src/gallium/winsys/nouveau/drm/Makefile
 		src/gallium/winsys/radeon/drm/Makefile
+		src/gallium/winsys/amdgpu/drm/Makefile
 		src/gallium/winsys/svga/drm/Makefile
 		src/gallium/winsys/sw/dri/Makefile
 		src/gallium/winsys/sw/kms-dri/Makefile
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index 0703148af08..39e064e9538 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -72,6 +72,7 @@ SUBDIRS += drivers/r600
 endif
 ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/radeonsi
+SUBDIRS += winsys/amdgpu/drm
 endif
 endif
 endif
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index ede6e21233a..e2c1090aa26 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -58,6 +58,7 @@ endif
 ## radeonsi
 if HAVE_GALLIUM_RADEONSI
 SUBDIRS += drivers/radeonsi
+SUBDIRS += winsys/amdgpu/drm
 endif
 
 ## the radeon winsys - linked in by r300, r600 and radeonsi
diff --git a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
index d3c331d224d..08271a760f5 100644
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
@@ -42,6 +42,7 @@
 #if GALLIUM_RADEONSI
 #include "radeon/radeon_winsys.h"
 #include "radeon/drm/radeon_drm_public.h"
+#include "amdgpu/drm/amdgpu_public.h"
 #include "radeonsi/si_public.h"
 #endif
 
@@ -228,7 +229,12 @@ pipe_radeonsi_create_screen(int fd)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+   /* First, try amdgpu. */
+   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+
+   if (!rw)
+      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 #endif
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 93618ccbc1a..6fa9ea4fdf2 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -44,8 +44,7 @@
 
 #define RADEON_FLUSH_ASYNC		(1 << 0)
 #define RADEON_FLUSH_KEEP_TILING_FLAGS	(1 << 1) /* needs DRM 2.12.0 */
-#define RADEON_FLUSH_COMPUTE		(1 << 2)
-#define RADEON_FLUSH_END_OF_FRAME       (1 << 3)
+#define RADEON_FLUSH_END_OF_FRAME       (1 << 2)
 
 /* Tiling flags. */
 enum radeon_bo_layout {
@@ -134,6 +133,9 @@ enum radeon_family {
     CHIP_KABINI,
     CHIP_HAWAII,
     CHIP_MULLINS,
+    CHIP_TONGA,
+    CHIP_ICELAND,
+    CHIP_CARRIZO,
     CHIP_LAST,
 };
 
@@ -148,10 +150,12 @@ enum chip_class {
     CAYMAN,
     SI,
     CIK,
+    VI,
 };
 
 enum ring_type {
     RING_GFX = 0,
+    RING_COMPUTE,
     RING_DMA,
     RING_UVD,
     RING_VCE,
@@ -517,6 +521,11 @@ struct radeon_winsys {
      */
     void (*ctx_destroy)(struct radeon_winsys_ctx *ctx);
 
+    /**
+     * Query a GPU reset status.
+     */
+    enum pipe_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *ctx);
+
     /**
      * Create a command stream.
      *
diff --git a/src/gallium/drivers/radeonsi/Automake.inc b/src/gallium/drivers/radeonsi/Automake.inc
index 8686fffd71c..5a9dcfd9fd6 100644
--- a/src/gallium/drivers/radeonsi/Automake.inc
+++ b/src/gallium/drivers/radeonsi/Automake.inc
@@ -5,10 +5,12 @@ TARGET_CPPFLAGS += -DGALLIUM_RADEONSI
 TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \
 	$(RADEON_LIBS) \
-	$(LIBDRM_LIBS)
+	$(LIBDRM_LIBS) \
+	$(AMDGPU_LIBS)
 
 TARGET_RADEON_WINSYS = \
-	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la
+	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \
+	$(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la
 
 TARGET_RADEON_COMMON = \
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la
diff --git a/src/gallium/targets/dri-vdpau.dyn b/src/gallium/targets/dri-vdpau.dyn
index e5923a23b39..a7919f7d3ba 100644
--- a/src/gallium/targets/dri-vdpau.dyn
+++ b/src/gallium/targets/dri-vdpau.dyn
@@ -1,4 +1,5 @@
 {
 	nouveau_drm_screen_create;
 	radeon_drm_winsys_create;
+	amdgpu_winsys_create;
 };
diff --git a/src/gallium/targets/dri/dri.sym b/src/gallium/targets/dri/dri.sym
index 49a2cc9fcf2..8e26fb960b7 100644
--- a/src/gallium/targets/dri/dri.sym
+++ b/src/gallium/targets/dri/dri.sym
@@ -4,6 +4,7 @@
 		__driDriverGetExtensions*;
 		nouveau_drm_screen_create;
 		radeon_drm_winsys_create;
+		amdgpu_winsys_create;
 	local:
 		*;
 };
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index e4048b58605..4d9f7be2ec9 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -155,10 +155,12 @@ nodist_EXTRA_pipe_radeonsi_la_SOURCES = dummy.cpp
 pipe_radeonsi_la_LIBADD = \
 	$(PIPE_LIBS) \
 	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \
+	$(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la \
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la \
 	$(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \
 	$(LIBDRM_LIBS) \
-	$(RADEON_LIBS)
+	$(RADEON_LIBS) \
+	$(AMDGPU_LIBS)
 
 endif
 
diff --git a/src/gallium/targets/pipe-loader/pipe_radeonsi.c b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
index 5457b5b5e32..31077af6a04 100644
--- a/src/gallium/targets/pipe-loader/pipe_radeonsi.c
+++ b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
@@ -2,6 +2,7 @@
 #include "target-helpers/inline_debug_helper.h"
 #include "radeon/drm/radeon_drm_public.h"
 #include "radeon/radeon_winsys.h"
+#include "amdgpu/drm/amdgpu_public.h"
 #include "radeonsi/si_public.h"
 
 static struct pipe_screen *
@@ -9,7 +10,12 @@ create_screen(int fd)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+   /* First, try amdgpu. */
+   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+
+   if (!rw)
+      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 
diff --git a/src/gallium/targets/vdpau/vdpau.sym b/src/gallium/targets/vdpau/vdpau.sym
index f184193c055..5e71c6285a6 100644
--- a/src/gallium/targets/vdpau/vdpau.sym
+++ b/src/gallium/targets/vdpau/vdpau.sym
@@ -3,6 +3,7 @@
                vdp_imp_device_create_x11;
                nouveau_drm_screen_create;
                radeon_drm_winsys_create;
+               amdgpu_winsys_create;
        local:
                *;
 };
diff --git a/src/gallium/winsys/amdgpu/drm/Android.mk b/src/gallium/winsys/amdgpu/drm/Android.mk
new file mode 100644
index 00000000000..7d507aa79c6
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/Android.mk
@@ -0,0 +1,37 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2011 Chia-I Wu <olvaffe@gmail.com>
+# Copyright (C) 2011 LunarG Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm libdrm_amdgpu
+LOCAL_MODULE := libmesa_winsys_amdgpu
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.am b/src/gallium/winsys/amdgpu/drm/Makefile.am
new file mode 100644
index 00000000000..80ecb7545b3
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.am
@@ -0,0 +1,12 @@
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CFLAGS = \
+	$(GALLIUM_WINSYS_CFLAGS) \
+	$(AMDGPU_CFLAGS)
+
+AM_CXXFLAGS = $(AM_CFLAGS)
+
+noinst_LTLIBRARIES = libamdgpuwinsys.la
+
+libamdgpuwinsys_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.sources b/src/gallium/winsys/amdgpu/drm/Makefile.sources
new file mode 100644
index 00000000000..0f550103f57
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.sources
@@ -0,0 +1,8 @@
+C_SOURCES := \
+	amdgpu_bo.c \
+	amdgpu_bo.h \
+	amdgpu_cs.c \
+	amdgpu_cs.h \
+	amdgpu_public.h \
+	amdgpu_winsys.c \
+	amdgpu_winsys.h
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
new file mode 100644
index 00000000000..1b9158302ce
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+
+#include "os/os_time.h"
+#include "state_tracker/drm_driver.h"
+#include <amdgpu_drm.h>
+#include <xf86drm.h>
+#include <stdio.h>
+
+static const struct pb_vtbl amdgpu_winsys_bo_vtbl;
+
+static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
+{
+   assert(bo->vtbl == &amdgpu_winsys_bo_vtbl);
+   return (struct amdgpu_winsys_bo *)bo;
+}
+
+struct amdgpu_bomgr {
+   struct pb_manager base;
+   struct amdgpu_winsys *rws;
+};
+
+static struct amdgpu_winsys *get_winsys(struct pb_manager *mgr)
+{
+   return ((struct amdgpu_bomgr*)mgr)->rws;
+}
+
+static struct amdgpu_winsys_bo *get_amdgpu_winsys_bo(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = NULL;
+
+   if (_buf->vtbl == &amdgpu_winsys_bo_vtbl) {
+      bo = amdgpu_winsys_bo(_buf);
+   } else {
+      struct pb_buffer *base_buf;
+      pb_size offset;
+      pb_get_base_buffer(_buf, &base_buf, &offset);
+
+      if (base_buf->vtbl == &amdgpu_winsys_bo_vtbl)
+         bo = amdgpu_winsys_bo(base_buf);
+   }
+
+   return bo;
+}
+
+static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
+                           enum radeon_bo_usage usage)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys *ws = bo->rws;
+   int i;
+
+   if (bo->is_shared) {
+      /* We can't use user fences for shared buffers, because user fences
+       * are local to this process only. If we want to wait for all buffer
+       * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
+       */
+      bool buffer_busy = true;
+      int r;
+
+      r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
+      if (r)
+         fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
+                 r);
+      return !buffer_busy;
+   }
+
+   if (timeout == 0) {
+      /* Timeout == 0 is quite simple. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++)
+         if (bo->fence[i]) {
+            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
+               /* Release the idle fence to avoid checking it again later. */
+               amdgpu_fence_reference(&bo->fence[i], NULL);
+            } else {
+               pipe_mutex_unlock(ws->bo_fence_lock);
+               return false;
+            }
+         }
+      pipe_mutex_unlock(ws->bo_fence_lock);
+      return true;
+
+   } else {
+      struct pipe_fence_handle *fence[RING_LAST] = {};
+      bool fence_idle[RING_LAST] = {};
+      bool buffer_idle = true;
+      int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+      /* Take references to all fences, so that we can wait for them
+       * without the lock. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++)
+         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      /* Now wait for the fences. */
+      for (i = 0; i < RING_LAST; i++) {
+         if (fence[i]) {
+            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
+               fence_idle[i] = true;
+            else
+               buffer_idle = false;
+         }
+      }
+
+      /* Release idle fences to avoid checking them again later. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++) {
+         if (fence[i] == bo->fence[i] && fence_idle[i])
+            amdgpu_fence_reference(&bo->fence[i], NULL);
+
+         amdgpu_fence_reference(&fence[i], NULL);
+      }
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      return buffer_idle;
+   }
+}
+
+static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
+      struct radeon_winsys_cs_handle *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
+}
+
+static void amdgpu_bo_destroy(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   int i;
+
+   amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
+   amdgpu_va_range_free(bo->va_handle);
+   amdgpu_bo_free(bo->bo);
+
+   for (i = 0; i < RING_LAST; i++)
+      amdgpu_fence_reference(&bo->fence[i], NULL);
+
+   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+      bo->rws->allocated_vram -= align(bo->base.size, bo->rws->gart_page_size);
+   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+      bo->rws->allocated_gtt -= align(bo->base.size, bo->rws->gart_page_size);
+   FREE(bo);
+}
+
+static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf,
+                           struct radeon_winsys_cs *rcs,
+                           enum pipe_transfer_usage usage)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
+   int r;
+   void *cpu = NULL;
+
+   /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
+      if (usage & PIPE_TRANSFER_DONTBLOCK) {
+         if (!(usage & PIPE_TRANSFER_WRITE)) {
+            /* Mapping for read.
+             *
+             * Since we are mapping for read, we don't need to wait
+             * if the GPU is using the buffer for read too
+             * (neither one is changing it).
+             *
+             * Only check whether the buffer is being used for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
+                                                               RADEON_USAGE_WRITE)) {
+               cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
+               return NULL;
+            }
+
+            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
+                                RADEON_USAGE_WRITE)) {
+               return NULL;
+            }
+         } else {
+            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
+               cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
+               return NULL;
+            }
+
+            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
+                                RADEON_USAGE_READWRITE)) {
+               return NULL;
+            }
+         }
+      } else {
+         uint64_t time = os_time_get_nano();
+
+         if (!(usage & PIPE_TRANSFER_WRITE)) {
+            /* Mapping for read.
+             *
+             * Since we are mapping for read, we don't need to wait
+             * if the GPU is using the buffer for read too
+             * (neither one is changing it).
+             *
+             * Only check whether the buffer is being used for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
+                                                               RADEON_USAGE_WRITE)) {
+               cs->flush_cs(cs->flush_data, 0, NULL);
+            }
+            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                           RADEON_USAGE_WRITE);
+         } else {
+            /* Mapping for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo))
+               cs->flush_cs(cs->flush_data, 0, NULL);
+
+            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                           RADEON_USAGE_READWRITE);
+         }
+
+         bo->rws->buffer_wait_time += os_time_get_nano() - time;
+      }
+   }
+
+   /* If the buffer is created from user memory, return the user pointer. */
+   if (bo->user_ptr)
+       return bo->user_ptr;
+
+   r = amdgpu_bo_cpu_map(bo->bo, &cpu);
+   return r ? NULL : cpu;
+}
+
+static void amdgpu_bo_unmap(struct radeon_winsys_cs_handle *buf)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+
+   amdgpu_bo_cpu_unmap(bo->bo);
+}
+
+static void amdgpu_bo_get_base_buffer(struct pb_buffer *buf,
+                                      struct pb_buffer **base_buf,
+                                      unsigned *offset)
+{
+   *base_buf = buf;
+   *offset = 0;
+}
+
+static enum pipe_error amdgpu_bo_validate(struct pb_buffer *_buf,
+                                          struct pb_validate *vl,
+                                          unsigned flags)
+{
+   /* Always pinned */
+   return PIPE_OK;
+}
+
+static void amdgpu_bo_fence(struct pb_buffer *buf,
+                            struct pipe_fence_handle *fence)
+{
+}
+
+static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
+   amdgpu_bo_destroy,
+   NULL, /* never called */
+   NULL, /* never called */
+   amdgpu_bo_validate,
+   amdgpu_bo_fence,
+   amdgpu_bo_get_base_buffer,
+};
+
+static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr,
+                                                pb_size size,
+                                                const struct pb_desc *desc)
+{
+   struct amdgpu_winsys *rws = get_winsys(_mgr);
+   struct amdgpu_bo_desc *rdesc = (struct amdgpu_bo_desc*)desc;
+   struct amdgpu_bo_alloc_request request = {0};
+   amdgpu_bo_handle buf_handle;
+   uint64_t va = 0;
+   struct amdgpu_winsys_bo *bo;
+   amdgpu_va_handle va_handle;
+   int r;
+
+   assert(rdesc->initial_domain & RADEON_DOMAIN_VRAM_GTT);
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo) {
+      return NULL;
+   }
+
+   request.alloc_size = size;
+   request.phys_alignment = desc->alignment;
+
+   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) {
+      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
+      if (rdesc->flags & RADEON_FLAG_CPU_ACCESS)
+         request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+   }
+   if (rdesc->initial_domain & RADEON_DOMAIN_GTT) {
+      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
+      if (rdesc->flags & RADEON_FLAG_GTT_WC)
+         request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+   }
+
+   r = amdgpu_bo_alloc(rws->dev, &request, &buf_handle);
+   if (r) {
+      fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
+      fprintf(stderr, "amdgpu:    size      : %d bytes\n", size);
+      fprintf(stderr, "amdgpu:    alignment : %d bytes\n", desc->alignment);
+      fprintf(stderr, "amdgpu:    domains   : %d\n", rdesc->initial_domain);
+      goto error_bo_alloc;
+   }
+
+   r = amdgpu_va_range_alloc(rws->dev, amdgpu_gpu_va_range_general,
+                             size, desc->alignment, 0, &va, &va_handle, 0);
+   if (r)
+      goto error_va_alloc;
+
+   r = amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP);
+   if (r)
+      goto error_va_map;
+
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = desc->alignment;
+   bo->base.usage = desc->usage;
+   bo->base.size = size;
+   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+   bo->rws = rws;
+   bo->bo = buf_handle;
+   bo->va = va;
+   bo->va_handle = va_handle;
+   bo->initial_domain = rdesc->initial_domain;
+   bo->unique_id = __sync_fetch_and_add(&rws->next_bo_unique_id, 1);
+
+   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM)
+      rws->allocated_vram += align(size, rws->gart_page_size);
+   else if (rdesc->initial_domain & RADEON_DOMAIN_GTT)
+      rws->allocated_gtt += align(size, rws->gart_page_size);
+
+   return &bo->base;
+
+error_va_map:
+   amdgpu_va_range_free(va_handle);
+
+error_va_alloc:
+   amdgpu_bo_free(buf_handle);
+
+error_bo_alloc:
+   FREE(bo);
+   return NULL;
+}
+
+static void amdgpu_bomgr_flush(struct pb_manager *mgr)
+{
+   /* NOP */
+}
+
+/* This is for the cache bufmgr. */
+static boolean amdgpu_bomgr_is_buffer_busy(struct pb_manager *_mgr,
+                                           struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+
+   if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
+      return TRUE;
+   }
+
+   if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
+      return TRUE;
+   }
+
+   return FALSE;
+}
+
+static void amdgpu_bomgr_destroy(struct pb_manager *mgr)
+{
+   FREE(mgr);
+}
+
+struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws)
+{
+   struct amdgpu_bomgr *mgr;
+
+   mgr = CALLOC_STRUCT(amdgpu_bomgr);
+   if (!mgr)
+      return NULL;
+
+   mgr->base.destroy = amdgpu_bomgr_destroy;
+   mgr->base.create_buffer = amdgpu_bomgr_create_bo;
+   mgr->base.flush = amdgpu_bomgr_flush;
+   mgr->base.is_buffer_busy = amdgpu_bomgr_is_buffer_busy;
+
+   mgr->rws = rws;
+   return &mgr->base;
+}
+
+static unsigned eg_tile_split(unsigned tile_split)
+{
+   switch (tile_split) {
+   case 0:     tile_split = 64;    break;
+   case 1:     tile_split = 128;   break;
+   case 2:     tile_split = 256;   break;
+   case 3:     tile_split = 512;   break;
+   default:
+   case 4:     tile_split = 1024;  break;
+   case 5:     tile_split = 2048;  break;
+   case 6:     tile_split = 4096;  break;
+   }
+   return tile_split;
+}
+
+static unsigned eg_tile_split_rev(unsigned eg_tile_split)
+{
+   switch (eg_tile_split) {
+   case 64:    return 0;
+   case 128:   return 1;
+   case 256:   return 2;
+   case 512:   return 3;
+   default:
+   case 1024:  return 4;
+   case 2048:  return 5;
+   case 4096:  return 6;
+   }
+}
+
+static void amdgpu_bo_get_tiling(struct pb_buffer *_buf,
+                                 enum radeon_bo_layout *microtiled,
+                                 enum radeon_bo_layout *macrotiled,
+                                 unsigned *bankw, unsigned *bankh,
+                                 unsigned *tile_split,
+                                 unsigned *stencil_tile_split,
+                                 unsigned *mtilea,
+                                 bool *scanout)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_bo_info info = {0};
+   uint32_t tiling_flags;
+   int r;
+
+   r = amdgpu_bo_query_info(bo->bo, &info);
+   if (r)
+      return;
+
+   tiling_flags = info.metadata.tiling_info;
+
+   *microtiled = RADEON_LAYOUT_LINEAR;
+   *macrotiled = RADEON_LAYOUT_LINEAR;
+
+   if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
+      *macrotiled = RADEON_LAYOUT_TILED;
+   else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
+      *microtiled = RADEON_LAYOUT_TILED;
+
+   if (bankw && tile_split && mtilea && tile_split) {
+      *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
+      *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
+      *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
+      *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
+   }
+   if (scanout)
+      *scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
+}
+
+static void amdgpu_bo_set_tiling(struct pb_buffer *_buf,
+                                 struct radeon_winsys_cs *rcs,
+                                 enum radeon_bo_layout microtiled,
+                                 enum radeon_bo_layout macrotiled,
+                                 unsigned bankw, unsigned bankh,
+                                 unsigned tile_split,
+                                 unsigned stencil_tile_split,
+                                 unsigned mtilea,
+                                 uint32_t pitch,
+                                 bool scanout)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_bo_metadata metadata = {0};
+   uint32_t tiling_flags = 0;
+
+   if (macrotiled == RADEON_LAYOUT_TILED)
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
+   else if (microtiled == RADEON_LAYOUT_TILED)
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
+   else
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
+
+   tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw));
+   tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh));
+   if (tile_split)
+      tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split));
+   tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea));
+
+   if (scanout)
+      tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
+   else
+      tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
+
+   metadata.tiling_info = tiling_flags;
+
+   amdgpu_bo_set_metadata(bo->bo, &metadata);
+}
+
+static struct radeon_winsys_cs_handle *amdgpu_get_cs_handle(struct pb_buffer *_buf)
+{
+   /* return a direct pointer to amdgpu_winsys_bo. */
+   return (struct radeon_winsys_cs_handle*)get_amdgpu_winsys_bo(_buf);
+}
+
+static struct pb_buffer *
+amdgpu_bo_create(struct radeon_winsys *rws,
+                 unsigned size,
+                 unsigned alignment,
+                 boolean use_reusable_pool,
+                 enum radeon_bo_domain domain,
+                 enum radeon_bo_flag flags)
+{
+   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+   struct amdgpu_bo_desc desc;
+   struct pb_manager *provider;
+   struct pb_buffer *buffer;
+
+   /* Don't use VRAM if the GPU doesn't have much. This is only the initial
+    * domain. The kernel is free to move the buffer if it wants to.
+    *
+    * 64MB means no VRAM by todays standards.
+    */
+   if (domain & RADEON_DOMAIN_VRAM && ws->info.vram_size <= 64*1024*1024) {
+      domain = RADEON_DOMAIN_GTT;
+      flags = RADEON_FLAG_GTT_WC;
+   }
+
+   memset(&desc, 0, sizeof(desc));
+   desc.base.alignment = alignment;
+
+   /* Align size to page size. This is the minimum alignment for normal
+    * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
+    * like constant/uniform buffers, can benefit from better and more reuse.
+    */
+   size = align(size, ws->gart_page_size);
+
+   /* Only set one usage bit each for domains and flags, or the cache manager
+    * might consider different sets of domains / flags compatible
+    */
+   if (domain == RADEON_DOMAIN_VRAM_GTT)
+      desc.base.usage = 1 << 2;
+   else
+      desc.base.usage = domain >> 1;
+   assert(flags < sizeof(desc.base.usage) * 8 - 3);
+   desc.base.usage |= 1 << (flags + 3);
+
+   desc.initial_domain = domain;
+   desc.flags = flags;
+
+   /* Assign a buffer manager. */
+   if (use_reusable_pool)
+      provider = ws->cman;
+   else
+      provider = ws->kman;
+
+   buffer = provider->create_buffer(provider, size, &desc.base);
+   if (!buffer)
+      return NULL;
+
+   return (struct pb_buffer*)buffer;
+}
+
+static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
+                                               struct winsys_handle *whandle,
+                                               unsigned *stride)
+{
+   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+   struct amdgpu_winsys_bo *bo;
+   enum amdgpu_bo_handle_type type;
+   struct amdgpu_bo_import_result result = {0};
+   uint64_t va;
+   amdgpu_va_handle va_handle;
+   struct amdgpu_bo_info info = {0};
+   enum radeon_bo_domain initial = 0;
+   int r;
+
+   /* Initialize the structure. */
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo) {
+      return NULL;
+   }
+
+   switch (whandle->type) {
+   case DRM_API_HANDLE_TYPE_SHARED:
+      type = amdgpu_bo_handle_type_gem_flink_name;
+      break;
+   case DRM_API_HANDLE_TYPE_FD:
+      type = amdgpu_bo_handle_type_dma_buf_fd;
+      break;
+   default:
+      return NULL;
+   }
+
+   r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
+   if (r)
+      goto error;
+
+   /* Get initial domains. */
+   r = amdgpu_bo_query_info(result.buf_handle, &info);
+   if (r)
+      goto error_query;
+
+   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                             result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
+   if (r)
+      goto error_query;
+
+   r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
+   if (r)
+      goto error_va_map;
+
+   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
+      initial |= RADEON_DOMAIN_VRAM;
+   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
+      initial |= RADEON_DOMAIN_GTT;
+
+
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = info.phys_alignment;
+   bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
+   bo->bo = result.buf_handle;
+   bo->base.size = result.alloc_size;
+   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+   bo->rws = ws;
+   bo->va = va;
+   bo->va_handle = va_handle;
+   bo->initial_domain = initial;
+   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+   bo->is_shared = true;
+
+   if (stride)
+      *stride = whandle->stride;
+
+   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+      ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
+   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+      ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+
+   return &bo->base;
+
+error_va_map:
+   amdgpu_va_range_free(va_handle);
+
+error_query:
+   amdgpu_bo_free(result.buf_handle);
+
+error:
+   FREE(bo);
+   return NULL;
+}
+
+static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
+                                    unsigned stride,
+                                    struct winsys_handle *whandle)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(buffer);
+   enum amdgpu_bo_handle_type type;
+   int r;
+
+   switch (whandle->type) {
+   case DRM_API_HANDLE_TYPE_SHARED:
+      type = amdgpu_bo_handle_type_gem_flink_name;
+      break;
+   case DRM_API_HANDLE_TYPE_FD:
+      type = amdgpu_bo_handle_type_dma_buf_fd;
+      break;
+   case DRM_API_HANDLE_TYPE_KMS:
+      type = amdgpu_bo_handle_type_kms;
+      break;
+   default:
+      return FALSE;
+   }
+
+   r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
+   if (r)
+      return FALSE;
+
+   whandle->stride = stride;
+   bo->is_shared = true;
+   return TRUE;
+}
+
+static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
+					    void *pointer, unsigned size)
+{
+    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+    amdgpu_bo_handle buf_handle;
+    struct amdgpu_winsys_bo *bo;
+    uint64_t va;
+    amdgpu_va_handle va_handle;
+
+    bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+    if (!bo)
+        return NULL;
+
+    if (amdgpu_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle))
+        goto error;
+
+    if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                              size, 1 << 12, 0, &va, &va_handle, 0))
+        goto error_va_alloc;
+
+    if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP))
+        goto error_va_map;
+
+    /* Initialize it. */
+    pipe_reference_init(&bo->base.reference, 1);
+    bo->bo = buf_handle;
+    bo->base.alignment = 0;
+    bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
+    bo->base.size = size;
+    bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+    bo->rws = ws;
+    bo->user_ptr = pointer;
+    bo->va = va;
+    bo->va_handle = va_handle;
+    bo->initial_domain = RADEON_DOMAIN_GTT;
+    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+
+    ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+
+    return (struct pb_buffer*)bo;
+
+error_va_map:
+    amdgpu_va_range_free(va_handle);
+
+error_va_alloc:
+    amdgpu_bo_free(buf_handle);
+
+error:
+    FREE(bo);
+    return NULL;
+}
+
+static uint64_t amdgpu_bo_get_va(struct radeon_winsys_cs_handle *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->va;
+}
+
+void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws)
+{
+   ws->base.buffer_get_cs_handle = amdgpu_get_cs_handle;
+   ws->base.buffer_set_tiling = amdgpu_bo_set_tiling;
+   ws->base.buffer_get_tiling = amdgpu_bo_get_tiling;
+   ws->base.buffer_map = amdgpu_bo_map;
+   ws->base.buffer_unmap = amdgpu_bo_unmap;
+   ws->base.buffer_wait = amdgpu_bo_wait;
+   ws->base.buffer_create = amdgpu_bo_create;
+   ws->base.buffer_from_handle = amdgpu_bo_from_handle;
+   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
+   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
+   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
+   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
new file mode 100644
index 00000000000..3739fd1366e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2008 Jérôme Glisse
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_BO_H
+#define AMDGPU_BO_H
+
+#include "amdgpu_winsys.h"
+#include "pipebuffer/pb_bufmgr.h"
+
+struct amdgpu_bo_desc {
+   struct pb_desc base;
+
+   enum radeon_bo_domain initial_domain;
+   unsigned flags;
+};
+
+struct amdgpu_winsys_bo {
+   struct pb_buffer base;
+
+   struct amdgpu_winsys *rws;
+   void *user_ptr; /* from buffer_from_ptr */
+
+   amdgpu_bo_handle bo;
+   uint32_t unique_id;
+   amdgpu_va_handle va_handle;
+   uint64_t va;
+   enum radeon_bo_domain initial_domain;
+
+   /* how many command streams is this bo referenced in? */
+   int num_cs_references;
+
+   /* whether buffer_get_handle or buffer_from_handle was called,
+    * it can only transition from false to true
+    */
+   volatile int is_shared; /* bool (int for atomicity) */
+
+   /* Fences for buffer synchronization. */
+   struct pipe_fence_handle *fence[RING_LAST];
+};
+
+struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws);
+void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws);
+
+static inline
+void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
+                                struct amdgpu_winsys_bo *src)
+{
+   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
+}
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
new file mode 100644
index 00000000000..0f42298c2ad
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright © 2008 Jérôme Glisse
+ * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+#include "os/os_time.h"
+#include <stdio.h>
+#include <amdgpu_drm.h>
+
+
+/* FENCES */
+
+static struct pipe_fence_handle *
+amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
+                    unsigned ip_instance, unsigned ring)
+{
+   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
+
+   fence->reference.count = 1;
+   fence->ctx = ctx;
+   fence->fence.context = ctx->ctx;
+   fence->fence.ip_type = ip_type;
+   fence->fence.ip_instance = ip_instance;
+   fence->fence.ring = ring;
+   p_atomic_inc(&ctx->refcount);
+   return (struct pipe_fence_handle *)fence;
+}
+
+static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
+				struct amdgpu_cs_request* request,
+				uint64_t *user_fence_cpu_address)
+{
+   struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+
+   rfence->fence.fence = request->seq_no;
+   rfence->user_fence_cpu_address = user_fence_cpu_address;
+}
+
+static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
+{
+   struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+
+   rfence->signalled = true;
+}
+
+bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
+                       bool absolute)
+{
+   struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+   uint32_t expired;
+   int64_t abs_timeout;
+   uint64_t *user_fence_cpu;
+   int r;
+
+   if (rfence->signalled)
+      return true;
+
+   if (absolute)
+      abs_timeout = timeout;
+   else
+      abs_timeout = os_time_get_absolute_timeout(timeout);
+
+   user_fence_cpu = rfence->user_fence_cpu_address;
+   if (user_fence_cpu && *user_fence_cpu >= rfence->fence.fence) {
+	rfence->signalled = true;
+	return true;
+   }
+   /* Now use the libdrm query. */
+   r = amdgpu_cs_query_fence_status(&rfence->fence,
+				    abs_timeout,
+				    AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
+				    &expired);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
+      return FALSE;
+   }
+
+   if (expired) {
+      /* This variable can only transition from false to true, so it doesn't
+       * matter if threads race for it. */
+      rfence->signalled = true;
+      return true;
+   }
+   return false;
+}
+
+static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
+                                          struct pipe_fence_handle *fence,
+                                          uint64_t timeout)
+{
+   return amdgpu_fence_wait(fence, timeout, false);
+}
+
+/* CONTEXTS */
+
+static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws)
+{
+   struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
+   int r;
+   struct amdgpu_bo_alloc_request alloc_buffer = {};
+   amdgpu_bo_handle buf_handle;
+
+   ctx->ws = amdgpu_winsys(ws);
+   ctx->refcount = 1;
+
+   r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
+      FREE(ctx);
+      return NULL;
+   }
+
+   alloc_buffer.alloc_size = 4 * 1024;
+   alloc_buffer.phys_alignment = 4 *1024;
+   alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
+
+   r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
+      amdgpu_cs_ctx_free(ctx->ctx);
+      FREE(ctx);
+      return NULL;
+   }
+
+   r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
+      amdgpu_bo_free(buf_handle);
+      amdgpu_cs_ctx_free(ctx->ctx);
+      FREE(ctx);
+      return NULL;
+   }
+
+   memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
+   ctx->user_fence_bo = buf_handle;
+
+   return (struct radeon_winsys_ctx*)ctx;
+}
+
+static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
+{
+   amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
+}
+
+static enum pipe_reset_status
+amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
+{
+   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
+   uint32_t result, hangs;
+   int r;
+
+   r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
+      return PIPE_NO_RESET;
+   }
+
+   switch (result) {
+   case AMDGPU_CTX_GUILTY_RESET:
+      return PIPE_GUILTY_CONTEXT_RESET;
+   case AMDGPU_CTX_INNOCENT_RESET:
+      return PIPE_INNOCENT_CONTEXT_RESET;
+   case AMDGPU_CTX_UNKNOWN_RESET:
+      return PIPE_UNKNOWN_CONTEXT_RESET;
+   case AMDGPU_CTX_NO_RESET:
+   default:
+      return PIPE_NO_RESET;
+   }
+}
+
+/* COMMAND SUBMISSION */
+
+static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+{
+   /* The maximum size is 4MB - 1B, which is unaligned.
+    * Use aligned size 4MB - 16B. */
+   const unsigned max_ib_size = (1024 * 1024 - 16) * 4;
+   const unsigned min_ib_size = 24 * 1024 * 4;
+
+   cs->base.cdw = 0;
+   cs->base.buf = NULL;
+
+   /* Allocate a new buffer for IBs if the current buffer is all used. */
+   if (!cs->big_ib_buffer ||
+       cs->used_ib_space + min_ib_size > cs->big_ib_buffer->size) {
+      struct radeon_winsys *ws = &cs->ctx->ws->base;
+      struct radeon_winsys_cs_handle *winsys_bo;
+
+      pb_reference(&cs->big_ib_buffer, NULL);
+      cs->big_ib_winsys_buffer = NULL;
+      cs->ib_mapped = NULL;
+      cs->used_ib_space = 0;
+
+      cs->big_ib_buffer = ws->buffer_create(ws, max_ib_size,
+                                            4096, true,
+                                            RADEON_DOMAIN_GTT,
+                                            RADEON_FLAG_CPU_ACCESS);
+      if (!cs->big_ib_buffer)
+         return false;
+
+      winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer);
+
+      cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE);
+      if (!cs->ib_mapped) {
+         pb_reference(&cs->big_ib_buffer, NULL);
+         return false;
+      }
+
+      cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo;
+   }
+
+   cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
+   cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
+   cs->base.max_dw = (cs->big_ib_buffer->size - cs->used_ib_space) / 4;
+   return true;
+}
+
+static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
+                                      enum ring_type ring_type)
+{
+   int i;
+
+   switch (ring_type) {
+   case RING_DMA:
+      cs->request.ip_type = AMDGPU_HW_IP_DMA;
+      break;
+
+   case RING_UVD:
+      cs->request.ip_type = AMDGPU_HW_IP_UVD;
+      break;
+
+   case RING_VCE:
+      cs->request.ip_type = AMDGPU_HW_IP_VCE;
+      break;
+
+   case RING_COMPUTE:
+      cs->request.ip_type = AMDGPU_HW_IP_COMPUTE;
+      break;
+
+   default:
+   case RING_GFX:
+      cs->request.ip_type = AMDGPU_HW_IP_GFX;
+      break;
+   }
+
+   cs->request.number_of_ibs = 1;
+   cs->request.ibs = &cs->ib;
+
+   cs->max_num_buffers = 512;
+   cs->buffers = (struct amdgpu_cs_buffer*)
+                  CALLOC(1, cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer));
+   if (!cs->buffers) {
+      return FALSE;
+   }
+
+   cs->handles = CALLOC(1, cs->max_num_buffers * sizeof(amdgpu_bo_handle));
+   if (!cs->handles) {
+      FREE(cs->buffers);
+      return FALSE;
+   }
+
+   cs->flags = CALLOC(1, cs->max_num_buffers);
+   if (!cs->flags) {
+      FREE(cs->handles);
+      FREE(cs->buffers);
+      return FALSE;
+   }
+
+   for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
+      cs->buffer_indices_hashlist[i] = -1;
+   }
+   return TRUE;
+}
+
+static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
+{
+   unsigned i;
+
+   for (i = 0; i < cs->num_buffers; i++) {
+      p_atomic_dec(&cs->buffers[i].bo->num_cs_references);
+      amdgpu_winsys_bo_reference(&cs->buffers[i].bo, NULL);
+      cs->handles[i] = NULL;
+      cs->flags[i] = 0;
+   }
+
+   cs->num_buffers = 0;
+   cs->used_gart = 0;
+   cs->used_vram = 0;
+
+   for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
+      cs->buffer_indices_hashlist[i] = -1;
+   }
+}
+
+static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs)
+{
+   amdgpu_cs_context_cleanup(cs);
+   FREE(cs->flags);
+   FREE(cs->buffers);
+   FREE(cs->handles);
+   FREE(cs->request.dependencies);
+}
+
+
+static struct radeon_winsys_cs *
+amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
+                 enum ring_type ring_type,
+                 void (*flush)(void *ctx, unsigned flags,
+                               struct pipe_fence_handle **fence),
+                 void *flush_ctx,
+                 struct radeon_winsys_cs_handle *trace_buf)
+{
+   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
+   struct amdgpu_cs *cs;
+
+   cs = CALLOC_STRUCT(amdgpu_cs);
+   if (!cs) {
+      return NULL;
+   }
+
+   cs->ctx = ctx;
+   cs->flush_cs = flush;
+   cs->flush_data = flush_ctx;
+   cs->base.ring_type = ring_type;
+
+   if (!amdgpu_init_cs_context(cs, ring_type)) {
+      FREE(cs);
+      return NULL;
+   }
+
+   if (!amdgpu_get_new_ib(cs)) {
+      amdgpu_destroy_cs_context(cs);
+      FREE(cs);
+      return NULL;
+   }
+
+   p_atomic_inc(&ctx->ws->num_cs);
+   return &cs->base;
+}
+
+#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
+
+int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
+{
+   unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
+   int i = cs->buffer_indices_hashlist[hash];
+
+   /* not found or found */
+   if (i == -1 || cs->buffers[i].bo == bo)
+      return i;
+
+   /* Hash collision, look for the BO in the list of relocs linearly. */
+   for (i = cs->num_buffers - 1; i >= 0; i--) {
+      if (cs->buffers[i].bo == bo) {
+         /* Put this reloc in the hash list.
+          * This will prevent additional hash collisions if there are
+          * several consecutive get_reloc calls for the same buffer.
+          *
+          * Example: Assuming buffers A,B,C collide in the hash list,
+          * the following sequence of relocs:
+          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
+          * will collide here: ^ and here:   ^,
+          * meaning that we should get very few collisions in the end. */
+         cs->buffer_indices_hashlist[hash] = i;
+         return i;
+      }
+   }
+   return -1;
+}
+
+static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
+                                 struct amdgpu_winsys_bo *bo,
+                                 enum radeon_bo_usage usage,
+                                 enum radeon_bo_domain domains,
+                                 unsigned priority,
+                                 enum radeon_bo_domain *added_domains)
+{
+   struct amdgpu_cs_buffer *reloc;
+   unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
+   int i = -1;
+
+   priority = MIN2(priority, 15);
+   *added_domains = 0;
+
+   i = amdgpu_get_reloc(cs, bo);
+
+   if (i >= 0) {
+      reloc = &cs->buffers[i];
+      reloc->usage |= usage;
+      *added_domains = domains & ~reloc->domains;
+      reloc->domains |= domains;
+      cs->flags[i] = MAX2(cs->flags[i], priority);
+      return i;
+   }
+
+   /* New relocation, check if the backing array is large enough. */
+   if (cs->num_buffers >= cs->max_num_buffers) {
+      uint32_t size;
+      cs->max_num_buffers += 10;
+
+      size = cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer);
+      cs->buffers = realloc(cs->buffers, size);
+
+      size = cs->max_num_buffers * sizeof(amdgpu_bo_handle);
+      cs->handles = realloc(cs->handles, size);
+
+      cs->flags = realloc(cs->flags, cs->max_num_buffers);
+   }
+
+   /* Initialize the new relocation. */
+   cs->buffers[cs->num_buffers].bo = NULL;
+   amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo);
+   cs->handles[cs->num_buffers] = bo->bo;
+   cs->flags[cs->num_buffers] = priority;
+   p_atomic_inc(&bo->num_cs_references);
+   reloc = &cs->buffers[cs->num_buffers];
+   reloc->bo = bo;
+   reloc->usage = usage;
+   reloc->domains = domains;
+
+   cs->buffer_indices_hashlist[hash] = cs->num_buffers;
+
+   *added_domains = domains;
+   return cs->num_buffers++;
+}
+
+static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
+                                    struct radeon_winsys_cs_handle *buf,
+                                    enum radeon_bo_usage usage,
+                                    enum radeon_bo_domain domains,
+                                    enum radeon_bo_priority priority)
+{
+   /* Don't use the "domains" parameter. Amdgpu doesn't support changing
+    * the buffer placement during command submission.
+    */
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   enum radeon_bo_domain added_domains;
+   unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain,
+                                     priority, &added_domains);
+
+   if (added_domains & RADEON_DOMAIN_GTT)
+      cs->used_gart += bo->base.size;
+   if (added_domains & RADEON_DOMAIN_VRAM)
+      cs->used_vram += bo->base.size;
+
+   return index;
+}
+
+static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs,
+                               struct radeon_winsys_cs_handle *buf)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+
+   return amdgpu_get_reloc(cs, (struct amdgpu_winsys_bo*)buf);
+}
+
+static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
+{
+   return TRUE;
+}
+
+static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   boolean status =
+         (cs->used_gart + gtt) < cs->ctx->ws->info.gart_size * 0.7 &&
+         (cs->used_vram + vram) < cs->ctx->ws->info.vram_size * 0.7;
+
+   return status;
+}
+
+static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
+                                    struct pipe_fence_handle **out_fence)
+{
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+   struct pipe_fence_handle *fence;
+   int i, j, r;
+
+   /* Create a fence. */
+   fence = amdgpu_fence_create(cs->ctx,
+                               cs->request.ip_type,
+                               cs->request.ip_instance,
+                               cs->request.ring);
+   if (out_fence)
+      amdgpu_fence_reference(out_fence, fence);
+
+   cs->request.number_of_dependencies = 0;
+
+   /* Since the kernel driver doesn't synchronize execution between different
+    * rings automatically, we have to add fence dependencies manually. */
+   pipe_mutex_lock(ws->bo_fence_lock);
+   for (i = 0; i < cs->num_buffers; i++) {
+      for (j = 0; j < RING_LAST; j++) {
+         struct amdgpu_cs_fence *dep;
+         unsigned idx;
+
+         struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence[j];
+         if (!bo_fence)
+            continue;
+
+         if (bo_fence->ctx == cs->ctx &&
+             bo_fence->fence.ip_type == cs->request.ip_type &&
+             bo_fence->fence.ip_instance == cs->request.ip_instance &&
+             bo_fence->fence.ring == cs->request.ring)
+            continue;
+
+         if (amdgpu_fence_wait((void *)bo_fence, 0, false))
+            continue;
+
+         idx = cs->request.number_of_dependencies++;
+         if (idx >= cs->max_dependencies) {
+            unsigned size;
+
+            cs->max_dependencies = idx + 8;
+            size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
+            cs->request.dependencies = realloc(cs->request.dependencies, size);
+         }
+
+         dep = &cs->request.dependencies[idx];
+         memcpy(dep, &bo_fence->fence, sizeof(*dep));
+      }
+   }
+
+   cs->request.fence_info.handle = NULL;
+   if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) {
+	cs->request.fence_info.handle = cs->ctx->user_fence_bo;
+	cs->request.fence_info.offset = cs->base.ring_type;
+   }
+
+   r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1);
+   if (r) {
+      if (r == -ENOMEM)
+         fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
+      else
+         fprintf(stderr, "amdgpu: The CS has been rejected, "
+                 "see dmesg for more information.\n");
+
+      amdgpu_fence_signalled(fence);
+   } else {
+      /* Success. */
+      uint64_t *user_fence = NULL;
+      if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE)
+         user_fence = cs->ctx->user_fence_cpu_address_base +
+                      cs->request.fence_info.offset;
+      amdgpu_fence_submitted(fence, &cs->request, user_fence);
+
+      for (i = 0; i < cs->num_buffers; i++)
+         amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->base.ring_type],
+                                fence);
+   }
+   pipe_mutex_unlock(ws->bo_fence_lock);
+   amdgpu_fence_reference(&fence, NULL);
+}
+
+static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
+{
+   /* no-op */
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
+
+static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
+                            unsigned flags,
+                            struct pipe_fence_handle **fence,
+                            uint32_t cs_trace_id)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+
+   switch (cs->base.ring_type) {
+   case RING_DMA:
+      /* pad DMA ring to 8 DWs */
+      if (ws->info.chip_class <= SI) {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
+      } else {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0x00000000); /* NOP packet */
+      }
+      break;
+   case RING_GFX:
+      /* pad DMA ring to 8 DWs to meet CP fetch alignment requirements
+             * r6xx, requires at least 4 dw alignment to avoid a hw bug.
+             */
+      if (ws->info.chip_class <= SI) {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+      } else {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
+      }
+      break;
+   case RING_UVD:
+      while (rcs->cdw & 15)
+         OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+      break;
+   default:
+      break;
+   }
+
+   if (rcs->cdw > rcs->max_dw) {
+      fprintf(stderr, "amdgpu: command stream overflowed\n");
+   }
+
+   amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer,
+		       RADEON_USAGE_READ, 0, RADEON_PRIO_MIN);
+
+   /* If the CS is not empty or overflowed.... */
+   if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
+      int r;
+
+      r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+                                cs->handles, cs->flags,
+                                &cs->request.resources);
+
+      if (r) {
+         fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
+         cs->request.resources = NULL;
+	 goto cleanup;
+      }
+
+      cs->ib.size = cs->base.cdw;
+      cs->used_ib_space += cs->base.cdw * 4;
+
+      amdgpu_cs_do_submission(cs, fence);
+
+      /* Cleanup. */
+      if (cs->request.resources)
+         amdgpu_bo_list_destroy(cs->request.resources);
+   }
+
+cleanup:
+   amdgpu_cs_context_cleanup(cs);
+   amdgpu_get_new_ib(cs);
+
+   ws->num_cs_flushes++;
+}
+
+static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+
+   amdgpu_destroy_cs_context(cs);
+   p_atomic_dec(&cs->ctx->ws->num_cs);
+   pb_reference(&cs->big_ib_buffer, NULL);
+   FREE(cs);
+}
+
+static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
+                                       struct radeon_winsys_cs_handle *_buf,
+                                       enum radeon_bo_usage usage)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
+
+   return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
+}
+
+void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
+{
+   ws->base.ctx_create = amdgpu_ctx_create;
+   ws->base.ctx_destroy = amdgpu_ctx_destroy;
+   ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
+   ws->base.cs_create = amdgpu_cs_create;
+   ws->base.cs_destroy = amdgpu_cs_destroy;
+   ws->base.cs_add_reloc = amdgpu_cs_add_reloc;
+   ws->base.cs_get_reloc = amdgpu_cs_get_reloc;
+   ws->base.cs_validate = amdgpu_cs_validate;
+   ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit;
+   ws->base.cs_flush = amdgpu_cs_flush;
+   ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
+   ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
+   ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
+   ws->base.fence_reference = amdgpu_fence_reference;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
new file mode 100644
index 00000000000..0842259044b
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_CS_H
+#define AMDGPU_CS_H
+
+#include "amdgpu_bo.h"
+#include "util/u_memory.h"
+
+struct amdgpu_ctx {
+   struct amdgpu_winsys *ws;
+   amdgpu_context_handle ctx;
+   amdgpu_bo_handle user_fence_bo;
+   uint64_t *user_fence_cpu_address_base;
+   int refcount;
+};
+
+struct amdgpu_cs_buffer {
+   struct amdgpu_winsys_bo *bo;
+   enum radeon_bo_usage usage;
+   enum radeon_bo_domain domains;
+};
+
+
+struct amdgpu_cs {
+   struct radeon_winsys_cs base;
+   struct amdgpu_ctx *ctx;
+
+   /* Flush CS. */
+   void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
+   void *flush_data;
+
+   /* A buffer out of which new IBs are allocated. */
+   struct pb_buffer *big_ib_buffer; /* for holding the reference */
+   struct amdgpu_winsys_bo *big_ib_winsys_buffer;
+   uint8_t *ib_mapped;
+   unsigned used_ib_space;
+
+   /* amdgpu_cs_submit parameters */
+   struct amdgpu_cs_request    request;
+   struct amdgpu_cs_ib_info    ib;
+
+   /* Relocs. */
+   unsigned                    max_num_buffers;
+   unsigned                    num_buffers;
+   amdgpu_bo_handle            *handles;
+   uint8_t                     *flags;
+   struct amdgpu_cs_buffer     *buffers;
+
+   int                         buffer_indices_hashlist[512];
+
+   unsigned                    used_vram;
+   unsigned                    used_gart;
+
+   unsigned                    max_dependencies;
+};
+
+struct amdgpu_fence {
+   struct pipe_reference reference;
+
+   struct amdgpu_ctx *ctx;  /* submission context */
+   struct amdgpu_cs_fence fence;
+   uint64_t *user_fence_cpu_address;
+
+   volatile int signalled;              /* bool (int for atomicity) */
+};
+
+static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
+{
+   if (p_atomic_dec_zero(&ctx->refcount)) {
+      amdgpu_cs_ctx_free(ctx->ctx);
+      amdgpu_bo_free(ctx->user_fence_bo);
+      FREE(ctx);
+   }
+}
+
+static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
+                                          struct pipe_fence_handle *src)
+{
+   struct amdgpu_fence **rdst = (struct amdgpu_fence **)dst;
+   struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;
+
+   if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+      amdgpu_ctx_unref((*rdst)->ctx);
+      FREE(*rdst);
+   }
+   *rdst = rsrc;
+}
+
+int amdgpu_get_reloc(struct amdgpu_cs *csc, struct amdgpu_winsys_bo *bo);
+
+static inline struct amdgpu_cs *
+amdgpu_cs(struct radeon_winsys_cs *base)
+{
+   return (struct amdgpu_cs*)base;
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs,
+                              struct amdgpu_winsys_bo *bo)
+{
+   int num_refs = bo->num_cs_references;
+   return num_refs == bo->rws->num_cs ||
+         (num_refs && amdgpu_get_reloc(cs, bo) != -1);
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
+                                         struct amdgpu_winsys_bo *bo,
+                                         enum radeon_bo_usage usage)
+{
+   int index;
+
+   if (!bo->num_cs_references)
+      return FALSE;
+
+   index = amdgpu_get_reloc(cs, bo);
+   if (index == -1)
+      return FALSE;
+
+   return (cs->buffers[index].usage & usage) != 0;
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
+{
+   return bo->num_cs_references != 0;
+}
+
+bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
+                       bool absolute);
+void amdgpu_cs_init_functions(struct amdgpu_winsys *ws);
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_public.h b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
new file mode 100644
index 00000000000..ad133b20bf6
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#ifndef AMDGPU_PUBLIC_H
+#define AMDGPU_PUBLIC_H
+
+#include "pipe/p_defines.h"
+
+struct radeon_winsys;
+struct pipe_screen;
+
+typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *);
+
+struct radeon_winsys *
+amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create);
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
new file mode 100644
index 00000000000..d517762f7a2
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright © 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright © 2009 Joakim Sindholt <opensource@zhasha.com>
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+#include "amdgpu_public.h"
+
+#include "util/u_hash_table.h"
+#include <amdgpu_drm.h>
+#include <xf86drm.h>
+#include <stdio.h>
+#include <sys/stat.h>
+
+#define CIK_TILE_MODE_COLOR_2D			14
+
+#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+
+static struct util_hash_table *dev_tab = NULL;
+pipe_static_mutex(dev_tab_mutex);
+
+static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
+{
+   unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D];
+
+   switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
+   case CIK__PIPE_CONFIG__ADDR_SURF_P2:
+   default:
+       return 2;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
+       return 4;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
+       return 8;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
+       return 16;
+   }
+}
+
+/* Convert Sea Islands register values GB_ADDR_CFG and MC_ADDR_CFG
+ * into GB_TILING_CONFIG register which is only present on R600-R700. */
+static unsigned r600_get_gb_tiling_config(struct amdgpu_gpu_info *info)
+{
+   unsigned num_pipes = info->gb_addr_cfg & 0x7;
+   unsigned num_banks = info->mc_arb_ramcfg & 0x3;
+   unsigned pipe_interleave_bytes = (info->gb_addr_cfg >> 4) & 0x7;
+   unsigned row_size = (info->gb_addr_cfg >> 28) & 0x3;
+
+   return num_pipes | (num_banks << 4) |
+         (pipe_interleave_bytes << 8) |
+         (row_size << 12);
+}
+
+/* Helper function to do the ioctls needed for setup and init. */
+static boolean do_winsys_init(struct amdgpu_winsys *ws)
+{
+   struct amdgpu_buffer_size_alignments alignment_info = {};
+   struct amdgpu_heap_info vram, gtt;
+   struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {};
+   uint32_t vce_version = 0, vce_feature = 0;
+   int r;
+
+   /* Query hardware and driver information. */
+   r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_UVD, 0, &uvd);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_VCE, 0, 0,
+				     &vce_version, &vce_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
+      goto fail;
+   }
+
+   /* Set chip identification. */
+   ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
+
+   switch (ws->info.pci_id) {
+#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+
+   default:
+      fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
+      goto fail;
+   }
+
+   if (ws->info.family >= CHIP_TONGA)
+      ws->info.chip_class = VI;
+   else if (ws->info.family >= CHIP_BONAIRE)
+      ws->info.chip_class = CIK;
+   else {
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      goto fail;
+   }
+
+   /* LLVM 3.6 is required for VI. */
+   if (ws->info.chip_class >= VI &&
+       (HAVE_LLVM < 0x0306 ||
+        (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1))) {
+      fprintf(stderr, "amdgpu: LLVM 3.6.1 is required, got LLVM %i.%i.%i\n",
+              HAVE_LLVM >> 8, HAVE_LLVM & 255, MESA_LLVM_VERSION_PATCH);
+      goto fail;
+   }
+
+   /* Set hardware information. */
+   ws->info.gart_size = gtt.heap_size;
+   ws->info.vram_size = vram.heap_size;
+   /* convert the shader clock from KHz to MHz */
+   ws->info.max_sclk = ws->amdinfo.max_engine_clk / 1000;
+   ws->info.max_compute_units = 1; /* TODO */
+   ws->info.max_se = ws->amdinfo.num_shader_engines;
+   ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
+   ws->info.has_uvd = uvd.available_rings != 0;
+   ws->info.vce_fw_version =
+         vce.available_rings ? vce_version : 0;
+   ws->info.has_userptr = TRUE;
+   ws->info.r600_num_backends = ws->amdinfo.rb_pipes;
+   ws->info.r600_clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
+   ws->info.r600_tiling_config = r600_get_gb_tiling_config(&ws->amdinfo);
+   ws->info.r600_num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
+   ws->info.r600_max_pipes = ws->amdinfo.max_quad_shader_pipes; /* TODO: is this correct? */
+   ws->info.r600_virtual_address = TRUE;
+   ws->info.r600_has_dma = dma.available_rings != 0;
+
+   memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
+          sizeof(ws->amdinfo.gb_tile_mode));
+   ws->info.si_tile_mode_array_valid = TRUE;
+   ws->info.si_backend_enabled_mask = ws->amdinfo.enabled_rb_pipes_mask;
+
+   memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
+          sizeof(ws->amdinfo.gb_macro_tile_mode));
+   ws->info.cik_macrotile_mode_array_valid = TRUE;
+
+   ws->gart_page_size = alignment_info.size_remote;
+
+   return TRUE;
+
+fail:
+   amdgpu_device_deinitialize(ws->dev);
+   ws->dev = NULL;
+   return FALSE;
+}
+
+static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+
+   pipe_mutex_destroy(ws->bo_fence_lock);
+
+   ws->cman->destroy(ws->cman);
+   ws->kman->destroy(ws->kman);
+
+   amdgpu_device_deinitialize(ws->dev);
+   FREE(rws);
+}
+
+static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
+                                     struct radeon_info *info)
+{
+   *info = ((struct amdgpu_winsys *)rws)->info;
+}
+
+static boolean amdgpu_cs_request_feature(struct radeon_winsys_cs *rcs,
+                                         enum radeon_feature_id fid,
+                                         boolean enable)
+{
+   return FALSE;
+}
+
+static uint64_t amdgpu_query_value(struct radeon_winsys *rws,
+                                   enum radeon_value_id value)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+   struct amdgpu_heap_info heap;
+   uint64_t retval = 0;
+
+   switch (value) {
+   case RADEON_REQUESTED_VRAM_MEMORY:
+      return ws->allocated_vram;
+   case RADEON_REQUESTED_GTT_MEMORY:
+      return ws->allocated_gtt;
+   case RADEON_BUFFER_WAIT_TIME_NS:
+      return ws->buffer_wait_time;
+   case RADEON_TIMESTAMP:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_TIMESTAMP, 8, &retval);
+      return retval;
+   case RADEON_NUM_CS_FLUSHES:
+      return ws->num_cs_flushes;
+   case RADEON_NUM_BYTES_MOVED:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval);
+      return retval;
+   case RADEON_VRAM_USAGE:
+      amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap);
+      return heap.heap_usage;
+   case RADEON_GTT_USAGE:
+      amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &heap);
+      return heap.heap_usage;
+   case RADEON_GPU_TEMPERATURE:
+   case RADEON_CURRENT_SCLK:
+   case RADEON_CURRENT_MCLK:
+      return 0;
+   case RADEON_GPU_RESET_COUNTER:
+      assert(0);
+      return 0;
+   }
+   return 0;
+}
+
+static void amdgpu_read_registers(struct radeon_winsys *rws,
+                                  unsigned reg_offset,
+                                  unsigned num_registers, uint32_t *out)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+
+   amdgpu_read_mm_registers(ws->dev, reg_offset / 4, num_registers,
+                            0xffffffff, 0, out);
+}
+
+static unsigned hash_dev(void *key)
+{
+#if defined(PIPE_ARCH_X86_64)
+   return pointer_to_intptr(key) ^ (pointer_to_intptr(key) >> 32);
+#else
+   return pointer_to_intptr(key);
+#endif
+}
+
+static int compare_dev(void *key1, void *key2)
+{
+   return key1 != key2;
+}
+
+static bool amdgpu_winsys_unref(struct radeon_winsys *ws)
+{
+   struct amdgpu_winsys *rws = (struct amdgpu_winsys*)ws;
+   bool destroy;
+
+   /* When the reference counter drops to zero, remove the device pointer
+    * from the table.
+    * This must happen while the mutex is locked, so that
+    * amdgpu_winsys_create in another thread doesn't get the winsys
+    * from the table when the counter drops to 0. */
+   pipe_mutex_lock(dev_tab_mutex);
+
+   destroy = pipe_reference(&rws->reference, NULL);
+   if (destroy && dev_tab)
+      util_hash_table_remove(dev_tab, rws->dev);
+
+   pipe_mutex_unlock(dev_tab_mutex);
+   return destroy;
+}
+
+PUBLIC struct radeon_winsys *
+amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
+{
+   struct amdgpu_winsys *ws;
+   drmVersionPtr version = drmGetVersion(fd);
+   amdgpu_device_handle dev;
+   uint32_t drm_major, drm_minor, r;
+
+   /* The DRM driver version of amdgpu is 3.x.x. */
+   if (version->version_major != 3) {
+      drmFreeVersion(version);
+      return NULL;
+   }
+   drmFreeVersion(version);
+
+   /* Look up the winsys from the dev table. */
+   pipe_mutex_lock(dev_tab_mutex);
+   if (!dev_tab)
+      dev_tab = util_hash_table_create(hash_dev, compare_dev);
+
+   /* Initialize the amdgpu device. This should always return the same pointer
+    * for the same fd. */
+   r = amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev);
+   if (r) {
+      pipe_mutex_unlock(dev_tab_mutex);
+      fprintf(stderr, "amdgpu: amdgpu_device_initialize failed.\n");
+      return NULL;
+   }
+
+   /* Lookup a winsys if we have already created one for this device. */
+   ws = util_hash_table_get(dev_tab, dev);
+   if (ws) {
+      pipe_reference(NULL, &ws->reference);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return &ws->base;
+   }
+
+   /* Create a new winsys. */
+   ws = CALLOC_STRUCT(amdgpu_winsys);
+   if (!ws) {
+      pipe_mutex_unlock(dev_tab_mutex);
+      return NULL;
+   }
+
+   ws->dev = dev;
+   ws->info.drm_major = drm_major;
+   ws->info.drm_minor = drm_minor;
+
+   if (!do_winsys_init(ws))
+      goto fail;
+
+   /* Create managers. */
+   ws->kman = amdgpu_bomgr_create(ws);
+   if (!ws->kman)
+      goto fail;
+   ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0,
+			(ws->info.vram_size + ws->info.gart_size) / 8);
+   if (!ws->cman)
+      goto fail;
+
+   /* init reference */
+   pipe_reference_init(&ws->reference, 1);
+
+   /* Set functions. */
+   ws->base.unref = amdgpu_winsys_unref;
+   ws->base.destroy = amdgpu_winsys_destroy;
+   ws->base.query_info = amdgpu_winsys_query_info;
+   ws->base.cs_request_feature = amdgpu_cs_request_feature;
+   ws->base.query_value = amdgpu_query_value;
+   ws->base.read_registers = amdgpu_read_registers;
+
+   amdgpu_bomgr_init_functions(ws);
+   amdgpu_cs_init_functions(ws);
+
+   pipe_mutex_init(ws->bo_fence_lock);
+
+   /* Create the screen at the end. The winsys must be initialized
+    * completely.
+    *
+    * Alternatively, we could create the screen based on "ws->gen"
+    * and link all drivers into one binary blob. */
+   ws->base.screen = screen_create(&ws->base);
+   if (!ws->base.screen) {
+      amdgpu_winsys_destroy(&ws->base);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return NULL;
+   }
+
+   util_hash_table_set(dev_tab, dev, ws);
+
+   /* We must unlock the mutex once the winsys is fully initialized, so that
+    * other threads attempting to create the winsys from the same fd will
+    * get a fully initialized winsys and not just half-way initialized. */
+   pipe_mutex_unlock(dev_tab_mutex);
+
+   return &ws->base;
+
+fail:
+   pipe_mutex_unlock(dev_tab_mutex);
+   if (ws->cman)
+      ws->cman->destroy(ws->cman);
+   if (ws->kman)
+      ws->kman->destroy(ws->kman);
+   FREE(ws);
+   return NULL;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
new file mode 100644
index 00000000000..68c896814f2
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2009 Corbin Simpson
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_WINSYS_H
+#define AMDGPU_WINSYS_H
+
+#include "gallium/drivers/radeon/radeon_winsys.h"
+#include "os/os_thread.h"
+#include <amdgpu.h>
+
+struct amdgpu_cs;
+
+struct amdgpu_winsys {
+   struct radeon_winsys base;
+   struct pipe_reference reference;
+
+   amdgpu_device_handle dev;
+
+   pipe_mutex bo_fence_lock;
+
+   int num_cs; /* The number of command streams created. */
+   uint32_t next_bo_unique_id;
+   uint64_t allocated_vram;
+   uint64_t allocated_gtt;
+   uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
+   uint64_t num_cs_flushes;
+   unsigned gart_page_size;
+
+   struct radeon_info info;
+
+   struct pb_manager *kman;
+   struct pb_manager *cman;
+
+   struct amdgpu_gpu_info amdinfo;
+};
+
+static inline struct amdgpu_winsys *
+amdgpu_winsys(struct radeon_winsys *base)
+{
+   return (struct amdgpu_winsys*)base;
+}
+
+#endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 856a4ede8de..7a267f9acbf 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -550,6 +550,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
 
         default:
         case RING_GFX:
+        case RING_COMPUTE:
             cs->cst->flags[0] = 0;
             cs->cst->flags[1] = RADEON_CS_RING_GFX;
             cs->cst->cs.num_chunks = 2;
@@ -565,7 +566,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                 cs->cst->cs.num_chunks = 3;
             }
-            if (flags & RADEON_FLUSH_COMPUTE) {
+            if (cs->base.ring_type == RING_COMPUTE) {
                 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                 cs->cst->cs.num_chunks = 3;
             }

From e7fc664b91a5d886c2709d05a498f6a1dfbaf136 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 19:41:33 +0200
Subject: [PATCH 1167/1208] winsys/amdgpu: add addrlib - texture addressing and
 alignment calculator

This is an internal project that Catalyst uses and now open source will do
too.

v2: squashed these commits in:
    - winsys/amdgpu: fix warnings in addrlib
    - winsys/amdgpu: set PIPE_CONFIG and NUM_BANKS in tiling_flags
---
 src/gallium/drivers/r300/r300_state.c         |    2 +-
 src/gallium/drivers/r300/r300_texture.c       |    2 +-
 src/gallium/drivers/radeon/r600_texture.c     |    3 +-
 src/gallium/drivers/radeon/radeon_winsys.h    |    5 +-
 src/gallium/winsys/amdgpu/drm/Makefile.am     |    7 +-
 .../winsys/amdgpu/drm/Makefile.sources        |   23 +
 .../amdgpu/drm/addrlib/addrinterface.cpp      | 1008 ++++
 .../winsys/amdgpu/drm/addrlib/addrinterface.h | 2166 ++++++++
 .../winsys/amdgpu/drm/addrlib/addrtypes.h     |  590 +++
 .../amdgpu/drm/addrlib/core/addrcommon.h      |  558 ++
 .../amdgpu/drm/addrlib/core/addrelemlib.cpp   | 1674 ++++++
 .../amdgpu/drm/addrlib/core/addrelemlib.h     |  270 +
 .../amdgpu/drm/addrlib/core/addrlib.cpp       | 4023 +++++++++++++++
 .../winsys/amdgpu/drm/addrlib/core/addrlib.h  |  695 +++
 .../amdgpu/drm/addrlib/core/addrobject.cpp    |  246 +
 .../amdgpu/drm/addrlib/core/addrobject.h      |   89 +
 .../drm/addrlib/inc/chip/r800/si_gb_reg.h     |  155 +
 .../amdgpu/drm/addrlib/inc/lnx_common_defs.h  |  129 +
 .../addrlib/r800/chip/si_ci_vi_merged_enum.h  |   40 +
 .../amdgpu/drm/addrlib/r800/ciaddrlib.cpp     | 1777 +++++++
 .../amdgpu/drm/addrlib/r800/ciaddrlib.h       |  197 +
 .../amdgpu/drm/addrlib/r800/egbaddrlib.cpp    | 4575 +++++++++++++++++
 .../amdgpu/drm/addrlib/r800/egbaddrlib.h      |  411 ++
 .../amdgpu/drm/addrlib/r800/siaddrlib.cpp     | 2818 ++++++++++
 .../amdgpu/drm/addrlib/r800/siaddrlib.h       |  262 +
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     |    5 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_id.h     |  157 +
 .../winsys/amdgpu/drm/amdgpu_surface.c        |  438 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c |   50 +
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |    7 +
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c |    3 +-
 31 files changed, 22378 insertions(+), 7 deletions(-)
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp
 create mode 100644 src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_id.h
 create mode 100644 src/gallium/winsys/amdgpu/drm/amdgpu_surface.c

diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index e886df87a60..d99d5ae0152 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -844,7 +844,7 @@ static void r300_tex_set_tiling_flags(struct r300_context *r300,
         tex->tex.macrotile[level]) {
         r300->rws->buffer_set_tiling(tex->buf, r300->cs,
                 tex->tex.microtile, tex->tex.macrotile[level],
-                0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0,
                 tex->tex.stride_in_bytes[0], false);
 
         tex->surface_level = level;
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 6c01c0d21e4..5e4d50df27d 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -1063,7 +1063,7 @@ r300_texture_create_object(struct r300_screen *rscreen,
 
     rws->buffer_set_tiling(tex->buf, NULL,
             tex->tex.microtile, tex->tex.macrotile[0],
-            0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0,
             tex->tex.stride_in_bytes[0], false);
 
     return tex;
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 57c40d96e2c..a4c7034cb37 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -243,10 +243,11 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 				       RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
 				       surface->level[0].mode >= RADEON_SURF_MODE_2D ?
 				       RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
+				       surface->pipe_config,
 				       surface->bankw, surface->bankh,
 				       surface->tile_split,
 				       surface->stencil_tile_split,
-				       surface->mtilea,
+				       surface->mtilea, surface->num_banks,
 				       surface->level[0].pitch_bytes,
 				       (surface->flags & RADEON_SURF_SCANOUT) != 0);
 
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 6fa9ea4fdf2..616816ebcea 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -322,6 +322,8 @@ struct radeon_surf {
     struct radeon_surf_level    stencil_level[RADEON_SURF_MAX_LEVEL];
     uint32_t                    tiling_index[RADEON_SURF_MAX_LEVEL];
     uint32_t                    stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
+    uint32_t                    pipe_config;
+    uint32_t                    num_banks;
 };
 
 struct radeon_winsys {
@@ -446,10 +448,11 @@ struct radeon_winsys {
                               struct radeon_winsys_cs *rcs,
                               enum radeon_bo_layout microtile,
                               enum radeon_bo_layout macrotile,
+                              unsigned pipe_config,
                               unsigned bankw, unsigned bankh,
                               unsigned tile_split,
                               unsigned stencil_tile_split,
-                              unsigned mtilea,
+                              unsigned mtilea, unsigned num_banks,
                               unsigned stride,
                               bool scanout);
 
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.am b/src/gallium/winsys/amdgpu/drm/Makefile.am
index 80ecb7545b3..a719913b157 100644
--- a/src/gallium/winsys/amdgpu/drm/Makefile.am
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.am
@@ -3,7 +3,12 @@ include $(top_srcdir)/src/gallium/Automake.inc
 
 AM_CFLAGS = \
 	$(GALLIUM_WINSYS_CFLAGS) \
-	$(AMDGPU_CFLAGS)
+	$(AMDGPU_CFLAGS) \
+	-I$(srcdir)/addrlib \
+	-I$(srcdir)/addrlib/core \
+	-I$(srcdir)/addrlib/inc/chip/r800 \
+	-I$(srcdir)/addrlib/r800/chip \
+	-DBRAHMA_BUILD=1
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.sources b/src/gallium/winsys/amdgpu/drm/Makefile.sources
index 0f550103f57..6b33841b204 100644
--- a/src/gallium/winsys/amdgpu/drm/Makefile.sources
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.sources
@@ -1,8 +1,31 @@
 C_SOURCES := \
+	addrlib/addrinterface.cpp \
+	addrlib/addrinterface.h \
+	addrlib/addrtypes.h \
+	addrlib/core/addrcommon.h \
+	addrlib/core/addrelemlib.cpp \
+	addrlib/core/addrelemlib.h \
+	addrlib/core/addrlib.cpp \
+	addrlib/core/addrlib.h \
+	addrlib/core/addrobject.cpp \
+	addrlib/core/addrobject.h \
+	addrlib/inc/chip/r800/si_gb_reg.h \
+	addrlib/inc/lnx_common_defs.h \
+	addrlib/r800/chip/si_ci_merged_enum.h \
+	addrlib/r800/chip/si_ci_vi_merged_enum.h \
+	addrlib/r800/chip/si_enum.h \
+	addrlib/r800/ciaddrlib.cpp \
+	addrlib/r800/ciaddrlib.h \
+	addrlib/r800/egbaddrlib.cpp \
+	addrlib/r800/egbaddrlib.h \
+	addrlib/r800/siaddrlib.cpp \
+	addrlib/r800/siaddrlib.h \
 	amdgpu_bo.c \
 	amdgpu_bo.h \
 	amdgpu_cs.c \
 	amdgpu_cs.h \
+	amdgpu_id.h \
 	amdgpu_public.h \
+	amdgpu_surface.c \
 	amdgpu_winsys.c \
 	amdgpu_winsys.h
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp
new file mode 100644
index 00000000000..65569278b1e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp
@@ -0,0 +1,1008 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrinterface.cpp
+* @brief Contains the addrlib interface functions
+***************************************************************************************************
+*/
+#include "addrinterface.h"
+#include "addrlib.h"
+
+#include "addrcommon.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Create/Destroy/Config functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrCreate
+*
+*   @brief
+*       Create address lib object
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCreate(
+    const ADDR_CREATE_INPUT*    pAddrCreateIn,  ///< [in] infomation for creating address lib object
+    ADDR_CREATE_OUTPUT*         pAddrCreateOut) ///< [out] address lib handle
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    returnCode = AddrLib::Create(pAddrCreateIn, pAddrCreateOut);
+
+    return returnCode;
+}
+
+
+
+/**
+***************************************************************************************************
+*   AddrDestroy
+*
+*   @brief
+*       Destroy address lib object
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrDestroy(
+    ADDR_HANDLE hLib) ///< [in] address lib handle
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (hLib)
+    {
+        AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+        pLib->Destroy();
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                    Surface functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceInfo
+*
+*   @brief
+*       Calculate surface width/height/depth/alignments and suitable tiling mode
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,  ///< [in] surface information
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut) ///< [out] surface parameters and alignments
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address according to coordinates
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,  ///< [in] surface info and coordinates
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut) ///< [out] surface address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Compute coordinates according to surface address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,  ///< [in] surface info and address
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut) ///< [out] coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                   HTile functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileInfo
+*
+*   @brief
+*       Compute Htile pitch, height, base alignment and size in bytes
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,  ///< [in] Htile information
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut) ///< [out] Htile pitch, height and size in bytes
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileAddrFromCoord
+*
+*   @brief
+*       Compute Htile address according to coordinates (of depth buffer)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Htile info and coordinates
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Htile address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within depth buffer (1st pixel of a micro tile) according to
+*       Htile address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,  ///< [in] Htile info and address
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Htile coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     C-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskInfo
+*
+*   @brief
+*       Compute Cmask pitch, height, base alignment and size in bytes from color buffer
+*       info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,  ///< [in] Cmask pitch and height
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut) ///< [out] Cmask pitch, height and size in bytes
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute Cmask address according to coordinates (of MSAA color buffer)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Cmask info and coordinates
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Cmask address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within color buffer (1st pixel of a micro tile) according to
+*       Cmask address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,  ///< [in] Cmask info and address
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Cmask coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     F-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskInfo
+*
+*   @brief
+*       Compute Fmask pitch/height/depth/alignments and size in bytes
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,  ///< [in] Fmask information
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut) ///< [out] Fmask pitch and height
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Compute Fmask address according to coordinates (x,y,slice,sample,plane)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Fmask info and coordinates
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Fmask address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Compute coordinates (x,y,slice,sample,plane) according to Fmask address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,  ///< [in] Fmask info and address
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Fmask coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     DCC key functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment based on color surface size, tile info or tile index
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeDccInfo(
+    ADDR_HANDLE                             hLib,   ///< [in] handle of addrlib
+    const ADDR_COMPUTE_DCCINFO_INPUT*       pIn,    ///< [in] input
+    ADDR_COMPUTE_DCCINFO_OUTPUT*            pOut)   ///< [out] output
+{
+    ADDR_E_RETURNCODE returnCode;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+       returnCode = pLib->ComputeDccInfo(pIn, pOut);
+    }
+    else
+    {
+       returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Below functions are element related or helper functions
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrGetVersion
+*
+*   @brief
+*       Get AddrLib version number. Client may check this return value against ADDRLIB_VERSION
+*       defined in addrinterface.h to see if there is a mismatch.
+***************************************************************************************************
+*/
+UINT_32 ADDR_API AddrGetVersion(ADDR_HANDLE hLib)
+{
+    UINT_32 version = 0;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        version = pLib->GetVersion();
+    }
+
+    return version;
+}
+
+/**
+***************************************************************************************************
+*   AddrUseTileIndex
+*
+*   @brief
+*       Return TRUE if tileIndex is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseTileIndex(ADDR_HANDLE hLib)
+{
+    BOOL_32 useTileIndex = FALSE;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        useTileIndex = pLib->UseTileIndex(0);
+    }
+
+    return useTileIndex;
+}
+
+/**
+***************************************************************************************************
+*   AddrUseCombinedSwizzle
+*
+*   @brief
+*       Return TRUE if combined swizzle is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseCombinedSwizzle(ADDR_HANDLE hLib)
+{
+    BOOL_32 useCombinedSwizzle = FALSE;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        useCombinedSwizzle = pLib->UseCombinedSwizzle();
+    }
+
+    return useCombinedSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   AddrExtractBankPipeSwizzle
+*
+*   @brief
+*       Extract Bank and Pipe swizzle from base256b
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrExtractBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,     ///< [in] addrlib handle
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,      ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut)     ///< [out] output structure
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ExtractBankPipeSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrCombineBankPipeSwizzle
+*
+*   @brief
+*       Combine Bank and Pipe swizzle
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCombineBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->CombineBankPipeSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeSliceSwizzle
+*
+*   @brief
+*       Compute a swizzle for slice from a base swizzle
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSliceSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*      pIn,
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*           pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSliceTileSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeBaseSwizzle
+*
+*   @brief
+*       Return a Combined Bank and Pipe swizzle base on surface based on surface type/index
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeBaseSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeBaseSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToDepthPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToDepthPixel(
+    ADDR_HANDLE                         hLib,    ///< [in] addrlib handle
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,     ///< [in] per-component value
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT*      pOut)    ///< [out] final pixel value
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        pLib->Flt32ToDepthPixel(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToColorPixel(
+    ADDR_HANDLE                         hLib,    ///< [in] addrlib handle
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,     ///< [in] format, surface number and swap value
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT*      pOut)    ///< [out] final pixel value
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        pLib->Flt32ToColorPixel(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemGetExportNorm
+*
+*   @brief
+*       Helper function to check one format can be EXPORT_NUM,
+*       which is a register CB_COLOR_INFO.SURFACE_FORMAT.
+*       FP16 can be reported as EXPORT_NORM for rv770 in r600
+*       family
+*
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API ElemGetExportNorm(
+    ADDR_HANDLE                     hLib, ///< [in] addrlib handle
+    const ELEM_GETEXPORTNORM_INPUT* pIn)  ///< [in] input structure
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+    BOOL_32 enabled = FALSE;
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        enabled = pLib->GetExportNorm(pIn);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    ADDR_ASSERT(returnCode == ADDR_OK);
+
+    return enabled;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to hardware register value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileInfoToHW(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT*  pIn,  ///< [in] tile info with real value
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT*       pOut) ///< [out] tile info with HW register value
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileInfoToHW(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex(
+    ADDR_HANDLE                          hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINDEX_INPUT*  pIn,  ///< [in] input - tile index
+    ADDR_CONVERT_TILEINDEX_OUTPUT*       pOut) ///< [out] tile mode/type/info
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileIndex(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex1(
+    ADDR_HANDLE                          hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,  ///< [in] input - tile index
+    ADDR_CONVERT_TILEINDEX_OUTPUT*       pOut) ///< [out] tile mode/type/info
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileIndex1(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrGetTileIndex
+*
+*   @brief
+*       Get tile index from tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+*   @note
+*       Only meaningful for SI (and above)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrGetTileIndex(
+    ADDR_HANDLE                     hLib,
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut)
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->GetTileIndex(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputePrtInfo
+*
+*   @brief
+*       Interface function for ComputePrtInfo
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputePrtInfo(
+    ADDR_HANDLE                 hLib,
+    const ADDR_PRT_INFO_INPUT*  pIn,
+    ADDR_PRT_INFO_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputePrtInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h
new file mode 100644
index 00000000000..03fbf2bd0ee
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h
@@ -0,0 +1,2166 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrinterface.h
+* @brief Contains the addrlib interfaces declaration and parameter defines
+***************************************************************************************************
+*/
+#ifndef __ADDR_INTERFACE_H__
+#define __ADDR_INTERFACE_H__
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#include "addrtypes.h"
+
+#define ADDRLIB_VERSION_MAJOR 5
+#define ADDRLIB_VERSION_MINOR 25
+#define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR)
+
+/// Virtually all interface functions need ADDR_HANDLE as first parameter
+typedef VOID*   ADDR_HANDLE;
+
+/// Client handle used in callbacks
+typedef VOID*   ADDR_CLIENT_HANDLE;
+
+/**
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                  Callback functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*    typedef VOID* (ADDR_API* ADDR_ALLOCSYSMEM)(
+*         const ADDR_ALLOCSYSMEM_INPUT* pInput);
+*    typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_FREESYSMEM)(
+*         VOID* pVirtAddr);
+*    typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_DEBUGPRINT)(
+*         const ADDR_DEBUGPRINT_INPUT* pInput);
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                               Create/Destroy/Config functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrCreate()
+*     AddrDestroy()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                  Surface functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeSurfaceInfo()
+*     AddrComputeSurfaceAddrFromCoord()
+*     AddrComputeSurfaceCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   HTile functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeHtileInfo()
+*     AddrComputeHtileAddrFromCoord()
+*     AddrComputeHtileCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   C-mask functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeCmaskInfo()
+*     AddrComputeCmaskAddrFromCoord()
+*     AddrComputeCmaskCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   F-mask functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeFmaskInfo()
+*     AddrComputeFmaskAddrFromCoord()
+*     AddrComputeFmaskCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                               Element/Utility functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     ElemFlt32ToDepthPixel()
+*     ElemFlt32ToColorPixel()
+*     AddrExtractBankPipeSwizzle()
+*     AddrCombineBankPipeSwizzle()
+*     AddrComputeSliceSwizzle()
+*     AddrConvertTileInfoToHW()
+*     AddrConvertTileIndex()
+*     AddrConvertTileIndex1()
+*     AddrGetTileIndex()
+*     AddrComputeBaseSwizzle()
+*     AddrUseTileIndex()
+*     AddrUseCombinedSwizzle()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                    Dump functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrDumpSurfaceInfo()
+*     AddrDumpFmaskInfo()
+*     AddrDumpCmaskInfo()
+*     AddrDumpHtileInfo()
+*
+**/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                      Callback functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* @brief Alloc system memory flags.
+* @note These flags are reserved for future use and if flags are added will minimize the impact
+*       of the client.
+***************************************************************************************************
+*/
+typedef union _ADDR_ALLOCSYSMEM_FLAGS
+{
+    struct
+    {
+        UINT_32 reserved    : 32;  ///< Reserved for future use.
+    } fields;
+    UINT_32 value;
+
+} ADDR_ALLOCSYSMEM_FLAGS;
+
+/**
+***************************************************************************************************
+* @brief Alloc system memory input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_ALLOCSYSMEM_INPUT
+{
+    UINT_32                 size;           ///< Size of this structure in bytes
+
+    ADDR_ALLOCSYSMEM_FLAGS  flags;          ///< System memory flags.
+    UINT_32                 sizeInBytes;    ///< System memory allocation size in bytes.
+    ADDR_CLIENT_HANDLE      hClient;        ///< Client handle
+} ADDR_ALLOCSYSMEM_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_ALLOCSYSMEM
+*   @brief
+*       Allocate system memory callback function. Returns valid pointer on success.
+***************************************************************************************************
+*/
+typedef VOID* (ADDR_API* ADDR_ALLOCSYSMEM)(
+    const ADDR_ALLOCSYSMEM_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* @brief Free system memory input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_FREESYSMEM_INPUT
+{
+    UINT_32                 size;           ///< Size of this structure in bytes
+
+    VOID*                   pVirtAddr;      ///< Virtual address
+    ADDR_CLIENT_HANDLE      hClient;        ///< Client handle
+} ADDR_FREESYSMEM_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_FREESYSMEM
+*   @brief
+*       Free system memory callback function.
+*       Returns ADDR_OK on success.
+***************************************************************************************************
+*/
+typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_FREESYSMEM)(
+    const ADDR_FREESYSMEM_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* @brief Print debug message input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_DEBUGPRINT_INPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    CHAR*               pDebugString;   ///< Debug print string
+    va_list             ap;             ///< Variable argument list
+    ADDR_CLIENT_HANDLE  hClient;        ///< Client handle
+} ADDR_DEBUGPRINT_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_DEBUGPRINT
+*   @brief
+*       Print debug message callback function.
+*       Returns ADDR_OK on success.
+***************************************************************************************************
+*/
+typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_DEBUGPRINT)(
+    const ADDR_DEBUGPRINT_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* ADDR_CALLBACKS
+*
+*   @brief
+*       Address Library needs client to provide system memory alloc/free routines.
+***************************************************************************************************
+*/
+typedef struct _ADDR_CALLBACKS
+{
+    ADDR_ALLOCSYSMEM allocSysMem;   ///< Routine to allocate system memory
+    ADDR_FREESYSMEM  freeSysMem;    ///< Routine to free system memory
+    ADDR_DEBUGPRINT  debugPrint;    ///< Routine to print debug message
+} ADDR_CALLBACKS;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Create/Destroy functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* ADDR_CREATE_FLAGS
+*
+*   @brief
+*       This structure is used to pass some setup in creation of AddrLib
+*   @note
+***************************************************************************************************
+*/
+typedef union _ADDR_CREATE_FLAGS
+{
+    struct
+    {
+        UINT_32 noCubeMipSlicesPad     : 1;    ///< Turn cubemap faces padding off
+        UINT_32 fillSizeFields         : 1;    ///< If clients fill size fields in all input and
+                                               ///  output structure
+        UINT_32 useTileIndex           : 1;    ///< Make tileIndex field in input valid
+        UINT_32 useCombinedSwizzle     : 1;    ///< Use combined tile swizzle
+        UINT_32 checkLast2DLevel       : 1;    ///< Check the last 2D mip sub level
+        UINT_32 useHtileSliceAlign     : 1;    ///< Do htile single slice alignment
+        UINT_32 degradeBaseLevel       : 1;    ///< Degrade to 1D modes automatically for base level
+        UINT_32 allowLargeThickTile    : 1;    ///< Allow 64*thickness*bytesPerPixel > rowSize
+        UINT_32 reserved               : 24;   ///< Reserved bits for future use
+    };
+
+    UINT_32 value;
+} ADDR_CREATE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_REGISTER_VALUE
+*
+*   @brief
+*       Data from registers to setup AddrLib global data, used in AddrCreate
+***************************************************************************************************
+*/
+typedef struct _ADDR_REGISTER_VALUE
+{
+    UINT_32  gbAddrConfig;       ///< For R8xx, use GB_ADDR_CONFIG register value.
+                                 ///  For R6xx/R7xx, use GB_TILING_CONFIG.
+                                 ///  But they can be treated as the same.
+                                 ///  if this value is 0, use chip to set default value
+    UINT_32  backendDisables;    ///< 1 bit per backend, starting with LSB. 1=disabled,0=enabled.
+                                 ///  Register value of CC_RB_BACKEND_DISABLE.BACKEND_DISABLE
+
+                                 ///  R800 registers-----------------------------------------------
+    UINT_32  noOfBanks;          ///< Number of h/w ram banks - For r800: MC_ARB_RAMCFG.NOOFBANK
+                                 ///  No enums for this value in h/w header files
+                                 ///  0: 4
+                                 ///  1: 8
+                                 ///  2: 16
+    UINT_32  noOfRanks;          ///  MC_ARB_RAMCFG.NOOFRANK
+                                 ///  0: 1
+                                 ///  1: 2
+                                 ///  SI (R1000) registers-----------------------------------------
+    const UINT_32* pTileConfig;  ///< Global tile setting tables
+    UINT_32  noOfEntries;        ///< Number of entries in pTileConfig
+
+                                 ///< CI registers-------------------------------------------------
+    const UINT_32* pMacroTileConfig;    ///< Global macro tile mode table
+    UINT_32  noOfMacroEntries;   ///< Number of entries in pMacroTileConfig
+
+} ADDR_REGISTER_VALUE;
+
+/**
+***************************************************************************************************
+* ADDR_CREATE_INPUT
+*
+*   @brief
+*       Parameters use to create an AddrLib Object. Caller must provide all fields.
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_CREATE_INPUT
+{
+    UINT_32             size;                ///< Size of this structure in bytes
+
+    UINT_32             chipEngine;          ///< Chip Engine
+    UINT_32             chipFamily;          ///< Chip Family
+    UINT_32             chipRevision;        ///< Chip Revision
+    ADDR_CALLBACKS      callbacks;           ///< Callbacks for sysmem alloc/free/print
+    ADDR_CREATE_FLAGS   createFlags;         ///< Flags to setup AddrLib
+    ADDR_REGISTER_VALUE regValue;            ///< Data from registers to setup AddrLib global data
+    ADDR_CLIENT_HANDLE  hClient;             ///< Client handle
+    UINT_32             minPitchAlignPixels; ///< Minimum pitch alignment in pixels
+} ADDR_CREATE_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_CREATEINFO_OUTPUT
+*
+*   @brief
+*       Return AddrLib handle to client driver
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_CREATE_OUTPUT
+{
+    UINT_32     size;    ///< Size of this structure in bytes
+
+    ADDR_HANDLE hLib;    ///< Address lib handle
+} ADDR_CREATE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrCreate
+*
+*   @brief
+*       Create AddrLib object, must be called before any interface calls
+*
+*   @return
+*       ADDR_OK if successful
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCreate(
+    const ADDR_CREATE_INPUT*    pAddrCreateIn,
+    ADDR_CREATE_OUTPUT*         pAddrCreateOut);
+
+
+
+/**
+***************************************************************************************************
+*   AddrDestroy
+*
+*   @brief
+*       Destroy AddrLib object, must be called to free internally allocated resources.
+*
+*   @return
+*      ADDR_OK if successful
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrDestroy(
+    ADDR_HANDLE hLib);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                    Surface functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* @brief
+*       Bank/tiling parameters. On function input, these can be set as desired or
+*       left 0 for AddrLib to calculate/default. On function output, these are the actual
+*       parameters used.
+* @note
+*       Valid bankWidth/bankHeight value:
+*       1,2,4,8. They are factors instead of pixels or bytes.
+*
+*       The bank number remains constant across each row of the
+*       macro tile as each pipe is selected, so the number of
+*       tiles in the x direction with the same bank number will
+*       be bank_width * num_pipes.
+***************************************************************************************************
+*/
+typedef struct _ADDR_TILEINFO
+{
+    ///  Any of these parameters can be set to 0 to use the HW default.
+    UINT_32     banks;              ///< Number of banks, numerical value
+    UINT_32     bankWidth;          ///< Number of tiles in the X direction in the same bank
+    UINT_32     bankHeight;         ///< Number of tiles in the Y direction in the same bank
+    UINT_32     macroAspectRatio;   ///< Macro tile aspect ratio. 1-1:1, 2-4:1, 4-16:1, 8-64:1
+    UINT_32     tileSplitBytes;     ///< Tile split size, in bytes
+    AddrPipeCfg pipeConfig;         ///< Pipe Config = HW enum + 1
+} ADDR_TILEINFO;
+
+// Create a define to avoid client change. The removal of R800 is because we plan to implement SI
+// within 800 HWL - An AddrPipeCfg is added in above data structure
+typedef ADDR_TILEINFO ADDR_R800_TILEINFO;
+
+/**
+***************************************************************************************************
+* @brief
+*       Information needed by quad buffer stereo support
+***************************************************************************************************
+*/
+typedef struct _ADDR_QBSTEREOINFO
+{
+    UINT_32         eyeHeight;          ///< Height (in pixel rows) to right eye
+    UINT_32         rightOffset;        ///< Offset (in bytes) to right eye
+    UINT_32         rightSwizzle;       ///< TileSwizzle for right eyes
+} ADDR_QBSTEREOINFO;
+
+/**
+***************************************************************************************************
+*   ADDR_SURFACE_FLAGS
+*
+*   @brief
+*       Surface flags
+***************************************************************************************************
+*/
+typedef union _ADDR_SURFACE_FLAGS
+{
+    struct
+    {
+        UINT_32 color         : 1; ///< Flag indicates this is a color buffer
+        UINT_32 depth         : 1; ///< Flag indicates this is a depth/stencil buffer
+        UINT_32 stencil       : 1; ///< Flag indicates this is a stencil buffer
+        UINT_32 texture       : 1; ///< Flag indicates this is a texture
+        UINT_32 cube          : 1; ///< Flag indicates this is a cubemap
+
+        UINT_32 volume        : 1; ///< Flag indicates this is a volume texture
+        UINT_32 fmask         : 1; ///< Flag indicates this is an fmask
+        UINT_32 cubeAsArray   : 1; ///< Flag indicates if treat cubemap as arrays
+        UINT_32 compressZ     : 1; ///< Flag indicates z buffer is compressed
+        UINT_32 overlay       : 1; ///< Flag indicates this is an overlay surface
+        UINT_32 noStencil     : 1; ///< Flag indicates this depth has no separate stencil
+        UINT_32 display       : 1; ///< Flag indicates this should match display controller req.
+        UINT_32 opt4Space     : 1; ///< Flag indicates this surface should be optimized for space
+                                   ///  i.e. save some memory but may lose performance
+        UINT_32 prt           : 1; ///< Flag for partially resident texture
+        UINT_32 qbStereo      : 1; ///< Quad buffer stereo surface
+        UINT_32 pow2Pad       : 1; ///< SI: Pad to pow2, must set for mipmap (include level0)
+        UINT_32 interleaved   : 1; ///< Special flag for interleaved YUV surface padding
+        UINT_32 degrade4Space : 1; ///< Degrade base level's tile mode to save memory
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 dispTileType  : 1; ///< NI: force display Tiling for 128 bit shared resoruce
+        UINT_32 dccCompatible : 1; ///< VI: whether to support dcc fast clear
+        UINT_32 czDispCompatible: 1; ///< SI+: CZ family (Carrizo) has a HW bug needs special alignment.
+                                     ///<      This flag indicates we need to follow the alignment with
+                                     ///<      CZ families or other ASICs under PX configuration + CZ.
+        UINT_32 reserved      :10; ///< Reserved bits
+    };
+
+    UINT_32 value;
+} ADDR_SURFACE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_INFO_INPUT
+{
+    UINT_32             size;               ///< Size of this structure in bytes
+
+    AddrTileMode        tileMode;           ///< Tile mode
+    AddrFormat          format;             ///< If format is set to valid one, bpp/width/height
+                                            ///  might be overwritten
+    UINT_32             bpp;                ///< Bits per pixel
+    UINT_32             numSamples;         ///< Number of samples
+    UINT_32             width;              ///< Width, in pixels
+    UINT_32             height;             ///< Height, in pixels
+    UINT_32             numSlices;          ///< Number surface slice/depth,
+                                            ///  Note:
+                                            ///  For cubemap, driver clients usually set numSlices
+                                            ///  to 1 in per-face calc.
+                                            ///  For 7xx and above, we need pad faces as slices.
+                                            ///  In this case, clients should set numSlices to 6 and
+                                            ///  this is also can be turned off by createFlags when
+                                            ///  calling AddrCreate
+    UINT_32             slice;              ///< Slice index
+    UINT_32             mipLevel;           ///< Current mipmap level.
+                                            ///  Padding/tiling have different rules for level0 and
+                                            ///  sublevels
+    ADDR_SURFACE_FLAGS  flags;              ///< Surface type flags
+    UINT_32             numFrags;           ///< Number of fragments, leave it zero or the same as
+                                            ///  number of samples for normal AA; Set it to the
+                                            ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Needed by 2D tiling, for linear and 1D tiling, just keep them 0's
+    ADDR_TILEINFO*      pTileInfo;          ///< 2D tile parameters. Set to 0 to default/calculate
+    AddrTileType        tileType;           ///< Micro tiling type, not needed when tileIndex != -1
+    INT_32              tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                            ///  while the global useTileIndex is set to 1
+    UINT_32             basePitch;          ///< Base level pitch in pixels, 0 means ignored, is a
+                                            ///  must for mip levels from SI+.
+                                            ///  Don't use pitch in blocks for compressed formats!
+} ADDR_COMPUTE_SURFACE_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_INFO_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfInfo
+*   @note
+        Element: AddrLib unit for computing. e.g. BCn: 4x4 blocks; R32B32B32: 32bit with 3x pitch
+        Pixel: Original pixel
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_INFO_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         pitch;          ///< Pitch in elements (in blocks for compressed formats)
+    UINT_32         height;         ///< Height in elements (in blocks for compressed formats)
+    UINT_32         depth;          ///< Number of slice/depth
+    UINT_64         surfSize;       ///< Surface size in bytes
+    AddrTileMode    tileMode;       ///< Actual tile mode. May differ from that in input
+    UINT_32         baseAlign;      ///< Base address alignment
+    UINT_32         pitchAlign;     ///< Pitch alignment, in elements
+    UINT_32         heightAlign;    ///< Height alignment, in elements
+    UINT_32         depthAlign;     ///< Depth alignment, aligned to thickness, for 3d texture
+    UINT_32         bpp;            ///< Bits per elements (e.g. blocks for BCn, 1/3 for 96bit)
+    UINT_32         pixelPitch;     ///< Pitch in original pixels
+    UINT_32         pixelHeight;    ///< Height in original pixels
+    UINT_32         pixelBits;      ///< Original bits per pixel, passed from input
+    UINT_64         sliceSize;      ///< Size of slice specified by input's slice
+                                    ///  The result is controlled by surface flags & createFlags
+                                    ///  By default this value equals to surfSize for volume
+    UINT_32         pitchTileMax;   ///< PITCH_TILE_MAX value for h/w register
+    UINT_32         heightTileMax;  ///< HEIGHT_TILE_MAX value for h/w register
+    UINT_32         sliceTileMax;   ///< SLICE_TILE_MAX value for h/w register
+
+    UINT_32         numSamples;     ///< Pass the effective numSamples processed in this call
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< Tile parameters used. Filled in if 0 on input
+    AddrTileType    tileType;       ///< Micro tiling type, only valid when tileIndex != -1
+    INT_32          tileIndex;      ///< Tile index, MAY be "downgraded"
+
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+    /// Special information to work around SI mipmap swizzle bug UBTS #317508
+    BOOL_32         last2DLevel;    ///< TRUE if this is the last 2D(3D) tiled
+                                    ///< Only meaningful when create flag checkLast2DLevel is set
+    /// Stereo info
+    ADDR_QBSTEREOINFO*  pStereoInfo;///< Stereo information, needed when .qbStereo flag is TRUE
+} ADDR_COMPUTE_SURFACE_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceInfo
+*
+*   @brief
+*       Compute surface width/height/depth/alignments and suitable tiling mode
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_32         x;                  ///< X coordinate
+    UINT_32         y;                  ///< Y coordinate
+    UINT_32         slice;              ///< Slice index
+    UINT_32         sample;             ///< Sample index, use fragment index for EQAA
+
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSlices;          ///< Surface depth
+    UINT_32         numSamples;         ///< Number of samples
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    BOOL_32         isDepth;            ///< TRUE if the surface uses depth sample ordering within
+                                        ///  micro tile. Textures can also choose depth sample order
+    UINT_32         tileBase;           ///< Base offset (in bits) inside micro tile which handles
+                                        ///  the case that components are stored separately
+    UINT_32         compBits;           ///< The component bits actually needed(for planar surface)
+
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Used for 1D tiling above
+    AddrTileType    tileType;           ///< See defintion of AddrTileType
+    struct
+    {
+        UINT_32     ignoreSE : 1;       ///< TRUE if shader engines are ignored. This is texture
+                                        ///  only flag. Only non-RT texture can set this to TRUE
+        UINT_32     reserved :31;       ///< Reserved for future use.
+    };
+    // 2D tiling needs following structure
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+#if ADDR_AM_BUILD // These two fields are not valid in SW blt since no HTILE access
+    UINT_32         addr5Swizzle;       ///< ADDR5_SWIZZLE_MASK of DB_DEPTH_INFO
+    BOOL_32         is32ByteTile;       ///< Caller must have access to HTILE buffer and know if
+                                        ///  this tile is compressed to 32B
+#endif
+} ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfaceAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Byte address
+    UINT_32 bitPosition;    ///< Bit position within surfaceAddr, 0-7.
+                            ///  For surface bpp < 8, e.g. FMT_1.
+    UINT_32 prtBlockIndex;  ///< Index of a PRT tile (64K block)
+} ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address from a given coordinate.
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_64         addr;               ///< Address in bytes
+    UINT_32         bitPosition;        ///< Bit position in addr. 0-7. for surface bpp < 8,
+                                        ///  e.g. FMT_1;
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         pitch;              ///< Pitch, in pixels
+    UINT_32         height;             ///< Height in pixels
+    UINT_32         numSlices;          ///< Surface depth
+    UINT_32         numSamples;         ///< Number of samples
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    BOOL_32         isDepth;            ///< Surface uses depth sample ordering within micro tile.
+                                        ///  Note: Textures can choose depth sample order as well.
+    UINT_32         tileBase;           ///< Base offset (in bits) inside micro tile which handles
+                                        ///  the case that components are stored separately
+    UINT_32         compBits;           ///< The component bits actually needed(for planar surface)
+
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Used for 1D tiling above
+    AddrTileType    tileType;           ///< See defintion of AddrTileType
+    struct
+    {
+        UINT_32     ignoreSE : 1;       ///< TRUE if shader engines are ignored. This is texture
+                                        ///  only flag. Only non-RT texture can set this to TRUE
+        UINT_32     reserved :31;       ///< Reserved for future use.
+    };
+    // 2D tiling needs following structure
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+} ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfaceCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate
+    UINT_32 y;      ///< Y coordinate
+    UINT_32 slice;  ///< Index of slices
+    UINT_32 sample; ///< Index of samples, means fragment index for EQAA
+} ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Compute coordinate from a given surface address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                   HTile functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_HTILE_FLAGS
+*
+*   @brief
+*       HTILE flags
+***************************************************************************************************
+*/
+typedef union _ADDR_HTILE_FLAGS
+{
+    struct
+    {
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 reserved      :31; ///< Reserved bits
+    };
+
+    UINT_32 value;
+} ADDR_HTILE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_INFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeHtileInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_INFO_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    ADDR_HTILE_FLAGS   flags;           ///< HTILE flags
+    UINT_32            pitch;           ///< Surface pitch, in pixels
+    UINT_32            height;          ///< Surface height, in pixels
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8. EG above only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8. EG above only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_HTILE_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_INFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeHtileInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_INFO_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 pitch;          ///< Pitch in pixels of depth buffer represented in this
+                            ///  HTile buffer. This might be larger than original depth
+                            ///  buffer pitch when called with an unaligned pitch.
+    UINT_32 height;         ///< Height in pixels, as above
+    UINT_64 htileBytes;     ///< Size of HTILE buffer, in bytes
+    UINT_32 baseAlign;      ///< Base alignment
+    UINT_32 bpp;            ///< Bits per pixel for HTILE is how many bits for an 8x8 block!
+    UINT_32 macroWidth;     ///< Macro width in pixels, actually squared cache shape
+    UINT_32 macroHeight;    ///< Macro height in pixels
+    UINT_64 sliceSize;      ///< Slice size, in bytes.
+} ADDR_COMPUTE_HTILE_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileInfo
+*
+*   @brief
+*       Compute Htile pitch, height, base alignment and size in bytes
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeHtileAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    UINT_32            pitch;           ///< Pitch, in pixels
+    UINT_32            height;          ///< Height in pixels
+    UINT_32            x;               ///< X coordinate
+    UINT_32            y;               ///< Y coordinate
+    UINT_32            slice;           ///< Index of slice
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8. 1 means 8, 0 means 4. EG above only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8. 1 means 8, 0 means 4. EG above only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeHtileAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Address in bytes
+    UINT_32 bitPosition;    ///< Bit position, 0 or 4. CMASK and HTILE shares some lib method.
+                            ///  So we keep bitPosition for HTILE as well
+} ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileAddrFromCoord
+*
+*   @brief
+*       Compute Htile address according to coordinates (of depth buffer)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeHtileCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    UINT_64            addr;            ///< Address
+    UINT_32            bitPosition;     ///< Bit position 0 or 4. CMASK and HTILE share some methods
+                                        ///  so we keep bitPosition for HTILE as well
+    UINT_32            pitch;           ///< Pitch, in pixels
+    UINT_32            height;          ///< Height, in pixels
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8. 1 means 8, 0 means 4. R8xx/R9xx only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8. 1 means 8, 0 means 4. R8xx/R9xx only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeHtileCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate
+    UINT_32 y;      ///< Y coordinate
+    UINT_32 slice;  ///< Slice index
+} ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within depth buffer (1st pixel of a micro tile) according to
+*       Htile address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     C-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_CMASK_FLAGS
+*
+*   @brief
+*       CMASK flags
+***************************************************************************************************
+*/
+typedef union _ADDR_CMASK_FLAGS
+{
+    struct
+    {
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 reserved      :31; ///< Reserved bits
+    };
+
+    UINT_32 value;
+} ADDR_CMASK_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_INFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeCmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASKINFO_INPUT
+{
+    UINT_32             size;            ///< Size of this structure in bytes
+
+    ADDR_CMASK_FLAGS    flags;           ///< CMASK flags
+    UINT_32             pitch;           ///< Pitch, in pixels, of color buffer
+    UINT_32             height;          ///< Height, in pixels, of color buffer
+    UINT_32             numSlices;       ///< Number of slices, of color buffer
+    BOOL_32             isLinear;        ///< Linear or tiled layout, Only SI can be linear
+    ADDR_TILEINFO*      pTileInfo;       ///< Tile info
+
+    INT_32              tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                         ///  while the global useTileIndex is set to 1
+    INT_32              macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                         ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_CMASK_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_INFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeCmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_INFO_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 pitch;          ///< Pitch in pixels of color buffer which
+                            ///  this Cmask matches. The size might be larger than
+                            ///  original color buffer pitch when called with
+                            ///  an unaligned pitch.
+    UINT_32 height;         ///< Height in pixels, as above
+    UINT_64 cmaskBytes;     ///< Size in bytes of CMask buffer
+    UINT_32 baseAlign;      ///< Base alignment
+    UINT_32 blockMax;       ///< Cmask block size. Need this to set CB_COLORn_MASK register
+    UINT_32 macroWidth;     ///< Macro width in pixels, actually squared cache shape
+    UINT_32 macroHeight;    ///< Macro height in pixels
+    UINT_64 sliceSize;      ///< Slice size, in bytes.
+} ADDR_COMPUTE_CMASK_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskInfo
+*
+*   @brief
+*       Compute Cmask pitch, height, base alignment and size in bytes from color buffer
+*       info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeCmaskAddrFromCoord
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT
+{
+    UINT_32          size;           ///< Size of this structure in bytes
+    UINT_32          x;              ///< X coordinate
+    UINT_32          y;              ///< Y coordinate
+    UINT_64          fmaskAddr;      ///< Fmask addr for tc compatible Cmask
+    UINT_32          slice;          ///< Slice index
+    UINT_32          pitch;          ///< Pitch in pixels, of color buffer
+    UINT_32          height;         ///< Height in pixels, of color buffer
+    UINT_32          numSlices;      ///< Number of slices
+    UINT_32          bpp;
+    BOOL_32          isLinear;       ///< Linear or tiled layout, Only SI can be linear
+    ADDR_CMASK_FLAGS flags;          ///< CMASK flags
+    ADDR_TILEINFO*   pTileInfo;      ///< Tile info
+
+    INT_32           tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                     ///< while the global useTileIndex is set to 1
+    INT_32           macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                     ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeCmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< CMASK address in bytes
+    UINT_32 bitPosition;    ///< Bit position within addr, 0-7. CMASK is 4 bpp,
+                            ///  so the address may be located in bit 0 (0) or 4 (4)
+} ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute Cmask address according to coordinates (of MSAA color buffer)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeCmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT
+{
+    UINT_32        size;            ///< Size of this structure in bytes
+
+    UINT_64        addr;            ///< CMASK address in bytes
+    UINT_32        bitPosition;     ///< Bit position within addr, 0-7. CMASK is 4 bpp,
+                                    ///  so the address may be located in bit 0 (0) or 4 (4)
+    UINT_32        pitch;           ///< Pitch, in pixels
+    UINT_32        height;          ///< Height in pixels
+    UINT_32        numSlices;       ///< Number of slices
+    BOOL_32        isLinear;        ///< Linear or tiled layout, Only SI can be linear
+    ADDR_TILEINFO* pTileInfo;       ///< Tile info
+
+    INT_32         tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32         macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                    ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeCmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate
+    UINT_32 y;      ///< Y coordinate
+    UINT_32 slice;  ///< Slice index
+} ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within color buffer (1st pixel of a micro tile) according to
+*       Cmask address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     F-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_INFO_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSlices;          ///< Number of slice/depth
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if the surface is for resolved fmask, only used
+                                        ///  by H/W clients. S/W should always set it to FALSE.
+        UINT_32 reserved:  31;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tiling parameters. Clients must give valid data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+} ADDR_COMPUTE_FMASK_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_INFO_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_INFO_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         pitch;          ///< Pitch of fmask in pixels
+    UINT_32         height;         ///< Height of fmask in pixels
+    UINT_32         numSlices;      ///< Slices of fmask
+    UINT_64         fmaskBytes;     ///< Size of fmask in bytes
+    UINT_32         baseAlign;      ///< Base address alignment
+    UINT_32         pitchAlign;     ///< Pitch alignment
+    UINT_32         heightAlign;    ///< Height alignment
+    UINT_32         bpp;            ///< Bits per pixel of FMASK is: number of bit planes
+    UINT_32         numSamples;     ///< Number of samples, used for dump, export this since input
+                                    ///  may be changed in 9xx and above
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< Tile parameters used. Fmask can have different
+                                    ///  bank_height from color buffer
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+    UINT_64         sliceSize;      ///< Size of slice in bytes
+} ADDR_COMPUTE_FMASK_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskInfo
+*
+*   @brief
+*       Compute Fmask pitch/height/depth/alignments and size in bytes
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_32         x;                  ///< X coordinate
+    UINT_32         y;                  ///< Y coordinate
+    UINT_32         slice;              ///< Slice index
+    UINT_32         plane;              ///< Plane number
+    UINT_32         sample;             ///< Sample index (fragment index for EQAA)
+
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if this is a resolved fmask, used by H/W clients
+        UINT_32 ignoreSE:   1;          ///< TRUE if shader engines are ignored.
+        UINT_32 reserved:  30;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tiling parameters. Client must provide all data
+
+} ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Fmask address
+    UINT_32 bitPosition;    ///< Bit position within fmaskAddr, 0-7.
+} ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Compute Fmask address according to coordinates (x,y,slice,sample,plane)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_64         addr;               ///< Address
+    UINT_32         bitPosition;        ///< Bit position within addr, 0-7.
+
+    UINT_32         pitch;              ///< Pitch, in pixels
+    UINT_32         height;             ///< Height in pixels
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments
+    AddrTileMode    tileMode;           ///< Tile mode
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if this is a resolved fmask, used by HW components
+        UINT_32 ignoreSE:   1;          ///< TRUE if shader engines are ignored.
+        UINT_32 reserved:  30;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+
+} ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;       ///< Size of this structure in bytes
+
+    UINT_32 x;          ///< X coordinate
+    UINT_32 y;          ///< Y coordinate
+    UINT_32 slice;      ///< Slice index
+    UINT_32 plane;      ///< Plane number
+    UINT_32 sample;     ///< Sample index (fragment index for EQAA)
+} ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Compute FMASK coordinate from an given address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                          Element/utility functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrGetVersion
+*
+*   @brief
+*       Get AddrLib version number
+***************************************************************************************************
+*/
+UINT_32 ADDR_API AddrGetVersion(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   AddrUseTileIndex
+*
+*   @brief
+*       Return TRUE if tileIndex is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseTileIndex(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   AddrUseCombinedSwizzle
+*
+*   @brief
+*       Return TRUE if combined swizzle is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseCombinedSwizzle(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrExtractBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         base256b;       ///< Base256b value
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< 2D tile parameters. Client must provide all data
+
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                    ///< README: When tileIndex is not -1, this must be valid
+} ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrExtractBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 bankSwizzle;    ///< Bank swizzle
+    UINT_32 pipeSwizzle;    ///< Pipe swizzle
+} ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrExtractBankPipeSwizzle
+*
+*   @brief
+*       Extract Bank and Pipe swizzle from base256b
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrExtractBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut);
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrCombineBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         bankSwizzle;    ///< Bank swizzle
+    UINT_32         pipeSwizzle;    ///< Pipe swizzle
+    UINT_64         baseAddr;       ///< Base address (leave it zero for driver clients)
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< 2D tile parameters. Client must provide all data
+
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                    ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrCombineBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 tileSwizzle;    ///< Combined swizzle
+} ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrCombineBankPipeSwizzle
+*
+*   @brief
+*       Combine Bank and Pipe swizzle
+*   @return
+*       ADDR_OK if no error
+*   @note
+*       baseAddr here is full MCAddress instead of base256b
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCombineBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SLICESWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeSliceSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SLICESWIZZLE_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;           ///< Tile Mode
+    UINT_32         baseSwizzle;        ///< Base tile swizzle
+    UINT_32         slice;              ///< Slice index
+    UINT_64         baseAddr;           ///< Base address, driver should leave it 0 in most cases
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Actually banks needed here!
+
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_SLICESWIZZLE_INPUT;
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SLICESWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeSliceSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SLICESWIZZLE_OUTPUT
+{
+    UINT_32  size;           ///< Size of this structure in bytes
+
+    UINT_32  tileSwizzle;    ///< Recalculated tileSwizzle value
+} ADDR_COMPUTE_SLICESWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSliceSwizzle
+*
+*   @brief
+*       Extract Bank and Pipe swizzle from base256b
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSliceSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut);
+
+
+/**
+***************************************************************************************************
+*   AddrSwizzleGenOption
+*
+*   @brief
+*       Which swizzle generating options: legacy or linear
+***************************************************************************************************
+*/
+typedef enum _AddrSwizzleGenOption
+{
+    ADDR_SWIZZLE_GEN_DEFAULT    = 0,    ///< As is in client driver implemention for swizzle
+    ADDR_SWIZZLE_GEN_LINEAR     = 1,    ///< Using a linear increment of swizzle
+} AddrSwizzleGenOption;
+
+/**
+***************************************************************************************************
+*   AddrSwizzleOption
+*
+*   @brief
+*       Controls how swizzle is generated
+***************************************************************************************************
+*/
+typedef union _ADDR_SWIZZLE_OPTION
+{
+    struct
+    {
+        UINT_32 genOption       : 1;    ///< The way swizzle is generated, see AddrSwizzleGenOption
+        UINT_32 reduceBankBit   : 1;    ///< TRUE if we need reduce swizzle bits
+        UINT_32 reserved        :30;    ///< Reserved bits
+    };
+
+    UINT_32 value;
+
+} ADDR_SWIZZLE_OPTION;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_BASE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeBaseSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_BASE_SWIZZLE_INPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    ADDR_SWIZZLE_OPTION option;         ///< Swizzle option
+    UINT_32             surfIndex;      ///< Index of this surface type
+    AddrTileMode        tileMode;       ///< Tile Mode
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*      pTileInfo;      ///< 2D tile parameters. Actually banks needed here!
+
+    INT_32              tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32              macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_BASE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeBaseSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 tileSwizzle;    ///< Combined swizzle
+} ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeBaseSwizzle
+*
+*   @brief
+*       Return a Combined Bank and Pipe swizzle base on surface based on surface type/index
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeBaseSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_GETEXPORTNORM_INPUT
+*
+*   @brief
+*       Input structure for ElemGetExportNorm
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_GETEXPORTNORM_INPUT
+{
+    UINT_32             size;       ///< Size of this structure in bytes
+
+    AddrColorFormat     format;     ///< Color buffer format; Client should use ColorFormat
+    AddrSurfaceNumber   num;        ///< Surface number type; Client should use NumberType
+    AddrSurfaceSwap     swap;       ///< Surface swap byte swap; Client should use SurfaceSwap
+    UINT_32             numSamples; ///< Number of samples
+} ELEM_GETEXPORTNORM_INPUT;
+
+/**
+***************************************************************************************************
+*  ElemGetExportNorm
+*
+*   @brief
+*       Helper function to check one format can be EXPORT_NUM, which is a register
+*       CB_COLOR_INFO.SURFACE_FORMAT. FP16 can be reported as EXPORT_NORM for rv770 in r600
+*       family
+*   @note
+*       The implementation is only for r600.
+*       00 - EXPORT_FULL: PS exports are 4 pixels with 4 components with 32-bits-per-component. (two
+*       clocks per export)
+*       01 - EXPORT_NORM: PS exports are 4 pixels with 4 components with 16-bits-per-component. (one
+*       clock per export)
+*
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API ElemGetExportNorm(
+    ADDR_HANDLE                     hLib,
+    const ELEM_GETEXPORTNORM_INPUT* pIn);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TODEPTHPIXEL_INPUT
+*
+*   @brief
+*       Input structure for addrFlt32ToDepthPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TODEPTHPIXEL_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    AddrDepthFormat format;         ///< Depth buffer format
+    ADDR_FLT_32     comps[2];       ///< Component values (Z/stencil)
+} ELEM_FLT32TODEPTHPIXEL_INPUT;
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TODEPTHPIXEL_INPUT
+*
+*   @brief
+*       Output structure for ElemFlt32ToDepthPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TODEPTHPIXEL_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_8* pPixel;         ///< Real depth value. Same data type as depth buffer.
+                            ///  Client must provide enough storage for this type.
+    UINT_32 depthBase;      ///< Tile base in bits for depth bits
+    UINT_32 stencilBase;    ///< Tile base in bits for stencil bits
+    UINT_32 depthBits;      ///< Bits for depth
+    UINT_32 stencilBits;    ///< Bits for stencil
+} ELEM_FLT32TODEPTHPIXEL_OUTPUT;
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToDepthPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*
+*   @return
+*       Return code
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToDepthPixel(
+    ADDR_HANDLE                         hLib,
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TOCOLORPIXEL_INPUT
+*
+*   @brief
+*       Input structure for addrFlt32ToColorPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TOCOLORPIXEL_INPUT
+{
+    UINT_32            size;           ///< Size of this structure in bytes
+
+    AddrColorFormat    format;         ///< Color buffer format
+    AddrSurfaceNumber  surfNum;        ///< Surface number
+    AddrSurfaceSwap    surfSwap;       ///< Surface swap
+    ADDR_FLT_32        comps[4];       ///< Component values (r/g/b/a)
+} ELEM_FLT32TOCOLORPIXEL_INPUT;
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TOCOLORPIXEL_INPUT
+*
+*   @brief
+*       Output structure for ElemFlt32ToColorPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TOCOLORPIXEL_OUTPUT
+{
+    UINT_32 size;       ///< Size of this structure in bytes
+
+    UINT_8* pPixel;     ///< Real color value. Same data type as color buffer.
+                        ///  Client must provide enough storage for this type.
+} ELEM_FLT32TOCOLORPIXEL_OUTPUT;
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*
+*   @return
+*       Return code
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToColorPixel(
+    ADDR_HANDLE                         hLib,
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT*      pOut);
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINFOTOHW_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileInfoToHW
+*   @note
+*       When reverse is TRUE, indices are igonred
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINFOTOHW_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+    BOOL_32         reverse;            ///< Convert control flag.
+                                        ///  FALSE: convert from real value to HW value;
+                                        ///  TRUE: convert from HW value to real value.
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;          ///< Tile parameters with real value
+
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_CONVERT_TILEINFOTOHW_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINFOTOHW_OUTPUT
+*
+*   @brief
+*       Output structure for AddrConvertTileInfoToHW
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINFOTOHW_OUTPUT
+{
+    UINT_32             size;               ///< Size of this structure in bytes
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*      pTileInfo;          ///< Tile parameters with hardware register value
+
+} ADDR_CONVERT_TILEINFOTOHW_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to hardware register value
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileInfoToHW(
+    ADDR_HANDLE                             hLib,
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT*  pIn,
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    INT_32          tileIndex;          ///< Tile index
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+    BOOL_32         tileInfoHw;         ///< Set to TRUE if client wants HW enum, otherwise actual
+} ADDR_CONVERT_TILEINDEX_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX_OUTPUT
+*
+*   @brief
+*       Output structure for AddrConvertTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX_OUTPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    AddrTileMode        tileMode;       ///< Tile mode
+    AddrTileType        tileType;       ///< Tile type
+    ADDR_TILEINFO*      pTileInfo;      ///< Tile info
+
+} ADDR_CONVERT_TILEINDEX_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex(
+    ADDR_HANDLE                         hLib,
+    const ADDR_CONVERT_TILEINDEX_INPUT* pIn,
+    ADDR_CONVERT_TILEINDEX_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX1_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileIndex1 (without macro mode index)
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX1_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    INT_32          tileIndex;          ///< Tile index
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         numSamples;         ///< Number of samples
+    BOOL_32         tileInfoHw;         ///< Set to TRUE if client wants HW enum, otherwise actual
+} ADDR_CONVERT_TILEINDEX1_INPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex1(
+    ADDR_HANDLE                             hLib,
+    const ADDR_CONVERT_TILEINDEX1_INPUT*    pIn,
+    ADDR_CONVERT_TILEINDEX_OUTPUT*          pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_GET_TILEINDEX_INPUT
+*
+*   @brief
+*       Input structure for AddrGetTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_GET_TILEINDEX_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;       ///< Tile mode
+    AddrTileType    tileType;       ///< Tile-type: disp/non-disp/...
+    ADDR_TILEINFO*  pTileInfo;      ///< Pointer to tile-info structure, can be NULL for linear/1D
+} ADDR_GET_TILEINDEX_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_GET_TILEINDEX_OUTPUT
+*
+*   @brief
+*       Output structure for AddrGetTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_GET_TILEINDEX_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    INT_32          index;          ///< index in table
+} ADDR_GET_TILEINDEX_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrGetTileIndex
+*
+*   @brief
+*       Get the tiling mode index in table
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrGetTileIndex(
+    ADDR_HANDLE                     hLib,
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut);
+
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_PRT_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputePrtInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_PRT_INFO_INPUT
+{
+    AddrFormat          format;        ///< Surface format
+    UINT_32             baseMipWidth;  ///< Base mipmap width
+    UINT_32             baseMipHeight; ///< Base mipmap height
+    UINT_32             baseMipDepth;  ///< Base mipmap depth
+    UINT_32             numFrags;      ///< Number of fragments,
+} ADDR_PRT_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_PRT_INFO_OUTPUT
+*
+*   @brief
+*       Input structure for AddrComputePrtInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_PRT_INFO_OUTPUT
+{
+    UINT_32             prtTileWidth;
+    UINT_32             prtTileHeight;
+} ADDR_PRT_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputePrtInfo
+*
+*   @brief
+*       Compute prt surface related information
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputePrtInfo(
+    ADDR_HANDLE                 hLib,
+    const ADDR_PRT_INFO_INPUT*  pIn,
+    ADDR_PRT_INFO_OUTPUT*       pOut);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     DCC key functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   _ADDR_COMPUTE_DCCINFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeDccInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_DCCINFO_INPUT
+{
+    UINT_32             size;            ///< Size of this structure in bytes
+    UINT_32             bpp;             ///< BitPP of color surface
+    UINT_32             numSamples;      ///< Sample number of color surface
+    UINT_64             colorSurfSize;   ///< Size of color surface to which dcc key is bound
+    AddrTileMode        tileMode;        ///< Tile mode of color surface
+    ADDR_TILEINFO       tileInfo;        ///< Tile info of color surface
+    UINT_32             tileSwizzle;     ///< Tile swizzle
+    INT_32              tileIndex;       ///< Tile index of color surface,
+                                         ///< MUST be -1 if you don't want to use it
+                                         ///< while the global useTileIndex is set to 1
+    INT_32              macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                         ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_DCCINFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_DCCINFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeDccInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_DCCINFO_OUTPUT
+{
+    UINT_32 size;                 ///< Size of this structure in bytes
+    UINT_64 dccRamBaseAlign;      ///< Base alignment of dcc key
+    UINT_64 dccRamSize;           ///< Size of dcc key
+    UINT_64 dccFastClearSize;     ///< Size of dcc key portion that can be fast cleared
+    BOOL_32 subLvlCompressible;   ///< whether sub resource is compressiable
+} ADDR_COMPUTE_DCCINFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment
+*       info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeDccInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_DCCINFO_INPUT*       pIn,
+    ADDR_COMPUTE_DCCINFO_OUTPUT*            pOut);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // __ADDR_INTERFACE_H__
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h
new file mode 100644
index 00000000000..4c68ac544b8
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h
@@ -0,0 +1,590 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrtypes.h
+* @brief Contains the helper function and constants
+***************************************************************************************************
+*/
+#ifndef __ADDR_TYPES_H__
+#define __ADDR_TYPES_H__
+
+#if defined(__APPLE__) || defined(TCORE_BUILD)
+// External definitions header maintained by Mac driver team (and TCORE team)
+// Helps address compilation issues & reduces code covered by NDA
+#include "addrExtDef.h"
+
+#else
+
+// Windows and/or Linux
+#if !defined(VOID)
+typedef void           VOID;
+#endif
+
+#if !defined(FLOAT)
+typedef float          FLOAT;
+#endif
+
+#if !defined(CHAR)
+typedef char           CHAR;
+#endif
+
+#if !defined(INT)
+typedef int            INT;
+#endif
+
+#include <stdarg.h> // va_list...etc need this header
+
+#endif // defined (__APPLE__)
+
+/**
+***************************************************************************************************
+*   Calling conventions
+***************************************************************************************************
+*/
+#ifndef ADDR_CDECL
+    #if defined(__GNUC__)
+        #define ADDR_CDECL __attribute__((cdecl))
+    #else
+        #define ADDR_CDECL __cdecl
+    #endif
+#endif
+
+#ifndef ADDR_STDCALL
+    #if defined(__GNUC__)
+        #if defined(__AMD64__)
+            #define ADDR_STDCALL
+        #else
+            #define ADDR_STDCALL __attribute__((stdcall))
+        #endif
+    #else
+        #define ADDR_STDCALL __stdcall
+    #endif
+#endif
+
+#ifndef ADDR_FASTCALL
+    #if defined(__GNUC__)
+        #define ADDR_FASTCALL __attribute__((regparm(0)))
+    #else
+        #define ADDR_FASTCALL __fastcall
+    #endif
+#endif
+
+#ifndef GC_CDECL
+    #define GC_CDECL  ADDR_CDECL
+#endif
+
+#ifndef GC_STDCALL
+    #define GC_STDCALL  ADDR_STDCALL
+#endif
+
+#ifndef GC_FASTCALL
+    #define GC_FASTCALL  ADDR_FASTCALL
+#endif
+
+
+#if defined(__GNUC__)
+    #define ADDR_INLINE static inline   // inline needs to be static to link
+#else
+    // win32, win64, other platforms
+    #define ADDR_INLINE   __inline
+#endif // #if defined(__GNUC__)
+
+#define ADDR_API ADDR_FASTCALL //default call convention is fast call
+
+/**
+***************************************************************************************************
+* Global defines used by other modules
+***************************************************************************************************
+*/
+#if !defined(TILEINDEX_INVALID)
+#define TILEINDEX_INVALID                -1
+#endif
+
+#if !defined(TILEINDEX_LINEAR_GENERAL)
+#define TILEINDEX_LINEAR_GENERAL         -2
+#endif
+
+#if !defined(TILEINDEX_LINEAR_ALIGNED)
+#define TILEINDEX_LINEAR_ALIGNED          8
+#endif
+
+/**
+***************************************************************************************************
+* Return codes
+***************************************************************************************************
+*/
+typedef enum _ADDR_E_RETURNCODE
+{
+    // General Return
+    ADDR_OK    = 0,
+    ADDR_ERROR = 1,
+
+    // Specific Errors
+    ADDR_OUTOFMEMORY,
+    ADDR_INVALIDPARAMS,
+    ADDR_NOTSUPPORTED,
+    ADDR_NOTIMPLEMENTED,
+    ADDR_PARAMSIZEMISMATCH,
+    ADDR_INVALIDGBREGVALUES,
+
+} ADDR_E_RETURNCODE;
+
+/**
+***************************************************************************************************
+* @brief
+*   Neutral enums that define tile modes for all H/W
+* @note
+*   R600/R800 tiling mode can be cast to hw enums directly but never cast into HW enum from
+*   ADDR_TM_2D_TILED_XTHICK
+*
+***************************************************************************************************
+*/
+typedef enum _AddrTileMode
+{
+    ADDR_TM_LINEAR_GENERAL      = 0,    ///< Least restrictions, pitch: multiple of 8 if not buffer
+    ADDR_TM_LINEAR_ALIGNED      = 1,    ///< Requests pitch or slice to be multiple of 64 pixels
+    ADDR_TM_1D_TILED_THIN1      = 2,    ///< Linear array of 8x8 tiles
+    ADDR_TM_1D_TILED_THICK      = 3,    ///< Linear array of 8x8x4 tiles
+    ADDR_TM_2D_TILED_THIN1      = 4,    ///< A set of macro tiles consist of 8x8 tiles
+    ADDR_TM_2D_TILED_THIN2      = 5,    ///< 600 HWL only, macro tile ratio is 1:4
+    ADDR_TM_2D_TILED_THIN4      = 6,    ///< 600 HWL only, macro tile ratio is 1:16
+    ADDR_TM_2D_TILED_THICK      = 7,    ///< A set of macro tiles consist of 8x8x4 tiles
+    ADDR_TM_2B_TILED_THIN1      = 8,    ///< 600 HWL only, with bank swap
+    ADDR_TM_2B_TILED_THIN2      = 9,    ///< 600 HWL only, with bank swap and ratio is 1:4
+    ADDR_TM_2B_TILED_THIN4      = 10,   ///< 600 HWL only, with bank swap and ratio is 1:16
+    ADDR_TM_2B_TILED_THICK      = 11,   ///< 600 HWL only, with bank swap, consists of 8x8x4 tiles
+    ADDR_TM_3D_TILED_THIN1      = 12,   ///< Macro tiling w/ pipe rotation between slices
+    ADDR_TM_3D_TILED_THICK      = 13,   ///< Macro tiling w/ pipe rotation bwtween slices, thick
+    ADDR_TM_3B_TILED_THIN1      = 14,   ///< 600 HWL only, with bank swap
+    ADDR_TM_3B_TILED_THICK      = 15,   ///< 600 HWL only, with bank swap, thick
+    ADDR_TM_2D_TILED_XTHICK     = 16,   ///< Tile is 8x8x8, valid from NI
+    ADDR_TM_3D_TILED_XTHICK     = 17,   ///< Tile is 8x8x8, valid from NI
+    ADDR_TM_POWER_SAVE          = 18,   ///< Power save mode, only used by KMD on NI
+    ADDR_TM_PRT_TILED_THIN1     = 19,   ///< No bank/pipe rotation or hashing beyond macrotile size
+    ADDR_TM_PRT_2D_TILED_THIN1  = 20,   ///< Same as 2D_TILED_THIN1, PRT only
+    ADDR_TM_PRT_3D_TILED_THIN1  = 21,   ///< Same as 3D_TILED_THIN1, PRT only
+    ADDR_TM_PRT_TILED_THICK     = 22,   ///< No bank/pipe rotation or hashing beyond macrotile size
+    ADDR_TM_PRT_2D_TILED_THICK  = 23,   ///< Same as 2D_TILED_THICK, PRT only
+    ADDR_TM_PRT_3D_TILED_THICK  = 24,   ///< Same as 3D_TILED_THICK, PRT only
+    ADDR_TM_COUNT               = 25,   ///< Must be the value of the last tile mode
+} AddrTileMode;
+
+/**
+***************************************************************************************************
+*   AddrFormat
+*
+*   @brief
+*       Neutral enum for SurfaceFormat
+*
+***************************************************************************************************
+*/
+typedef enum _AddrFormat {
+    ADDR_FMT_INVALID                              = 0x00000000,
+    ADDR_FMT_8                                    = 0x00000001,
+    ADDR_FMT_4_4                                  = 0x00000002,
+    ADDR_FMT_3_3_2                                = 0x00000003,
+    ADDR_FMT_RESERVED_4                           = 0x00000004,
+    ADDR_FMT_16                                   = 0x00000005,
+    ADDR_FMT_16_FLOAT                             = 0x00000006,
+    ADDR_FMT_8_8                                  = 0x00000007,
+    ADDR_FMT_5_6_5                                = 0x00000008,
+    ADDR_FMT_6_5_5                                = 0x00000009,
+    ADDR_FMT_1_5_5_5                              = 0x0000000a,
+    ADDR_FMT_4_4_4_4                              = 0x0000000b,
+    ADDR_FMT_5_5_5_1                              = 0x0000000c,
+    ADDR_FMT_32                                   = 0x0000000d,
+    ADDR_FMT_32_FLOAT                             = 0x0000000e,
+    ADDR_FMT_16_16                                = 0x0000000f,
+    ADDR_FMT_16_16_FLOAT                          = 0x00000010,
+    ADDR_FMT_8_24                                 = 0x00000011,
+    ADDR_FMT_8_24_FLOAT                           = 0x00000012,
+    ADDR_FMT_24_8                                 = 0x00000013,
+    ADDR_FMT_24_8_FLOAT                           = 0x00000014,
+    ADDR_FMT_10_11_11                             = 0x00000015,
+    ADDR_FMT_10_11_11_FLOAT                       = 0x00000016,
+    ADDR_FMT_11_11_10                             = 0x00000017,
+    ADDR_FMT_11_11_10_FLOAT                       = 0x00000018,
+    ADDR_FMT_2_10_10_10                           = 0x00000019,
+    ADDR_FMT_8_8_8_8                              = 0x0000001a,
+    ADDR_FMT_10_10_10_2                           = 0x0000001b,
+    ADDR_FMT_X24_8_32_FLOAT                       = 0x0000001c,
+    ADDR_FMT_32_32                                = 0x0000001d,
+    ADDR_FMT_32_32_FLOAT                          = 0x0000001e,
+    ADDR_FMT_16_16_16_16                          = 0x0000001f,
+    ADDR_FMT_16_16_16_16_FLOAT                    = 0x00000020,
+    ADDR_FMT_RESERVED_33                          = 0x00000021,
+    ADDR_FMT_32_32_32_32                          = 0x00000022,
+    ADDR_FMT_32_32_32_32_FLOAT                    = 0x00000023,
+    ADDR_FMT_RESERVED_36                          = 0x00000024,
+    ADDR_FMT_1                                    = 0x00000025,
+    ADDR_FMT_1_REVERSED                           = 0x00000026,
+    ADDR_FMT_GB_GR                                = 0x00000027,
+    ADDR_FMT_BG_RG                                = 0x00000028,
+    ADDR_FMT_32_AS_8                              = 0x00000029,
+    ADDR_FMT_32_AS_8_8                            = 0x0000002a,
+    ADDR_FMT_5_9_9_9_SHAREDEXP                    = 0x0000002b,
+    ADDR_FMT_8_8_8                                = 0x0000002c,
+    ADDR_FMT_16_16_16                             = 0x0000002d,
+    ADDR_FMT_16_16_16_FLOAT                       = 0x0000002e,
+    ADDR_FMT_32_32_32                             = 0x0000002f,
+    ADDR_FMT_32_32_32_FLOAT                       = 0x00000030,
+    ADDR_FMT_BC1                                  = 0x00000031,
+    ADDR_FMT_BC2                                  = 0x00000032,
+    ADDR_FMT_BC3                                  = 0x00000033,
+    ADDR_FMT_BC4                                  = 0x00000034,
+    ADDR_FMT_BC5                                  = 0x00000035,
+    ADDR_FMT_BC6                                  = 0x00000036,
+    ADDR_FMT_BC7                                  = 0x00000037,
+    ADDR_FMT_32_AS_32_32_32_32                    = 0x00000038,
+    ADDR_FMT_APC3                                 = 0x00000039,
+    ADDR_FMT_APC4                                 = 0x0000003a,
+    ADDR_FMT_APC5                                 = 0x0000003b,
+    ADDR_FMT_APC6                                 = 0x0000003c,
+    ADDR_FMT_APC7                                 = 0x0000003d,
+    ADDR_FMT_CTX1                                 = 0x0000003e,
+    ADDR_FMT_RESERVED_63                          = 0x0000003f,
+} AddrFormat;
+
+/**
+***************************************************************************************************
+*   AddrDepthFormat
+*
+*   @brief
+*       Neutral enum for addrFlt32ToDepthPixel
+*
+***************************************************************************************************
+*/
+typedef enum _AddrDepthFormat
+{
+    ADDR_DEPTH_INVALID                            = 0x00000000,
+    ADDR_DEPTH_16                                 = 0x00000001,
+    ADDR_DEPTH_X8_24                              = 0x00000002,
+    ADDR_DEPTH_8_24                               = 0x00000003,
+    ADDR_DEPTH_X8_24_FLOAT                        = 0x00000004,
+    ADDR_DEPTH_8_24_FLOAT                         = 0x00000005,
+    ADDR_DEPTH_32_FLOAT                           = 0x00000006,
+    ADDR_DEPTH_X24_8_32_FLOAT                     = 0x00000007,
+
+} AddrDepthFormat;
+
+/**
+***************************************************************************************************
+*   AddrColorFormat
+*
+*   @brief
+*       Neutral enum for ColorFormat
+*
+***************************************************************************************************
+*/
+typedef enum _AddrColorFormat
+{
+    ADDR_COLOR_INVALID                            = 0x00000000,
+    ADDR_COLOR_8                                  = 0x00000001,
+    ADDR_COLOR_4_4                                = 0x00000002,
+    ADDR_COLOR_3_3_2                              = 0x00000003,
+    ADDR_COLOR_RESERVED_4                         = 0x00000004,
+    ADDR_COLOR_16                                 = 0x00000005,
+    ADDR_COLOR_16_FLOAT                           = 0x00000006,
+    ADDR_COLOR_8_8                                = 0x00000007,
+    ADDR_COLOR_5_6_5                              = 0x00000008,
+    ADDR_COLOR_6_5_5                              = 0x00000009,
+    ADDR_COLOR_1_5_5_5                            = 0x0000000a,
+    ADDR_COLOR_4_4_4_4                            = 0x0000000b,
+    ADDR_COLOR_5_5_5_1                            = 0x0000000c,
+    ADDR_COLOR_32                                 = 0x0000000d,
+    ADDR_COLOR_32_FLOAT                           = 0x0000000e,
+    ADDR_COLOR_16_16                              = 0x0000000f,
+    ADDR_COLOR_16_16_FLOAT                        = 0x00000010,
+    ADDR_COLOR_8_24                               = 0x00000011,
+    ADDR_COLOR_8_24_FLOAT                         = 0x00000012,
+    ADDR_COLOR_24_8                               = 0x00000013,
+    ADDR_COLOR_24_8_FLOAT                         = 0x00000014,
+    ADDR_COLOR_10_11_11                           = 0x00000015,
+    ADDR_COLOR_10_11_11_FLOAT                     = 0x00000016,
+    ADDR_COLOR_11_11_10                           = 0x00000017,
+    ADDR_COLOR_11_11_10_FLOAT                     = 0x00000018,
+    ADDR_COLOR_2_10_10_10                         = 0x00000019,
+    ADDR_COLOR_8_8_8_8                            = 0x0000001a,
+    ADDR_COLOR_10_10_10_2                         = 0x0000001b,
+    ADDR_COLOR_X24_8_32_FLOAT                     = 0x0000001c,
+    ADDR_COLOR_32_32                              = 0x0000001d,
+    ADDR_COLOR_32_32_FLOAT                        = 0x0000001e,
+    ADDR_COLOR_16_16_16_16                        = 0x0000001f,
+    ADDR_COLOR_16_16_16_16_FLOAT                  = 0x00000020,
+    ADDR_COLOR_RESERVED_33                        = 0x00000021,
+    ADDR_COLOR_32_32_32_32                        = 0x00000022,
+    ADDR_COLOR_32_32_32_32_FLOAT                  = 0x00000023,
+} AddrColorFormat;
+
+/**
+***************************************************************************************************
+*   AddrSurfaceNumber
+*
+*   @brief
+*       Neutral enum for SurfaceNumber
+*
+***************************************************************************************************
+*/
+typedef enum _AddrSurfaceNumber {
+    ADDR_NUMBER_UNORM                             = 0x00000000,
+    ADDR_NUMBER_SNORM                             = 0x00000001,
+    ADDR_NUMBER_USCALED                           = 0x00000002,
+    ADDR_NUMBER_SSCALED                           = 0x00000003,
+    ADDR_NUMBER_UINT                              = 0x00000004,
+    ADDR_NUMBER_SINT                              = 0x00000005,
+    ADDR_NUMBER_SRGB                              = 0x00000006,
+    ADDR_NUMBER_FLOAT                             = 0x00000007,
+} AddrSurfaceNumber;
+
+/**
+***************************************************************************************************
+*   AddrSurfaceSwap
+*
+*   @brief
+*       Neutral enum for SurfaceSwap
+*
+***************************************************************************************************
+*/
+typedef enum _AddrSurfaceSwap {
+    ADDR_SWAP_STD                                 = 0x00000000,
+    ADDR_SWAP_ALT                                 = 0x00000001,
+    ADDR_SWAP_STD_REV                             = 0x00000002,
+    ADDR_SWAP_ALT_REV                             = 0x00000003,
+} AddrSurfaceSwap;
+
+/**
+***************************************************************************************************
+*   AddrHtileBlockSize
+*
+*   @brief
+*       Size of HTILE blocks, valid values are 4 or 8 for now
+***************************************************************************************************
+*/
+typedef enum _AddrHtileBlockSize
+{
+    ADDR_HTILE_BLOCKSIZE_4 = 4,
+    ADDR_HTILE_BLOCKSIZE_8 = 8,
+} AddrHtileBlockSize;
+
+
+/**
+***************************************************************************************************
+*   AddrPipeCfg
+*
+*   @brief
+*       The pipe configuration field specifies both the number of pipes and
+*       how pipes are interleaved on the surface.
+*       The expression of number of pipes, the shader engine tile size, and packer tile size
+*       is encoded in a PIPE_CONFIG register field.
+*       In general the number of pipes usually matches the number of memory channels of the
+*       hardware configuration.
+*       For hw configurations w/ non-pow2 memory number of memory channels, it usually matches
+*       the number of ROP units(? TODO: which registers??)
+*       The enum value = hw enum + 1 which is to reserve 0 for requesting default.
+***************************************************************************************************
+*/
+typedef enum _AddrPipeCfg
+{
+    ADDR_PIPECFG_INVALID         = 0,
+    ADDR_PIPECFG_P2              = 1, /// 2 pipes,
+    ADDR_PIPECFG_P4_8x16         = 5, /// 4 pipes,
+    ADDR_PIPECFG_P4_16x16        = 6,
+    ADDR_PIPECFG_P4_16x32        = 7,
+    ADDR_PIPECFG_P4_32x32        = 8,
+    ADDR_PIPECFG_P8_16x16_8x16   = 9, /// 8 pipes
+    ADDR_PIPECFG_P8_16x32_8x16   = 10,
+    ADDR_PIPECFG_P8_32x32_8x16   = 11,
+    ADDR_PIPECFG_P8_16x32_16x16  = 12,
+    ADDR_PIPECFG_P8_32x32_16x16  = 13,
+    ADDR_PIPECFG_P8_32x32_16x32  = 14,
+    ADDR_PIPECFG_P8_32x64_32x32  = 15,
+    ADDR_PIPECFG_P16_32x32_8x16  = 17, /// 16 pipes
+    ADDR_PIPECFG_P16_32x32_16x16 = 18,
+    ADDR_PIPECFG_MAX             = 19,
+} AddrPipeCfg;
+
+/**
+***************************************************************************************************
+* AddrTileType
+*
+*   @brief
+*       Neutral enums that specifies micro tile type (MICRO_TILE_MODE)
+***************************************************************************************************
+*/
+typedef enum _AddrTileType
+{
+    ADDR_DISPLAYABLE        = 0,    ///< Displayable tiling
+    ADDR_NON_DISPLAYABLE    = 1,    ///< Non-displayable tiling, a.k.a thin micro tiling
+    ADDR_DEPTH_SAMPLE_ORDER = 2,    ///< Same as non-displayable plus depth-sample-order
+    ADDR_ROTATED            = 3,    ///< Rotated displayable tiling
+    ADDR_THICK              = 4,    ///< Thick micro-tiling, only valid for THICK and XTHICK
+} AddrTileType;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Type definitions: short system-independent names for address library types
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__APPLE__)
+
+#ifndef BOOL_32        // no bool type in C
+/// @brief Boolean type, since none is defined in C
+/// @ingroup type
+#define BOOL_32 int
+#endif
+
+#ifndef INT_32
+#define INT_32  int
+#endif
+
+#ifndef UINT_32
+#define UINT_32 unsigned int
+#endif
+
+#ifndef INT_16
+#define INT_16  short
+#endif
+
+#ifndef UINT_16
+#define UINT_16 unsigned short
+#endif
+
+#ifndef INT_8
+#define INT_8   char
+#endif
+
+#ifndef UINT_8
+#define UINT_8  unsigned char
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+//
+//  64-bit integer types depend on the compiler
+//
+#if defined( __GNUC__ ) || defined( __WATCOMC__ )
+#define INT_64   long long
+#define UINT_64  unsigned long long
+
+#elif defined( _WIN32 )
+#define INT_64   __int64
+#define UINT_64  unsigned __int64
+
+#else
+#error Unsupported compiler and/or operating system for 64-bit integers
+
+/// @brief 64-bit signed integer type (compiler dependent)
+/// @ingroup type
+///
+/// The addrlib defines a 64-bit signed integer type for either
+/// Gnu/Watcom compilers (which use the first syntax) or for
+/// the Windows VCC compiler (which uses the second syntax).
+#define INT_64  long long OR __int64
+
+/// @brief 64-bit unsigned integer type (compiler dependent)
+/// @ingroup type
+///
+/// The addrlib defines a 64-bit unsigned integer type for either
+/// Gnu/Watcom compilers (which use the first syntax) or for
+/// the Windows VCC compiler (which uses the second syntax).
+///
+#define UINT_64  unsigned long long OR unsigned __int64
+#endif
+
+#endif // #if !defined(__APPLE__)
+
+//  ADDR64X is used to print addresses in hex form on both Windows and Linux
+//
+#if defined( __GNUC__ ) || defined( __WATCOMC__ )
+#define ADDR64X "llx"
+#define ADDR64D "lld"
+
+#elif defined( _WIN32 )
+#define ADDR64X "I64x"
+#define ADDR64D "I64d"
+
+#else
+#error Unsupported compiler and/or operating system for 64-bit integers
+
+/// @brief Addrlib device address 64-bit printf tag  (compiler dependent)
+/// @ingroup type
+///
+/// This allows printf to display an ADDR_64 for either the Windows VCC compiler
+/// (which used this value) or the Gnu/Watcom compilers (which use "llx".
+/// An example of use is printf("addr 0x%"ADDR64X"\n", address);
+///
+#define ADDR64X "llx" OR "I64x"
+#define ADDR64D "lld" OR "I64d"
+#endif
+
+
+/// @brief Union for storing a 32-bit float or 32-bit integer
+/// @ingroup type
+///
+/// This union provides a simple way to convert between a 32-bit float
+/// and a 32-bit integer. It also prevents the compiler from producing
+/// code that alters NaN values when assiging or coying floats.
+/// Therefore, all address library routines that pass or return 32-bit
+/// floating point data do so by passing or returning a FLT_32.
+///
+typedef union {
+    INT_32   i;
+    UINT_32  u;
+    float    f;
+} ADDR_FLT_32;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Macros for controlling linking and building on multiple systems
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+#if defined(va_copy)
+#undef va_copy  //redefine va_copy to support VC2013
+#endif
+#endif
+
+#if !defined(va_copy)
+#define va_copy(dst, src) \
+    ((void) memcpy(&(dst), &(src), sizeof(va_list)))
+#endif
+
+#endif // __ADDR_TYPES_H__
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h
new file mode 100644
index 00000000000..f996c9a3402
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrcommon.h
+* @brief Contains the helper function and constants
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_COMMON_H__
+#define __ADDR_COMMON_H__
+
+#include "addrinterface.h"
+
+
+// ADDR_LNX_KERNEL_BUILD is for internal build
+// Moved from addrinterface.h so __KERNEL__ is not needed any more
+#if ADDR_LNX_KERNEL_BUILD // || (defined(__GNUC__) && defined(__KERNEL__))
+    #include "lnx_common_defs.h" // ported from cmmqs
+#elif !defined(__APPLE__)
+    #include <stdlib.h>
+    #include <string.h>
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Common constants
+///////////////////////////////////////////////////////////////////////////////////////////////////
+static const UINT_32 MicroTileWidth      = 8;       ///< Micro tile width, for 1D and 2D tiling
+static const UINT_32 MicroTileHeight     = 8;       ///< Micro tile height, for 1D and 2D tiling
+static const UINT_32 ThickTileThickness  = 4;       ///< Micro tile thickness, for THICK modes
+static const UINT_32 XThickTileThickness = 8;       ///< Extra thick tiling thickness
+static const UINT_32 PowerSaveTileBytes  = 64;      ///< Nuber of bytes per tile for power save 64
+static const UINT_32 CmaskCacheBits      = 1024;    ///< Number of bits for CMASK cache
+static const UINT_32 CmaskElemBits       = 4;       ///< Number of bits for CMASK element
+static const UINT_32 HtileCacheBits      = 16384;   ///< Number of bits for HTILE cache 512*32
+
+static const UINT_32 MicroTilePixels     = MicroTileWidth * MicroTileHeight;
+
+static const INT_32 TileIndexInvalid        = TILEINDEX_INVALID;
+static const INT_32 TileIndexLinearGeneral  = TILEINDEX_LINEAR_GENERAL;
+static const INT_32 TileIndexNoMacroIndex   = -3;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Common macros
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(x) ( ((x) + (BITS_PER_BYTE-1)) / BITS_PER_BYTE )
+#define BYTES_TO_BITS(x) ( (x) * BITS_PER_BYTE )
+
+/// Helper macros to select a single bit from an int (undefined later in section)
+#define _BIT(v,b)      (((v) >> (b) ) & 1)
+
+/**
+***************************************************************************************************
+* @brief Enums to identify AddrLib type
+***************************************************************************************************
+*/
+enum AddrLibClass
+{
+    BASE_ADDRLIB = 0x0,
+    R600_ADDRLIB = 0x6,
+    R800_ADDRLIB = 0x8,
+    SI_ADDRLIB   = 0xa,
+    CI_ADDRLIB   = 0xb,
+};
+
+/**
+***************************************************************************************************
+* AddrChipFamily
+*
+*   @brief
+*       Neutral enums that specifies chip family.
+*
+***************************************************************************************************
+*/
+enum AddrChipFamily
+{
+    ADDR_CHIP_FAMILY_IVLD,    ///< Invalid family
+    ADDR_CHIP_FAMILY_R6XX,
+    ADDR_CHIP_FAMILY_R7XX,
+    ADDR_CHIP_FAMILY_R8XX,
+    ADDR_CHIP_FAMILY_NI,
+    ADDR_CHIP_FAMILY_SI,
+    ADDR_CHIP_FAMILY_CI,
+    ADDR_CHIP_FAMILY_VI,
+};
+
+/**
+***************************************************************************************************
+* ADDR_CONFIG_FLAGS
+*
+*   @brief
+*       This structure is used to set addr configuration flags.
+***************************************************************************************************
+*/
+union ADDR_CONFIG_FLAGS
+{
+    struct
+    {
+        /// Clients do not need to set these flags except forceLinearAligned.
+        /// There flags are set up by AddrLib inside thru AddrInitGlobalParamsFromRegister
+        UINT_32 optimalBankSwap        : 1;    ///< New bank tiling for RV770 only
+        UINT_32 noCubeMipSlicesPad     : 1;    ///< Disables faces padding for cubemap mipmaps
+        UINT_32 fillSizeFields         : 1;    ///< If clients fill size fields in all input and
+                                               ///  output structure
+        UINT_32 ignoreTileInfo         : 1;    ///< Don't use tile info structure
+        UINT_32 useTileIndex           : 1;    ///< Make tileIndex field in input valid
+        UINT_32 useCombinedSwizzle     : 1;    ///< Use combined swizzle
+        UINT_32 checkLast2DLevel       : 1;    ///< Check the last 2D mip sub level
+        UINT_32 useHtileSliceAlign     : 1;    ///< Do htile single slice alignment
+        UINT_32 degradeBaseLevel       : 1;    ///< Degrade to 1D modes automatically for base level
+        UINT_32 allowLargeThickTile    : 1;    ///< Allow 64*thickness*bytesPerPixel > rowSize
+        UINT_32 reserved               : 22;   ///< Reserved bits for future use
+    };
+
+    UINT_32 value;
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Platform specific debug break defines
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+    #if defined(__GNUC__)
+        #define ADDR_DBG_BREAK()
+    #elif defined(__APPLE__)
+        #define ADDR_DBG_BREAK()    { IOPanic("");}
+    #else
+        #define ADDR_DBG_BREAK()    { __debugbreak(); }
+    #endif
+#else
+    #define ADDR_DBG_BREAK()
+#endif
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Debug assertions used in AddrLib
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+#define ADDR_ASSERT(__e) if ( !((__e) ? TRUE : FALSE)) { ADDR_DBG_BREAK(); }
+#define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK()
+#define ADDR_UNHANDLED_CASE() ADDR_ASSERT(!"Unhandled case")
+#define ADDR_NOT_IMPLEMENTED() ADDR_ASSERT(!"Not implemented");
+#else //DEBUG
+#define ADDR_ASSERT(__e)
+#define ADDR_ASSERT_ALWAYS()
+#define ADDR_UNHANDLED_CASE()
+#define ADDR_NOT_IMPLEMENTED()
+#endif //DEBUG
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Debug print macro from legacy address library
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+
+#define ADDR_PRNT(a)    AddrObject::DebugPrint a
+
+/// @brief Macro for reporting informational messages
+/// @ingroup util
+///
+/// This macro optionally prints an informational message to stdout.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second pararmeter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_INFO(0, ("test %d",3) ); prints out "test 3".
+///
+#define ADDR_INFO(cond, a)         \
+{ if (!(cond)) { ADDR_PRNT(a); } }
+
+
+/// @brief Macro for reporting error warning messages
+/// @ingroup util
+///
+/// This macro optionally prints an error warning message to stdout,
+/// followed by the file name and line number where the macro was called.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second pararmeter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_WARN(0, ("test %d",3) ); prints out "test 3" followed by
+/// a second line with the file name and line number.
+///
+#define ADDR_WARN(cond, a)         \
+{ if (!(cond))                     \
+  { ADDR_PRNT(a);                  \
+    ADDR_PRNT(("  WARNING in file %s, line %d\n", __FILE__, __LINE__)); \
+} }
+
+
+/// @brief Macro for reporting fatal error conditions
+/// @ingroup util
+///
+/// This macro optionally stops execution of the current routine
+/// after printing an error warning message to stdout,
+/// followed by the file name and line number where the macro was called.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second pararmeter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_EXIT(0, ("test %d",3) ); prints out "test 3" followed by
+/// a second line with the file name and line number, then stops execution.
+///
+#define ADDR_EXIT(cond, a)         \
+{ if (!(cond))                     \
+  { ADDR_PRNT(a); ADDR_DBG_BREAK();\
+} }
+
+#else // DEBUG
+
+#define ADDRDPF 1 ? (void)0 : (void)
+
+#define ADDR_PRNT(a)
+
+#define ADDR_DBG_BREAK()
+
+#define ADDR_INFO(cond, a)
+
+#define ADDR_WARN(cond, a)
+
+#define ADDR_EXIT(cond, a)
+
+#endif // DEBUG
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Misc helper functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrXorReduce
+*
+*   @brief
+*       Xor the right-side numberOfBits bits of x.
+***************************************************************************************************
+*/
+static inline UINT_32 XorReduce(
+    UINT_32 x,
+    UINT_32 numberOfBits)
+{
+    UINT_32 i;
+    UINT_32 result = x & 1;
+
+    for (i=1; i<numberOfBits; i++)
+    {
+        result ^= ((x>>i) & 1);
+    }
+
+    return result;
+}
+
+/**
+***************************************************************************************************
+*   IsPow2
+*
+*   @brief
+*       Check if the size (UINT_32) is pow 2
+***************************************************************************************************
+*/
+static inline UINT_32 IsPow2(
+    UINT_32 dim)        ///< [in] dimension of miplevel
+{
+    ADDR_ASSERT(dim > 0);
+    return !(dim & (dim - 1));
+}
+
+/**
+***************************************************************************************************
+*   IsPow2
+*
+*   @brief
+*       Check if the size (UINT_64) is pow 2
+***************************************************************************************************
+*/
+static inline UINT_64 IsPow2(
+    UINT_64 dim)        ///< [in] dimension of miplevel
+{
+    ADDR_ASSERT(dim > 0);
+    return !(dim & (dim - 1));
+}
+
+/**
+***************************************************************************************************
+*   ByteAlign
+*
+*   @brief
+*       Align UINT_32 "x" to "align" alignment, "align" should be power of 2
+***************************************************************************************************
+*/
+static inline UINT_32 PowTwoAlign(
+    UINT_32 x,
+    UINT_32 align)
+{
+    //
+    // Assert that x is a power of two.
+    //
+    ADDR_ASSERT(IsPow2(align));
+    return (x + (align - 1)) & (~(align - 1));
+}
+
+/**
+***************************************************************************************************
+*   ByteAlign
+*
+*   @brief
+*       Align UINT_64 "x" to "align" alignment, "align" should be power of 2
+***************************************************************************************************
+*/
+static inline UINT_64 PowTwoAlign(
+    UINT_64 x,
+    UINT_64 align)
+{
+    //
+    // Assert that x is a power of two.
+    //
+    ADDR_ASSERT(IsPow2(align));
+    return (x + (align - 1)) & (~(align - 1));
+}
+
+/**
+***************************************************************************************************
+*   Min
+*
+*   @brief
+*       Get the min value between two unsigned values
+***************************************************************************************************
+*/
+static inline UINT_32 Min(
+    UINT_32 value1,
+    UINT_32 value2)
+{
+    return ((value1 < (value2)) ? (value1) : value2);
+}
+
+/**
+***************************************************************************************************
+*   Min
+*
+*   @brief
+*       Get the min value between two signed values
+***************************************************************************************************
+*/
+static inline INT_32 Min(
+    INT_32 value1,
+    INT_32 value2)
+{
+    return ((value1 < (value2)) ? (value1) : value2);
+}
+
+/**
+***************************************************************************************************
+*   Max
+*
+*   @brief
+*       Get the max value between two unsigned values
+***************************************************************************************************
+*/
+static inline UINT_32 Max(
+    UINT_32 value1,
+    UINT_32 value2)
+{
+    return ((value1 > (value2)) ? (value1) : value2);
+}
+
+/**
+***************************************************************************************************
+*   Max
+*
+*   @brief
+*       Get the max value between two signed values
+***************************************************************************************************
+*/
+static inline INT_32 Max(
+    INT_32 value1,
+    INT_32 value2)
+{
+    return ((value1 > (value2)) ? (value1) : value2);
+}
+
+/**
+***************************************************************************************************
+*   NextPow2
+*
+*   @brief
+*       Compute the mipmap's next level dim size
+***************************************************************************************************
+*/
+static inline UINT_32 NextPow2(
+    UINT_32 dim)        ///< [in] dimension of miplevel
+{
+    UINT_32 newDim;
+
+    newDim = 1;
+
+    if (dim > 0x7fffffff)
+    {
+        ADDR_ASSERT_ALWAYS();
+        newDim = 0x80000000;
+    }
+    else
+    {
+        while (newDim < dim)
+        {
+            newDim <<= 1;
+        }
+    }
+
+    return newDim;
+}
+
+/**
+***************************************************************************************************
+*   Log2
+*
+*   @brief
+*       Compute log of base 2
+***************************************************************************************************
+*/
+static inline UINT_32 Log2(
+    UINT_32 x)      ///< [in] the value should calculate log based 2
+{
+    UINT_32 y;
+
+    //
+    // Assert that x is a power of two.
+    //
+    ADDR_ASSERT(IsPow2(x));
+
+    y = 0;
+    while (x > 1)
+    {
+        x >>= 1;
+        y++;
+    }
+
+    return y;
+}
+
+/**
+***************************************************************************************************
+*   QLog2
+*
+*   @brief
+*       Compute log of base 2 quickly (<= 16)
+***************************************************************************************************
+*/
+static inline UINT_32 QLog2(
+    UINT_32 x)      ///< [in] the value should calculate log based 2
+{
+    ADDR_ASSERT(x <= 16);
+
+    UINT_32 y = 0;
+
+    switch (x)
+    {
+        case 1:
+            y = 0;
+            break;
+        case 2:
+            y = 1;
+            break;
+        case 4:
+            y = 2;
+            break;
+        case 8:
+            y = 3;
+            break;
+        case 16:
+            y = 4;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+    }
+
+    return y;
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       NULL pointer safe assignment
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    UINT_32*    pLVal,  ///< [in] Pointer to left val
+    UINT_32     rVal)   ///< [in] Right value
+{
+    if (pLVal)
+    {
+        *pLVal = rVal;
+    }
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       NULL pointer safe assignment for 64bit values
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    UINT_64*    pLVal,  ///< [in] Pointer to left val
+    UINT_64     rVal)   ///< [in] Right value
+{
+    if (pLVal)
+    {
+        *pLVal = rVal;
+    }
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       NULL pointer safe assignment for AddrTileMode
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    AddrTileMode*    pLVal, ///< [in] Pointer to left val
+    AddrTileMode     rVal)  ///< [in] Right value
+{
+    if (pLVal)
+    {
+        *pLVal = rVal;
+    }
+}
+
+#endif // __ADDR_COMMON_H__
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp
new file mode 100644
index 00000000000..76b1badf958
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp
@@ -0,0 +1,1674 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrelemlib.cpp
+* @brief Contains the class implementation for element/pixel related functions
+***************************************************************************************************
+*/
+
+#include "addrelemlib.h"
+#include "addrlib.h"
+
+
+/**
+***************************************************************************************************
+*   AddrElemLib::AddrElemLib
+*
+*   @brief
+*       constructor
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+AddrElemLib::AddrElemLib(
+    AddrLib* const pAddrLib) :  ///< [in] Parent addrlib instance pointer
+    AddrObject(pAddrLib->GetClient()),
+    m_pAddrLib(pAddrLib)
+{
+    switch (m_pAddrLib->GetAddrChipFamily())
+    {
+        case ADDR_CHIP_FAMILY_R6XX:
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R600;
+            m_fp16ExportNorm = 0;
+            break;
+        case ADDR_CHIP_FAMILY_R7XX:
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R600;
+            m_fp16ExportNorm = 1;
+            break;
+        case ADDR_CHIP_FAMILY_R8XX:
+        case ADDR_CHIP_FAMILY_NI: // Same as 8xx
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R800;
+            m_fp16ExportNorm = 1;
+            break;
+        default:
+            m_fp16ExportNorm = 1;
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R800;
+    }
+
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::~AddrElemLib
+*
+*   @brief
+*       destructor
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+AddrElemLib::~AddrElemLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::Create
+*
+*   @brief
+*       Creates and initializes AddrLib object.
+*
+*   @return
+*       Returns point to ADDR_CREATEINFO if successful.
+***************************************************************************************************
+*/
+AddrElemLib* AddrElemLib::Create(
+    const AddrLib* const        pAddrLib)   ///< [in] Pointer of parent AddrLib instance
+{
+    AddrElemLib* pElemLib = NULL;
+
+    if (pAddrLib)
+    {
+        pElemLib = new(pAddrLib->GetClient()) AddrElemLib(const_cast<AddrLib* const>(pAddrLib));
+    }
+
+    return pElemLib;
+}
+
+/**************************************************************************************************
+*   AddrElemLib::Flt32sToInt32s
+*
+*   @brief
+*       Convert a ADDR_FLT_32 value to Int32 value
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32sToInt32s(
+    ADDR_FLT_32     value,      ///< [in] ADDR_FLT_32 value
+    UINT_32         bits,       ///< [in] nubmer of bits in value
+    AddrNumberType  numberType, ///< [in] the type of number
+    UINT_32*        pResult)    ///< [out] Int32 value
+{
+    UINT_8 round = 128;    //ADDR_ROUND_BY_HALF
+    UINT_32 uscale;
+    UINT_32 sign;
+
+    //convert each component to an INT_32
+    switch ( numberType )
+    {
+        case ADDR_NO_NUMBER:    //fall through
+        case ADDR_ZERO:         //fall through
+        case ADDR_ONE:          //fall through
+        case ADDR_EPSILON:      //fall through
+            return;        // these are zero-bit components, so don't set result
+
+        case ADDR_UINT_BITS:            // unsigned integer bit field, clamped to range
+            uscale = (1<<bits) - 1;
+            if (bits == 32)               // special case unsigned 32-bit int
+            {
+                *pResult = value.i;
+            }
+            else
+            {
+                if ((value.i < 0) || (value.u > uscale))
+                {
+                    *pResult = uscale;
+                }
+                else
+                {
+                    *pResult = value.i;
+                }
+                return;
+            }
+
+        // The algorithm used in the DB and TX differs at one value for 24-bit unorms
+        case ADDR_UNORM_R6XXDB:        // unsigned repeating fraction
+            if ((bits==24) && (value.i == 0x33000000))
+            {
+                *pResult = 1;
+                return;
+            }              // Else treat like ADDR_UNORM_R6XX
+
+        case ADDR_UNORM_R6XX:            // unsigned repeating fraction
+            if (value.f <= 0)
+            {
+                *pResult = 0;            // first clamp to [0..1]
+            }
+            else
+            {
+                if (value.f >= 1)
+                {
+                     *pResult = (1<<bits) - 1;
+                }
+                else
+                {
+                    if ((value.i | 0x87FFFFFF) == 0xFFFFFFFF)
+                    {
+                        *pResult = 0;                        // NaN, so force to 0
+                    }
+
+                    #if 0 // floating point version for documentation
+                    else
+                    {
+                        FLOAT f = value.f * ((1<<bits) - 1);
+                        *pResult = static_cast<INT_32>(f + (round/256.0f));
+                    }
+                    #endif
+                    else
+                    {
+                        ADDR_FLT_32 scaled;
+                        ADDR_FLT_32 shifted;
+                        UINT_64 truncated, rounded;
+                        UINT_32 altShift;
+                        UINT_32 mask = (1 << bits) - 1;
+                        UINT_32 half = 1 << (bits - 1);
+                        UINT_32 mant24 = (value.i & 0x7FFFFF) + 0x800000;
+                        UINT_64 temp = mant24 - (mant24>>bits) -
+                            static_cast<INT_32>((mant24 & mask) > half);
+                        UINT_32 exp8 = value.i >> 23;
+                        UINT_32 shift = 126 - exp8 + 24 - bits;
+                        UINT_64 final;
+
+                        if (shift >= 32) // This is zero, even with maximum dither add
+                        {
+                            final = 0;
+                        }
+                        else
+                        {
+                            final = ((temp<<8) + (static_cast<UINT_64>(round)<<shift)) >> (shift+8);
+                        }
+                        //ADDR_EXIT( *pResult == final,
+                        //    ("Float %x converted to %d-bit Unorm %x != bitwise %x",
+                        //     value.u, bits, (UINT_32)*pResult, (UINT_32)final) );
+                        if (final > mask)
+                        {
+                            final = mask;
+                        }
+
+                        scaled.f  = value.f * ((1<<bits) - 1);
+                        shifted.f = (scaled.f * 256);
+                        truncated = ((shifted.i&0x7FFFFF) + (INT_64)0x800000) << 8;
+                        altShift  = 126 + 24 + 8 - ((shifted.i>>23)&0xFF);
+                        truncated = (altShift > 60) ? 0 : truncated >> altShift;
+                        rounded   = static_cast<INT_32>((round + truncated) >> 8);
+                        //if (rounded > ((1<<bits) - 1))
+                        //    rounded = ((1<<bits) - 1);
+                        *pResult = static_cast<INT_32>(rounded); //(INT_32)final;
+                    }
+                }
+            }
+
+            return;
+
+        case ADDR_S8FLOAT32:    // 32-bit IEEE float, passes through NaN values
+            *pResult = value.i;
+            return;
+
+        // @@ FIX ROUNDING in this code, fix the denorm case
+        case ADDR_U4FLOATC:         // Unsigned float, 4-bit exponent. bias 15, clamped [0..1]
+            sign = (value.i >> 31) & 1;
+            if ((value.i&0x7F800000) == 0x7F800000)    // If NaN or INF:
+            {
+                if ((value.i&0x007FFFFF) != 0)             // then if NaN
+                {
+                    *pResult = 0;                       // return 0
+                }
+                else
+                {
+                    *pResult = (sign)?0:0xF00000;           // else +INF->+1, -INF->0
+                }
+                return;
+            }
+            if (value.f <= 0)
+            {
+                *pResult = 0;
+            }
+            else
+            {
+                if (value.f>=1)
+                {
+                    *pResult = 0xF << (bits-4);
+                }
+                else
+                {
+                    if ((value.i>>23) > 112 )
+                    {
+                        // 24-bit float: normalized
+                        // value.i += 1 << (22-bits+4);
+                        // round the IEEE mantissa to mantissa size
+                        // @@ NOTE: add code to support rounding
+                        value.u &= 0x7FFFFFF;             // mask off high 4 exponent bits
+                        *pResult = value.i >> (23-bits+4);// shift off unused mantissa bits
+                    }
+                    else
+                    {
+                        // 24-bit float: denormalized
+                        value.f = value.f / (1<<28) / (1<<28);
+                        value.f = value.f / (1<<28) / (1<<28);    // convert to IEEE denorm
+                        // value.i += 1 << (22-bits+4);
+                        // round the IEEE mantissa to mantissa size
+                        // @@ NOTE: add code to support rounding
+                        *pResult = value.i >> (23-bits+4);    // shift off unused mantissa bits
+                    }
+                }
+            }
+
+            return;
+
+        default:                    // invalid number mode
+            //ADDR_EXIT(0, ("Invalid AddrNumber %d", numberType) );
+            break;
+
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::Int32sToPixel
+*
+*   @brief
+*       Pack 32-bit integer values into an uncompressed pixel,
+*       in the proper order
+*
+*   @return
+*       N/A
+*
+*   @note
+*       This entry point packes four 32-bit integer values into
+*       an uncompressed pixel. The pixel values are specifies in
+*       standard order, e.g. depth/stencil. This routine asserts
+*       if called on compressed pixel.
+***************************************************************************************************
+*/
+VOID AddrElemLib::Int32sToPixel(
+    UINT_32              numComps,      ///< [in] number of components
+    UINT_32*             pComps,        ///< [in] compnents
+    UINT_32*             pCompBits,     ///< [in] total bits in each component
+    UINT_32*             pCompStart,    ///< [in] the first bit position of each component
+    ADDR_COMPONENT_FLAGS properties,    ///< [in] properties about byteAligned, exportNorm
+    UINT_32              resultBits,    ///< [in] result bits: total bpp after decompression
+    UINT_8*              pPixel)        ///< [out] a depth/stencil pixel value
+{
+    UINT_32 i;
+    UINT_32 j;
+    UINT_32 start;
+    UINT_32 size;
+    UINT_32 byte;
+    UINT_32 value = 0;
+    UINT_32 compMask;
+    UINT_32 elemMask=0;
+    UINT_32 elementXor = 0;  // address xor when reading bytes from elements
+
+
+    // @@ NOTE: assert if called on a compressed format!
+
+    if (properties.byteAligned)    // Components are all byte-sized
+    {
+        for (i = 0; i < numComps; i++)        // Then for each component
+        {
+            // Copy the bytes of the component into the element
+            start = pCompStart[i] / 8;
+            size  = pCompBits[i]  / 8;
+            for (j = 0; j < size; j++)
+            {
+                pPixel[(j+start)^elementXor] = static_cast<UINT_8>(pComps[i] >> (8*j));
+            }
+        }
+    }
+    else                        // Element is 32-bits or less, components are bit fields
+    {
+        // First, extract each component in turn and combine it into a 32-bit value
+        for (i = 0; i < numComps; i++)
+        {
+            compMask = (1 << pCompBits[i]) - 1;
+            elemMask |= compMask << pCompStart[i];
+            value |= (pComps[i] & compMask) << pCompStart[i];
+        }
+
+        // Mext, copy the masked value into the element
+        size = (resultBits + 7) / 8;
+        for (i = 0; i < size; i++)
+        {
+            byte = pPixel[i^elementXor] & ~(elemMask >> (8*i));
+            pPixel[i^elementXor] = static_cast<UINT_8>(byte | ((elemMask & value) >> (8*i)));
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   Flt32ToDepthPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32ToDepthPixel(
+    AddrDepthFormat     format,     ///< [in] Depth format
+    const ADDR_FLT_32   comps[2],   ///< [in] two components of depth
+    UINT_8*             pPixel      ///< [out] depth pixel value
+    ) const
+{
+    UINT_32 i;
+    UINT_32 values[2];
+    ADDR_COMPONENT_FLAGS properties;    // byteAligned, exportNorm
+    UINT_32 resultBits = 0;             // result bits: total bits per pixel after decompression
+
+    ADDR_PIXEL_FORMATINFO fmt;
+
+    // get type for each component
+    PixGetDepthCompInfo(format, &fmt);
+
+    //initialize properties
+    properties.byteAligned = TRUE;
+    properties.exportNorm  = TRUE;
+    properties.floatComp   = FALSE;
+
+    //set properties and result bits
+    for (i = 0; i < 2; i++)
+    {
+        if ((fmt.compBit[i] & 7) || (fmt.compStart[i] & 7))
+        {
+            properties.byteAligned = FALSE;
+        }
+
+        if (resultBits < fmt.compStart[i] + fmt.compBit[i])
+        {
+            resultBits = fmt.compStart[i] + fmt.compBit[i];
+        }
+
+        // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+        if (fmt.compBit[i] > 11 || fmt.numType[i] >= ADDR_USCALED)
+        {
+            properties.exportNorm = FALSE;
+        }
+
+        // Mark if there are any floating point components
+        if ((fmt.numType[i] == ADDR_U4FLOATC) || (fmt.numType[i] >= ADDR_S8FLOAT) )
+        {
+            properties.floatComp = TRUE;
+        }
+    }
+
+    // Convert the two input floats to integer values
+    for (i = 0; i < 2; i++)
+    {
+        Flt32sToInt32s(comps[i], fmt.compBit[i], fmt.numType[i], &values[i]);
+    }
+
+    // Then pack the two integer components, in the proper order
+    Int32sToPixel(2, values, fmt.compBit, fmt.compStart, properties, resultBits, pPixel );
+
+}
+
+/**
+***************************************************************************************************
+*   Flt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32ToColorPixel(
+    AddrColorFormat     format,     ///< [in] Color format
+    AddrSurfaceNumber   surfNum,    ///< [in] Surface number
+    AddrSurfaceSwap     surfSwap,   ///< [in] Surface swap
+    const ADDR_FLT_32   comps[4],   ///< [in] four components of color
+    UINT_8*             pPixel      ///< [out] a red/green/blue/alpha pixel value
+    ) const
+{
+    ADDR_PIXEL_FORMATINFO pixelInfo;
+
+    UINT_32 i;
+    UINT_32 values[4];
+    ADDR_COMPONENT_FLAGS properties;    // byteAligned, exportNorm
+    UINT_32 resultBits = 0;             // result bits: total bits per pixel after decompression
+
+    memset(&pixelInfo, 0, sizeof(ADDR_PIXEL_FORMATINFO));
+
+    PixGetColorCompInfo(format, surfNum, surfSwap, &pixelInfo);
+
+    //initialize properties
+    properties.byteAligned = TRUE;
+    properties.exportNorm  = TRUE;
+    properties.floatComp   = FALSE;
+
+    //set properties and result bits
+    for (i = 0; i < 4; i++)
+    {
+        if ( (pixelInfo.compBit[i] & 7) || (pixelInfo.compStart[i] & 7) )
+        {
+            properties.byteAligned = FALSE;
+        }
+
+        if (resultBits < pixelInfo.compStart[i] + pixelInfo.compBit[i])
+        {
+            resultBits = pixelInfo.compStart[i] + pixelInfo.compBit[i];
+        }
+
+        if (m_fp16ExportNorm)
+        {
+            // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+            // or if it's not FP and <=16 bits
+            if (((pixelInfo.compBit[i] > 11) || (pixelInfo.numType[i] >= ADDR_USCALED))
+                && (pixelInfo.numType[i] !=ADDR_U4FLOATC))
+            {
+                properties.exportNorm = FALSE;
+            }
+        }
+        else
+        {
+            // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+            if (pixelInfo.compBit[i] > 11 || pixelInfo.numType[i] >= ADDR_USCALED)
+            {
+                properties.exportNorm = FALSE;
+            }
+        }
+
+        // Mark if there are any floating point components
+        if ( (pixelInfo.numType[i] == ADDR_U4FLOATC) ||
+             (pixelInfo.numType[i] >= ADDR_S8FLOAT) )
+        {
+            properties.floatComp = TRUE;
+        }
+    }
+
+    // Convert the four input floats to integer values
+    for (i = 0; i < 4; i++)
+    {
+        Flt32sToInt32s(comps[i], pixelInfo.compBit[i], pixelInfo.numType[i], &values[i]);
+    }
+
+    // Then pack the four integer components, in the proper order
+    Int32sToPixel(4, values, &pixelInfo.compBit[0], &pixelInfo.compStart[0],
+                  properties, resultBits, pPixel);
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompType
+*
+*   @brief
+*       Fill per component info
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompType(
+    AddrColorFormat         format,     ///< [in] surface format
+    AddrSurfaceNumber       numType,  ///< [in] number type
+    ADDR_PIXEL_FORMATINFO*  pInfo)       ///< [in][out] per component info out
+{
+    BOOL_32 handled = FALSE;
+
+    // Floating point formats override the number format
+    switch (format)
+    {
+        case ADDR_COLOR_16_FLOAT:            // fall through for all pure floating point format
+        case ADDR_COLOR_16_16_FLOAT:
+        case ADDR_COLOR_16_16_16_16_FLOAT:
+        case ADDR_COLOR_32_FLOAT:
+        case ADDR_COLOR_32_32_FLOAT:
+        case ADDR_COLOR_32_32_32_32_FLOAT:
+        case ADDR_COLOR_10_11_11_FLOAT:
+        case ADDR_COLOR_11_11_10_FLOAT:
+            numType = ADDR_NUMBER_FLOAT;
+            break;
+            // Special handling for the depth formats
+        case ADDR_COLOR_8_24:                // fall through for these 2 similar format
+        case ADDR_COLOR_24_8:
+            for (UINT_32 c = 0; c < 4; c++)
+            {
+                if (pInfo->compBit[c] == 8)
+                {
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                }
+                else if (pInfo->compBit[c]  == 24)
+                {
+                    pInfo->numType[c] = ADDR_UNORM_R6XX;
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                }
+            }
+            handled = TRUE;
+            break;
+        case ADDR_COLOR_8_24_FLOAT:          // fall through for these 3 similar format
+        case ADDR_COLOR_24_8_FLOAT:
+        case ADDR_COLOR_X24_8_32_FLOAT:
+            for (UINT_32 c = 0; c < 4; c++)
+            {
+                if (pInfo->compBit[c] == 8)
+                {
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                }
+                else if (pInfo->compBit[c] == 24)
+                {
+                    pInfo->numType[c] = ADDR_U4FLOATC;
+                }
+                else if (pInfo->compBit[c] == 32)
+                {
+                    pInfo->numType[c] = ADDR_S8FLOAT32;
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                }
+            }
+            handled = TRUE;
+            break;
+        default:
+            break;
+    }
+
+    if (!handled)
+    {
+        for (UINT_32 c = 0; c < 4; c++)
+        {
+            // Assign a number type for each component
+            AddrSurfaceNumber cnum;
+
+            // First handle default component values
+            if (pInfo->compBit[c] == 0)
+            {
+                if (c < 3)
+                {
+                    pInfo->numType[c] = ADDR_ZERO;      // Default is zero for RGB
+                }
+                else if (numType == ADDR_NUMBER_UINT || numType == ADDR_NUMBER_SINT)
+                {
+                    pInfo->numType[c] = ADDR_EPSILON;   // Alpha INT_32 bits default is 0x01
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_ONE;       // Alpha normal default is float 1.0
+                }
+                continue;
+            }
+            // Now handle small components
+            else if (pInfo->compBit[c] == 1)
+            {
+                if (numType == ADDR_NUMBER_UINT || numType == ADDR_NUMBER_SINT)
+                {
+                    cnum = ADDR_NUMBER_UINT;
+                }
+                else
+                {
+                    cnum = ADDR_NUMBER_UNORM;
+                }
+            }
+            else
+            {
+                cnum = numType;
+            }
+
+            // If no default, set the number type fom num, compbits, and architecture
+            switch (cnum)
+            {
+                case ADDR_NUMBER_SRGB:
+                    pInfo->numType[c] = (c < 3) ? ADDR_GAMMA8_R6XX : ADDR_UNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_UNORM:
+                    pInfo->numType[c] = ADDR_UNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_SNORM:
+                    pInfo->numType[c] = ADDR_SNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_USCALED:
+                    pInfo->numType[c] = ADDR_USCALED;  // @@ Do we need separate Pele routine?
+                    break;
+                case ADDR_NUMBER_SSCALED:
+                    pInfo->numType[c] = ADDR_SSCALED;  // @@ Do we need separate Pele routine?
+                    break;
+                case ADDR_NUMBER_FLOAT:
+                    if (pInfo->compBit[c] == 32)
+                    {
+                        pInfo->numType[c] = ADDR_S8FLOAT32;
+                    }
+                    else if (pInfo->compBit[c] == 16)
+                    {
+                        pInfo->numType[c] = ADDR_S5FLOAT;
+                    }
+                    else if (pInfo->compBit[c] >= 10)
+                    {
+                        pInfo->numType[c] = ADDR_U5FLOAT;
+                    }
+                    else
+                    {
+                        ADDR_ASSERT_ALWAYS();
+                    }
+                    break;
+                case ADDR_NUMBER_SINT:
+                    pInfo->numType[c] = ADDR_SINT_BITS;
+                    break;
+                case ADDR_NUMBER_UINT:
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                    break;
+
+                default:
+                    ADDR_ASSERT(!"Invalid number type");
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                    break;
+             }
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompSwap
+*
+*   @brief
+*       Get components swapped for color surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompSwap(
+    AddrSurfaceSwap         swap,   ///< [in] swap mode
+    ADDR_PIXEL_FORMATINFO*  pInfo)  ///< [in/out] output per component info
+{
+    switch (pInfo->comps)
+    {
+        case 4:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // BGRA
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 1, 2, pInfo );
+                    break;    // ABGR
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 0, 2, pInfo );
+                    SwapComps( 0, 1, pInfo );
+                    break;    // ARGB
+                default:
+                    break;
+            }
+            break;
+        case 3:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 0, 2, pInfo );
+                    break;    // AGR
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // BGR
+                case ADDR_SWAP_ALT:
+                    SwapComps( 2, 3, pInfo );
+                    break;    // RGA
+                default:
+                    break;    // RGB
+            }
+            break;
+        case 2:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 1, pInfo );
+                    SwapComps( 1, 3, pInfo );
+                    break;    // AR
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 1, pInfo );
+                    break;    // GR
+                case ADDR_SWAP_ALT:
+                    SwapComps( 1, 3, pInfo );
+                    break;    // RA
+                default:
+                    break;    // RG
+            }
+            break;
+        case 1:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    break;    // A
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // B
+                case ADDR_SWAP_ALT:
+                    SwapComps( 0, 1, pInfo );
+                    break;    // G
+                default:
+                    break;    // R
+            }
+            break;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompSwap
+*
+*   @brief
+*       Get components swapped for color surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::SwapComps(
+    UINT_32                 c0,     ///< [in] component index 0
+    UINT_32                 c1,     ///< [in] component index 1
+    ADDR_PIXEL_FORMATINFO*  pInfo)  ///< [in/out] output per component info
+{
+    UINT_32 start;
+    UINT_32 bits;
+
+    start = pInfo->compStart[c0];
+    pInfo->compStart[c0] = pInfo->compStart[c1];
+    pInfo->compStart[c1] = start;
+
+    bits  = pInfo->compBit[c0];
+    pInfo->compBit[c0] = pInfo->compBit[c1];
+    pInfo->compBit[c1] = bits;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetColorCompInfo
+*
+*   @brief
+*       Get per component info for color surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::PixGetColorCompInfo(
+    AddrColorFormat         format, ///< [in] surface format, read from register
+    AddrSurfaceNumber       number, ///< [in] pixel number type
+    AddrSurfaceSwap         swap,   ///< [in] component swap mode
+    ADDR_PIXEL_FORMATINFO*  pInfo   ///< [out] output per component info
+    ) const
+{
+    // 1. Get componet bits
+    switch (format)
+    {
+        case ADDR_COLOR_8:
+            GetCompBits(8, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_1_5_5_5:
+            GetCompBits(5, 5, 5, 1, pInfo);
+            break;
+        case ADDR_COLOR_5_6_5:
+            GetCompBits(8, 6, 5, 0, pInfo);
+            break;
+        case ADDR_COLOR_6_5_5:
+            GetCompBits(5, 5, 6, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_8:
+            GetCompBits(8, 8, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_4_4_4_4:
+            GetCompBits(4, 4, 4, 4, pInfo);
+            break;
+        case ADDR_COLOR_16:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_8_8_8:
+            GetCompBits(8, 8, 8, 8, pInfo);
+            break;
+        case ADDR_COLOR_2_10_10_10:
+            GetCompBits(10, 10, 10, 2, pInfo);
+            break;
+        case ADDR_COLOR_10_11_11:
+            GetCompBits(11, 11, 10, 0, pInfo);
+            break;
+        case ADDR_COLOR_11_11_10:
+            GetCompBits(10, 11, 11, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16:
+            GetCompBits(16, 16, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_16_16:
+            GetCompBits(16, 16, 16, 16, pInfo);
+            break;
+        case ADDR_COLOR_16_FLOAT:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_FLOAT:
+            GetCompBits(16, 16, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_FLOAT:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32_FLOAT:
+            GetCompBits(32, 32, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_16_16_FLOAT:
+            GetCompBits(16, 16, 16, 16, pInfo);
+            break;
+        case ADDR_COLOR_32_32_32_32_FLOAT:
+            GetCompBits(32, 32, 32, 32, pInfo);
+            break;
+
+        case ADDR_COLOR_32:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32:
+            GetCompBits(32, 32, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32_32_32:
+            GetCompBits(32, 32, 32, 32, pInfo);
+            break;
+        case ADDR_COLOR_10_10_10_2:
+            GetCompBits(2, 10, 10, 10, pInfo);
+            break;
+        case ADDR_COLOR_10_11_11_FLOAT:
+            GetCompBits(11, 11, 10, 0, pInfo);
+            break;
+        case ADDR_COLOR_11_11_10_FLOAT:
+            GetCompBits(10, 11, 11, 0, pInfo);
+            break;
+        case ADDR_COLOR_5_5_5_1:
+            GetCompBits(1, 5, 5, 5, pInfo);
+            break;
+        case ADDR_COLOR_3_3_2:
+            GetCompBits(2, 3, 3, 0, pInfo);
+            break;
+        case ADDR_COLOR_4_4:
+            GetCompBits(4, 4, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_24:
+        case ADDR_COLOR_8_24_FLOAT:  // same bit count, fall through
+            GetCompBits(24, 8, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_24_8:
+        case ADDR_COLOR_24_8_FLOAT:  // same bit count, fall through
+            GetCompBits(8, 24, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_X24_8_32_FLOAT:
+            GetCompBits(32, 8, 0, 0, pInfo);
+            break;
+
+        case ADDR_COLOR_INVALID:
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+        default:
+            ADDR_ASSERT(0);
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+    }
+
+    // 2. Get component number type
+
+    GetCompType(format, number, pInfo);
+
+    // 3. Swap components if needed
+
+    GetCompSwap(swap, pInfo);
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetDepthCompInfo
+*
+*   @brief
+*       Get per component info for depth surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::PixGetDepthCompInfo(
+    AddrDepthFormat         format,     ///< [in] surface format, read from register
+    ADDR_PIXEL_FORMATINFO*  pInfo       ///< [out] output per component bits and type
+    ) const
+{
+    if (m_depthPlanarType == ADDR_DEPTH_PLANAR_R800)
+    {
+        if (format == ADDR_DEPTH_8_24_FLOAT)
+        {
+            format = ADDR_DEPTH_X24_8_32_FLOAT; // Use this format to represent R800's D24FS8
+        }
+
+        if (format == ADDR_DEPTH_X8_24_FLOAT)
+        {
+            format = ADDR_DEPTH_32_FLOAT;
+        }
+    }
+
+    switch (format)
+    {
+        case ADDR_DEPTH_16:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_8_24:
+        case ADDR_DEPTH_8_24_FLOAT:      // similar format, fall through
+            GetCompBits(24, 8, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_X8_24:
+        case ADDR_DEPTH_X8_24_FLOAT:     // similar format, fall through
+            GetCompBits(24, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_32_FLOAT:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_X24_8_32_FLOAT:
+            GetCompBits(32, 8, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_INVALID:
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+        default:
+            ADDR_ASSERT(0);
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+    }
+
+    switch (format)
+    {
+        case ADDR_DEPTH_16:
+            pInfo->numType [0] = ADDR_UNORM_R6XX;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_8_24:
+            pInfo->numType [0] = ADDR_UNORM_R6XXDB;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        case ADDR_DEPTH_8_24_FLOAT:
+            pInfo->numType [0] = ADDR_U4FLOATC;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        case ADDR_DEPTH_X8_24:
+            pInfo->numType [0] = ADDR_UNORM_R6XXDB;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_X8_24_FLOAT:
+            pInfo->numType [0] = ADDR_U4FLOATC;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_32_FLOAT:
+            pInfo->numType [0] = ADDR_S8FLOAT32;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_X24_8_32_FLOAT:
+            pInfo->numType [0] = ADDR_S8FLOAT32;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        default:
+            pInfo->numType [0] = ADDR_NO_NUMBER;
+            pInfo->numType [1] = ADDR_NO_NUMBER;
+            break;
+    }
+
+    pInfo->numType [2] = ADDR_NO_NUMBER;
+    pInfo->numType [3] = ADDR_NO_NUMBER;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetExportNorm
+*
+*   @brief
+*       Check if fp16 export norm can be enabled.
+*
+*   @return
+*       TRUE if this can be enabled.
+*
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::PixGetExportNorm(
+    AddrColorFormat     colorFmt,       ///< [in] surface format, read from register
+    AddrSurfaceNumber   numberFmt,      ///< [in] pixel number type
+    AddrSurfaceSwap     swap            ///< [in] components swap type
+    ) const
+{
+    BOOL_32 enabled = TRUE;
+
+    ADDR_PIXEL_FORMATINFO formatInfo;
+
+    PixGetColorCompInfo(colorFmt, numberFmt, swap, &formatInfo);
+
+    for (UINT_32 c = 0; c < 4; c++)
+    {
+        if (m_fp16ExportNorm)
+        {
+            if (((formatInfo.compBit[c] > 11) || (formatInfo.numType[c] > ADDR_USCALED)) &&
+                (formatInfo.numType[c] != ADDR_U4FLOATC)    &&
+                (formatInfo.numType[c] != ADDR_S5FLOAT)     &&
+                (formatInfo.numType[c] != ADDR_S5FLOATM)    &&
+                (formatInfo.numType[c] != ADDR_U5FLOAT)     &&
+                (formatInfo.numType[c] != ADDR_U3FLOATM))
+            {
+                enabled = FALSE;
+                break;
+            }
+        }
+        else
+        {
+            if ((formatInfo.compBit[c] > 11) || (formatInfo.numType[c] > ADDR_USCALED))
+            {
+                enabled = FALSE;
+                break;
+            }
+        }
+    }
+
+    return enabled;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::AdjustSurfaceInfo
+*
+*   @brief
+*       Adjust bpp/base pitch/width/height according to elemMode and expandX/Y
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::AdjustSurfaceInfo(
+    AddrElemMode    elemMode,       ///< [in] element mode
+    UINT_32         expandX,        ///< [in] decompression expansion factor in X
+    UINT_32         expandY,        ///< [in] decompression expansion factor in Y
+    UINT_32*        pBpp,           ///< [in/out] bpp
+    UINT_32*        pBasePitch,     ///< [in/out] base pitch
+    UINT_32*        pWidth,         ///< [in/out] width
+    UINT_32*        pHeight)        ///< [in/out] height
+{
+    UINT_32 packedBits;
+    UINT_32 basePitch;
+    UINT_32 width;
+    UINT_32 height;
+    UINT_32 bpp;
+    BOOL_32 bBCnFormat = FALSE;
+
+    ADDR_ASSERT(pBpp != NULL);
+    ADDR_ASSERT(pWidth != NULL && pHeight != NULL && pBasePitch != NULL);
+
+    if (pBpp)
+    {
+        bpp = *pBpp;
+
+        switch (elemMode)
+        {
+            case ADDR_EXPANDED:
+                packedBits = bpp / expandX / expandY;
+                break;
+            case ADDR_PACKED_STD: // Different bit order
+            case ADDR_PACKED_REV:
+                packedBits = bpp * expandX * expandY;
+                break;
+            case ADDR_PACKED_GBGR:
+            case ADDR_PACKED_BGRG:
+                packedBits = bpp; // 32-bit packed ==> 2 32-bit result
+                break;
+            case ADDR_PACKED_BC1: // Fall through
+            case ADDR_PACKED_BC4:
+                packedBits = 64;
+                bBCnFormat = TRUE;
+                break;
+            case ADDR_PACKED_BC2: // Fall through
+            case ADDR_PACKED_BC3: // Fall through
+            case ADDR_PACKED_BC5: // Fall through
+                bBCnFormat = TRUE;
+                packedBits = 128;
+                break;
+            case ADDR_ROUND_BY_HALF:  // Fall through
+            case ADDR_ROUND_TRUNCATE: // Fall through
+            case ADDR_ROUND_DITHER:   // Fall through
+            case ADDR_UNCOMPRESSED:
+                packedBits = bpp;
+                break;
+            default:
+                packedBits = bpp;
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        *pBpp = packedBits;
+    }
+
+    if (pWidth && pHeight && pBasePitch)
+    {
+        basePitch = *pBasePitch;
+        width     = *pWidth;
+        height    = *pHeight;
+
+        if ((expandX > 1) || (expandY > 1))
+        {
+            if (elemMode == ADDR_EXPANDED)
+            {
+                basePitch *= expandX;
+                width     *= expandX;
+                height    *= expandY;
+            }
+            else
+            {
+                // Evergreen family workaround
+                if (bBCnFormat && (m_pAddrLib->GetAddrChipFamily() == ADDR_CHIP_FAMILY_R8XX))
+                {
+                    // For BCn we now pad it to POW2 at the beginning so it is safe to
+                    // divide by 4 directly
+                    basePitch = basePitch / expandX;
+                    width     = width  / expandX;
+                    height    = height / expandY;
+#if DEBUG
+                    width     = (width == 0) ? 1 : width;
+                    height    = (height == 0) ? 1 : height;
+
+                    if ((*pWidth > PowTwoAlign(width, 8) * expandX) ||
+                        (*pHeight > PowTwoAlign(height, 8) * expandY)) // 8 is 1D tiling alignment
+                    {
+                        // if this assertion is hit we may have issues if app samples
+                        // rightmost/bottommost pixels
+                        ADDR_ASSERT_ALWAYS();
+                    }
+#endif
+                }
+                else // Not BCn format we still keep old way (FMT_1? No real test yet)
+                {
+                    basePitch = (basePitch + expandX - 1) / expandX;
+                    width     = (width + expandX - 1) / expandX;
+                    height    = (height + expandY - 1) / expandY;
+                }
+            }
+
+            *pBasePitch = basePitch; // 0 is legal value for base pitch.
+            *pWidth     = (width == 0) ? 1 : width;
+            *pHeight    = (height == 0) ? 1 : height;
+        } //if (pWidth && pHeight && pBasePitch)
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::RestoreSurfaceInfo
+*
+*   @brief
+*       Reverse operation of AdjustSurfaceInfo
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::RestoreSurfaceInfo(
+    AddrElemMode    elemMode,       ///< [in] element mode
+    UINT_32         expandX,        ///< [in] decompression expansion factor in X
+    UINT_32         expandY,        ///< [out] decompression expansion factor in Y
+    UINT_32*        pBpp,           ///< [in/out] bpp
+    UINT_32*        pWidth,         ///< [in/out] width
+    UINT_32*        pHeight)        ///< [in/out] height
+{
+    UINT_32 originalBits;
+    UINT_32 width;
+    UINT_32 height;
+    UINT_32 bpp;
+
+    ADDR_ASSERT(pBpp != NULL);
+    ADDR_ASSERT(pWidth != NULL && pHeight != NULL);
+
+    if (pBpp)
+    {
+        bpp = *pBpp;
+
+        switch (elemMode)
+        {
+        case ADDR_EXPANDED:
+            originalBits = bpp * expandX * expandY;
+            break;
+        case ADDR_PACKED_STD: // Different bit order
+        case ADDR_PACKED_REV:
+            originalBits = bpp / expandX / expandY;
+            break;
+        case ADDR_PACKED_GBGR:
+        case ADDR_PACKED_BGRG:
+            originalBits = bpp; // 32-bit packed ==> 2 32-bit result
+            break;
+        case ADDR_PACKED_BC1: // Fall through
+        case ADDR_PACKED_BC4:
+            originalBits = 64;
+            break;
+        case ADDR_PACKED_BC2: // Fall through
+        case ADDR_PACKED_BC3: // Fall through
+            case ADDR_PACKED_BC5:
+            originalBits = 128;
+            break;
+        case ADDR_ROUND_BY_HALF:  // Fall through
+        case ADDR_ROUND_TRUNCATE: // Fall through
+        case ADDR_ROUND_DITHER:   // Fall through
+        case ADDR_UNCOMPRESSED:
+            originalBits = bpp;
+            break;
+        default:
+            originalBits = bpp;
+            ADDR_ASSERT_ALWAYS();
+            break;
+        }
+
+        *pBpp = originalBits;
+    }
+
+    if (pWidth && pHeight)
+    {
+        width    = *pWidth;
+        height   = *pHeight;
+
+        if ((expandX > 1) || (expandY > 1))
+        {
+            if (elemMode == ADDR_EXPANDED)
+            {
+                width /= expandX;
+                height /= expandY;
+            }
+            else
+            {
+                width *= expandX;
+                height *= expandY;
+            }
+        }
+
+        *pWidth  = (width == 0) ? 1 : width;
+        *pHeight = (height == 0) ? 1 : height;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetBitsPerPixel
+*
+*   @brief
+*       Compute the total bits per element according to a format
+*       code. For compressed formats, this is not the same as
+*       the number of bits per decompressed element.
+*
+*   @return
+*       Bits per pixel
+***************************************************************************************************
+*/
+UINT_32 AddrElemLib::GetBitsPerPixel(
+    AddrFormat          format,         ///< [in] surface format code
+    AddrElemMode*       pElemMode,      ///< [out] element mode
+    UINT_32*            pExpandX,       ///< [out] decompression expansion factor in X
+    UINT_32*            pExpandY,       ///< [out] decompression expansion factor in Y
+    UINT_32*            pUnusedBits)    ///< [out] bits unused
+{
+    UINT_32 bpp;
+    UINT_32 expandX = 1;
+    UINT_32 expandY = 1;
+    UINT_32 bitUnused = 0;
+    AddrElemMode elemMode = ADDR_UNCOMPRESSED; // default value
+
+    switch (format)
+    {
+        case ADDR_FMT_8:
+            bpp = 8;
+            break;
+        case ADDR_FMT_1_5_5_5:
+        case ADDR_FMT_5_6_5:
+        case ADDR_FMT_6_5_5:
+        case ADDR_FMT_8_8:
+        case ADDR_FMT_4_4_4_4:
+        case ADDR_FMT_16:
+        case ADDR_FMT_16_FLOAT:
+            bpp = 16;
+            break;
+        case ADDR_FMT_GB_GR: // treat as FMT_8_8
+            elemMode = ADDR_PACKED_GBGR;
+            bpp = 16;
+            break;
+        case ADDR_FMT_BG_RG: // treat as FMT_8_8
+            elemMode = ADDR_PACKED_BGRG;
+            bpp = 16;
+            break;
+        case ADDR_FMT_8_8_8_8:
+        case ADDR_FMT_2_10_10_10:
+        case ADDR_FMT_10_11_11:
+        case ADDR_FMT_11_11_10:
+        case ADDR_FMT_16_16:
+        case ADDR_FMT_16_16_FLOAT:
+        case ADDR_FMT_32:
+        case ADDR_FMT_32_FLOAT:
+        case ADDR_FMT_24_8:
+        case ADDR_FMT_24_8_FLOAT:
+            bpp = 32;
+            break;
+        case ADDR_FMT_16_16_16_16:
+        case ADDR_FMT_16_16_16_16_FLOAT:
+        case ADDR_FMT_32_32:
+        case ADDR_FMT_32_32_FLOAT:
+        case ADDR_FMT_CTX1:
+            bpp = 64;
+            break;
+        case ADDR_FMT_32_32_32_32:
+        case ADDR_FMT_32_32_32_32_FLOAT:
+            bpp = 128;
+            break;
+        case ADDR_FMT_INVALID:
+            bpp = 0;
+            break;
+        case ADDR_FMT_1_REVERSED:
+            elemMode = ADDR_PACKED_REV;
+            expandX = 8;
+            bpp = 1;
+            break;
+        case ADDR_FMT_1:
+            elemMode = ADDR_PACKED_STD;
+            expandX = 8;
+            bpp = 1;
+            break;
+        case ADDR_FMT_4_4:
+        case ADDR_FMT_3_3_2:
+            bpp = 8;
+            break;
+        case ADDR_FMT_5_5_5_1:
+            bpp = 16;
+            break;
+        case ADDR_FMT_32_AS_8:
+        case ADDR_FMT_32_AS_8_8:
+        case ADDR_FMT_8_24:
+        case ADDR_FMT_8_24_FLOAT:
+        case ADDR_FMT_10_10_10_2:
+        case ADDR_FMT_10_11_11_FLOAT:
+        case ADDR_FMT_11_11_10_FLOAT:
+        case ADDR_FMT_5_9_9_9_SHAREDEXP:
+            bpp = 32;
+            break;
+        case ADDR_FMT_X24_8_32_FLOAT:
+            bpp = 64;
+            bitUnused = 24;
+            break;
+        case ADDR_FMT_8_8_8:
+            elemMode = ADDR_EXPANDED;
+            bpp = 24;//@@ 8;      // read 3 elements per pixel
+            expandX = 3;
+            break;
+        case ADDR_FMT_16_16_16:
+        case ADDR_FMT_16_16_16_FLOAT:
+            elemMode = ADDR_EXPANDED;
+            bpp = 48;//@@ 16;      // read 3 elements per pixel
+            expandX = 3;
+            break;
+        case ADDR_FMT_32_32_32_FLOAT:
+        case ADDR_FMT_32_32_32:
+            elemMode = ADDR_EXPANDED;
+            expandX = 3;
+            bpp = 96;//@@ 32;      // read 3 elements per pixel
+            break;
+        case ADDR_FMT_BC1:
+            elemMode = ADDR_PACKED_BC1;
+            expandX = 4;
+            expandY = 4;
+            bpp = 64;
+            break;
+        case ADDR_FMT_BC4:
+            elemMode = ADDR_PACKED_BC4;
+            expandX = 4;
+            expandY = 4;
+            bpp = 64;
+            break;
+        case ADDR_FMT_BC2:
+            elemMode = ADDR_PACKED_BC2;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        case ADDR_FMT_BC3:
+            elemMode = ADDR_PACKED_BC3;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        case ADDR_FMT_BC5:
+        case ADDR_FMT_BC6: // reuse ADDR_PACKED_BC5
+        case ADDR_FMT_BC7: // reuse ADDR_PACKED_BC5
+            elemMode = ADDR_PACKED_BC5;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        default:
+            bpp = 0;
+            ADDR_ASSERT_ALWAYS();
+            break;
+            // @@ or should this be an error?
+    }
+
+    SafeAssign(pExpandX, expandX);
+    SafeAssign(pExpandY, expandY);
+    SafeAssign(pUnusedBits, bitUnused);
+    SafeAssign(reinterpret_cast<UINT_32*>(pElemMode), elemMode);
+
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompBits
+*
+*   @brief
+*       Set each component's bit size and bit start. And set element mode and number type
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompBits(
+    UINT_32 c0,                     ///< [in] bits of component 0
+    UINT_32 c1,                     ///< [in] bits of component 1
+    UINT_32 c2,                     ///< [in] bits of component 2
+    UINT_32 c3,                     ///< [in] bits of component 3
+    ADDR_PIXEL_FORMATINFO* pInfo,   ///< [out] per component info out
+    AddrElemMode elemMode)          ///< [in] element mode
+{
+    pInfo->comps = 0;
+
+    pInfo->compBit[0] = c0;
+    pInfo->compBit[1] = c1;
+    pInfo->compBit[2] = c2;
+    pInfo->compBit[3] = c3;
+
+    pInfo->compStart[0] = 0;
+    pInfo->compStart[1] = c0;
+    pInfo->compStart[2] = c0+c1;
+    pInfo->compStart[3] = c0+c1+c2;
+
+    pInfo->elemMode = elemMode;
+    // still needed since component swap may depend on number of components
+    for (INT i=0; i<4; i++)
+    {
+        if (pInfo->compBit[i] == 0)
+        {
+            pInfo->compStart[i]  = 0;       // all null components start at bit 0
+            pInfo->numType[i] = ADDR_NO_NUMBER; // and have no number type
+        }
+        else
+        {
+            pInfo->comps++;
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompBits
+*
+*   @brief
+*       Set the clear color (or clear depth/stencil) for a surface
+*
+*   @note
+*       If clearColor is zero, a default clear value is used in place of comps[4].
+*       If float32 is set, full precision is used, else the mantissa is reduced to 12-bits
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::SetClearComps(
+    ADDR_FLT_32 comps[4],   ///< [in/out] components
+    BOOL_32 clearColor,     ///< [in] TRUE if clear color is set (CLEAR_COLOR)
+    BOOL_32 float32)        ///< [in] TRUE if float32 component (BLEND_FLOAT32)
+{
+    INT_32 i;
+
+    // Use default clearvalues if clearColor is disabled
+    if (clearColor == FALSE)
+    {
+        for (i=0; i<3; i++)
+        {
+            comps[i].f = 0.0;
+        }
+        comps[3].f = 1.0;
+    }
+
+    // Otherwise use the (modified) clear value
+    else
+    {
+        for (i=0; i<4; i++)
+        {   // If full precision, use clear value unchanged
+            if (float32)
+            {
+                // Do nothing
+                //comps[i] = comps[i];
+            }
+            // Else if it is a NaN, use the standard NaN value
+            else if ((comps[i].u & 0x7FFFFFFF) > 0x7F800000)
+            {
+                comps[i].u = 0xFFC00000;
+            }
+            // Else reduce the mantissa precision
+            else
+            {
+                comps[i].u = comps[i].u & 0xFFFFF000;
+            }
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsBlockCompressed
+*
+*   @brief
+*       TRUE if this is block compressed format
+*
+*   @note
+*
+*   @return
+*       BOOL_32
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsBlockCompressed(
+    AddrFormat format)  ///< [in] Format
+{
+    return format >= ADDR_FMT_BC1 && format <= ADDR_FMT_BC7;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsCompressed
+*
+*   @brief
+*       TRUE if this is block compressed format or 1 bit format
+*
+*   @note
+*
+*   @return
+*       BOOL_32
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsCompressed(
+    AddrFormat format)  ///< [in] Format
+{
+    return IsBlockCompressed(format) || format == ADDR_FMT_BC1 || format == ADDR_FMT_BC7;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsExpand3x
+*
+*   @brief
+*       TRUE if this is 3x expand format
+*
+*   @note
+*
+*   @return
+*       BOOL_32
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsExpand3x(
+    AddrFormat format)  ///< [in] Format
+{
+    BOOL_32 is3x = FALSE;
+
+    switch (format)
+    {
+        case ADDR_FMT_8_8_8:
+        case ADDR_FMT_16_16_16:
+        case ADDR_FMT_16_16_16_FLOAT:
+        case ADDR_FMT_32_32_32:
+        case ADDR_FMT_32_32_32_FLOAT:
+            is3x = TRUE;
+            break;
+        default:
+            break;
+    }
+
+    return is3x;
+}
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h
new file mode 100644
index 00000000000..c302b3b1788
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrelemlib.h
+* @brief Contains the class for element/pixel related functions
+***************************************************************************************************
+*/
+
+#ifndef __ELEM_LIB_H__
+#define __ELEM_LIB_H__
+
+#include "addrinterface.h"
+#include "addrobject.h"
+#include "addrcommon.h"
+
+class AddrLib;
+
+// The masks for property bits within the Properties INT_32
+union ADDR_COMPONENT_FLAGS
+{
+    struct
+    {
+        UINT_32 byteAligned    : 1;    ///< all components are byte aligned
+        UINT_32 exportNorm     : 1;    ///< components support R6xx NORM compression
+        UINT_32 floatComp      : 1;    ///< there is at least one floating point component
+    };
+
+    UINT_32 value;
+};
+
+// Copy from legacy lib's AddrNumberType
+enum AddrNumberType
+{
+    // The following number types have the range [-1..1]
+    ADDR_NO_NUMBER,         // This component doesn't exist and has no default value
+    ADDR_EPSILON,           // Force component value to integer 0x00000001
+    ADDR_ZERO,              // Force component value to integer 0x00000000
+    ADDR_ONE,               // Force component value to floating point 1.0
+    // Above values don't have any bits per component (keep ADDR_ONE the last of these)
+
+    ADDR_UNORM,             // Unsigned normalized (repeating fraction) full precision
+    ADDR_SNORM,             // Signed normalized (repeating fraction) full precision
+    ADDR_GAMMA,             // Gamma-corrected, full precision
+
+    ADDR_UNORM_R5XXRB,      // Unsigned normalized (repeating fraction) for r5xx RB
+    ADDR_SNORM_R5XXRB,      // Signed normalized (repeating fraction) for r5xx RB
+    ADDR_GAMMA_R5XXRB,      // Gamma-corrected for r5xx RB (note: unnormalized value)
+    ADDR_UNORM_R5XXBC,      // Unsigned normalized (repeating fraction) for r5xx BC
+    ADDR_SNORM_R5XXBC,      // Signed normalized (repeating fraction) for r5xx BC
+    ADDR_GAMMA_R5XXBC,      // Gamma-corrected for r5xx BC (note: unnormalized value)
+
+    ADDR_UNORM_R6XX,        // Unsigned normalized (repeating fraction) for R6xx
+    ADDR_UNORM_R6XXDB,      // Unorms for 24-bit depth: one value differs from ADDR_UNORM_R6XX
+    ADDR_SNORM_R6XX,        // Signed normalized (repeating fraction) for R6xx
+    ADDR_GAMMA8_R6XX,       // Gamma-corrected for r6xx
+    ADDR_GAMMA8_R7XX_TP,    // Gamma-corrected for r7xx TP 12bit unorm 8.4.
+
+    ADDR_U4FLOATC,          // Unsigned float: 4-bit exponent, bias=15, no NaN, clamp [0..1]
+    ADDR_GAMMA_4SEG,        // Gamma-corrected, four segment approximation
+    ADDR_U0FIXED,           // Unsigned 0.N-bit fixed point
+
+    // The following number types have large ranges (LEAVE ADDR_USCALED first or fix Finish routine)
+    ADDR_USCALED,           // Unsigned integer converted to/from floating point
+    ADDR_SSCALED,           // Signed integer converted to/from floating point
+    ADDR_USCALED_R5XXRB,    // Unsigned integer to/from floating point for r5xx RB
+    ADDR_SSCALED_R5XXRB,    // Signed integer to/from floating point for r5xx RB
+    ADDR_UINT_BITS,         // Keep in unsigned integer form, clamped to specified range
+    ADDR_SINT_BITS,         // Keep in signed integer form, clamped to specified range
+    ADDR_UINTBITS,          // @@ remove Keep in unsigned integer form, use modulus to reduce bits
+    ADDR_SINTBITS,          // @@ remove Keep in signed integer form, use modulus to reduce bits
+
+    // The following number types and ADDR_U4FLOATC have exponents
+    // (LEAVE ADDR_S8FLOAT first or fix Finish routine)
+    ADDR_S8FLOAT,           // Signed floating point with 8-bit exponent, bias=127
+    ADDR_S8FLOAT32,         // 32-bit IEEE float, passes through NaN values
+    ADDR_S5FLOAT,           // Signed floating point with 5-bit exponent, bias=15
+    ADDR_S5FLOATM,          // Signed floating point with 5-bit exponent, bias=15, no NaN/Inf
+    ADDR_U5FLOAT,           // Signed floating point with 5-bit exponent, bias=15
+    ADDR_U3FLOATM,          // Unsigned floating point with 3-bit exponent, bias=3
+
+    ADDR_S5FIXED,           // Signed 5.N-bit fixed point, with rounding
+
+    ADDR_END_NUMBER         // Used for range comparisons
+};
+
+// Copy from legacy lib's AddrElement
+enum AddrElemMode
+{
+    // These formats allow both packing an unpacking
+    ADDR_ROUND_BY_HALF,     // add 1/2 and truncate when packing this element
+    ADDR_ROUND_TRUNCATE,    // truncate toward 0 for sign/mag, else toward neg
+    ADDR_ROUND_DITHER,      // Pack by dithering -- requires (x,y) position
+
+    // These formats only allow unpacking, no packing
+    ADDR_UNCOMPRESSED,      // Elements are not compressed: one data element per pixel/texel
+    ADDR_EXPANDED,          // Elements are split up and stored in multiple data elements
+    ADDR_PACKED_STD,        // Elements are compressed into ExpandX by ExpandY data elements
+    ADDR_PACKED_REV,        // Like ADDR_PACKED, but X order of pixels is reverved
+    ADDR_PACKED_GBGR,       // Elements are compressed 4:2:2 in G1B_G0R order (high to low)
+    ADDR_PACKED_BGRG,       // Elements are compressed 4:2:2 in BG1_RG0 order (high to low)
+    ADDR_PACKED_BC1,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC2,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC3,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC4,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC5,        // Each data element is uncompressed to a 4x4 pixel/texel array
+
+    // These formats provide various kinds of compression
+    ADDR_ZPLANE_R5XX,       // Compressed Zplane using r5xx architecture format
+    ADDR_ZPLANE_R6XX,       // Compressed Zplane using r6xx architecture format
+    //@@ Fill in the compression modes
+
+    ADDR_END_ELEMENT        // Used for range comparisons
+};
+
+enum AddrDepthPlanarType
+{
+    ADDR_DEPTH_PLANAR_NONE = 0, // No plane z/stencl
+    ADDR_DEPTH_PLANAR_R600 = 1, // R600 z and stencil planes are store within a tile
+    ADDR_DEPTH_PLANAR_R800 = 2, // R800 has separate z and stencil planes
+};
+
+/**
+***************************************************************************************************
+*   ADDR_PIXEL_FORMATINFO
+*
+*   @brief
+*       Per component info
+*
+***************************************************************************************************
+*/
+struct ADDR_PIXEL_FORMATINFO
+{
+    UINT_32             compBit[4];
+    AddrNumberType      numType[4];
+    UINT_32             compStart[4];
+    AddrElemMode        elemMode;
+    UINT_32             comps;          ///< Number of components
+};
+
+/**
+***************************************************************************************************
+* @brief This class contains asic indepentent element related attributes and operations
+***************************************************************************************************
+*/
+class AddrElemLib : public AddrObject
+{
+protected:
+    AddrElemLib(AddrLib* const pAddrLib);
+
+public:
+
+    /// Makes this class virtual
+    virtual ~AddrElemLib();
+
+    static AddrElemLib *Create(
+        const AddrLib* const pAddrLib);
+
+    /// The implementation is only for R6xx/R7xx, so make it virtual in case we need for R8xx
+    BOOL_32 PixGetExportNorm(
+        AddrColorFormat colorFmt,
+        AddrSurfaceNumber numberFmt, AddrSurfaceSwap swap) const;
+
+    /// Below method are asic independent, so make them just static.
+    /// Remove static if we need different operation in hwl.
+
+    VOID    Flt32ToDepthPixel(
+        AddrDepthFormat format, const ADDR_FLT_32 comps[2], UINT_8 *pPixel) const;
+
+    VOID    Flt32ToColorPixel(
+        AddrColorFormat format, AddrSurfaceNumber surfNum, AddrSurfaceSwap surfSwap,
+        const ADDR_FLT_32 comps[4], UINT_8 *pPixel) const;
+
+    static VOID    Flt32sToInt32s(
+        ADDR_FLT_32 value, UINT_32 bits, AddrNumberType numberType, UINT_32* pResult);
+
+    static VOID    Int32sToPixel(
+        UINT_32 numComps, UINT_32* pComps, UINT_32* pCompBits, UINT_32* pCompStart,
+        ADDR_COMPONENT_FLAGS properties, UINT_32 resultBits, UINT_8* pPixel);
+
+    VOID    PixGetColorCompInfo(
+        AddrColorFormat format, AddrSurfaceNumber number, AddrSurfaceSwap swap,
+        ADDR_PIXEL_FORMATINFO* pInfo) const;
+
+    VOID    PixGetDepthCompInfo(
+        AddrDepthFormat format, ADDR_PIXEL_FORMATINFO* pInfo) const;
+
+    UINT_32 GetBitsPerPixel(
+        AddrFormat format, AddrElemMode* pElemMode,
+        UINT_32* pExpandX = NULL, UINT_32* pExpandY = NULL, UINT_32* pBitsUnused = NULL);
+
+    static VOID    SetClearComps(
+        ADDR_FLT_32 comps[4], BOOL_32 clearColor, BOOL_32 float32);
+
+    VOID    AdjustSurfaceInfo(
+        AddrElemMode elemMode, UINT_32 expandX, UINT_32 expandY,
+        UINT_32* pBpp, UINT_32* pBasePitch, UINT_32* pWidth, UINT_32* pHeight);
+
+    VOID    RestoreSurfaceInfo(
+        AddrElemMode elemMode, UINT_32 expandX, UINT_32 expandY,
+        UINT_32* pBpp, UINT_32* pWidth, UINT_32* pHeight);
+
+    /// Checks if depth and stencil are planar inside a tile
+    BOOL_32 IsDepthStencilTilePlanar()
+    {
+        return (m_depthPlanarType == ADDR_DEPTH_PLANAR_R600) ? TRUE : FALSE;
+    }
+
+    /// Sets m_configFlags, copied from AddrLib
+    VOID    SetConfigFlags(ADDR_CONFIG_FLAGS flags)
+    {
+        m_configFlags = flags;
+    }
+
+    static BOOL_32 IsCompressed(AddrFormat format);
+    static BOOL_32 IsBlockCompressed(AddrFormat format);
+    static BOOL_32 IsExpand3x(AddrFormat format);
+
+protected:
+
+    static VOID    GetCompBits(
+        UINT_32 c0, UINT_32 c1, UINT_32 c2, UINT_32 c3,
+        ADDR_PIXEL_FORMATINFO* pInfo,
+        AddrElemMode elemMode = ADDR_ROUND_BY_HALF);
+
+    static VOID    GetCompType(
+        AddrColorFormat format, AddrSurfaceNumber numType,
+        ADDR_PIXEL_FORMATINFO* pInfo);
+
+    static VOID    GetCompSwap(
+        AddrSurfaceSwap swap, ADDR_PIXEL_FORMATINFO* pInfo);
+
+    static VOID    SwapComps(
+        UINT_32 c0, UINT_32 c1, ADDR_PIXEL_FORMATINFO* pInfo);
+
+private:
+
+    UINT_32             m_fp16ExportNorm;   ///< If allow FP16 to be reported as EXPORT_NORM
+    AddrDepthPlanarType m_depthPlanarType;
+
+    ADDR_CONFIG_FLAGS   m_configFlags;      ///< Copy of AddrLib's configFlags
+    AddrLib* const      m_pAddrLib;         ///< Pointer to parent addrlib instance
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp
new file mode 100644
index 00000000000..1df693e5be5
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp
@@ -0,0 +1,4023 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrlib.cpp
+* @brief Contains the implementation for the AddrLib base class..
+***************************************************************************************************
+*/
+
+#include "addrinterface.h"
+#include "addrlib.h"
+#include "addrcommon.h"
+
+#if defined(__APPLE__)
+
+UINT_32 div64_32(UINT_64 n, UINT_32 base)
+{
+    UINT_64 rem = n;
+    UINT_64 b = base;
+    UINT_64 res, d = 1;
+    UINT_32 high = rem >> 32;
+
+    res = 0;
+    if (high >= base)
+    {
+        high /= base;
+        res = (UINT_64) high << 32;
+        rem -= (UINT_64) (high*base) << 32;
+    }
+
+    while ((INT_64)b > 0 && b < rem)
+    {
+        b = b+b;
+        d = d+d;
+    }
+
+    do
+    {
+        if (rem >= b)
+        {
+            rem -= b;
+            res += d;
+        }
+        b >>= 1;
+        d >>= 1;
+    } while (d);
+
+    n = res;
+    return rem;
+}
+
+extern "C"
+UINT_32 __umoddi3(UINT_64 n, UINT_32 base)
+{
+    return div64_32(n, base);
+}
+
+#endif // __APPLE__
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Static Const Member
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+const AddrTileModeFlags AddrLib::m_modeFlags[ADDR_TM_COUNT] =
+{// T   L  1  2  3  P  Pr B
+    {1, 1, 0, 0, 0, 0, 0, 0}, // ADDR_TM_LINEAR_GENERAL
+    {1, 1, 0, 0, 0, 0, 0, 0}, // ADDR_TM_LINEAR_ALIGNED
+    {1, 0, 1, 0, 0, 0, 0, 0}, // ADDR_TM_1D_TILED_THIN1
+    {4, 0, 1, 0, 0, 0, 0, 0}, // ADDR_TM_1D_TILED_THICK
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN1
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN2
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN4
+    {4, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THICK
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN1
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN2
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN4
+    {4, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THICK
+    {1, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_THIN1
+    {4, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_THICK
+    {1, 0, 0, 1, 1, 0, 0, 1}, // ADDR_TM_3B_TILED_THIN1
+    {4, 0, 0, 1, 1, 0, 0, 1}, // ADDR_TM_3B_TILED_THICK
+    {8, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_XTHICK
+    {8, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_XTHICK
+    {1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_TM_POWER_SAVE
+    {1, 0, 0, 1, 0, 1, 1, 0}, // ADDR_TM_PRT_TILED_THIN1
+    {1, 0, 0, 1, 0, 1, 0, 0}, // ADDR_TM_PRT_2D_TILED_THIN1
+    {1, 0, 0, 1, 1, 1, 0, 0}, // ADDR_TM_PRT_3D_TILED_THIN1
+    {4, 0, 0, 1, 0, 1, 1, 0}, // ADDR_TM_PRT_TILED_THICK
+    {4, 0, 0, 1, 0, 1, 0, 0}, // ADDR_TM_PRT_2D_TILED_THICK
+    {4, 0, 0, 1, 1, 1, 0, 0}, // ADDR_TM_PRT_3D_TILED_THICK
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Constructor/Destructor
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::AddrLib
+*
+*   @brief
+*       Constructor for the AddrLib class
+*
+***************************************************************************************************
+*/
+AddrLib::AddrLib() :
+    m_class(BASE_ADDRLIB),
+    m_chipFamily(ADDR_CHIP_FAMILY_IVLD),
+    m_chipRevision(0),
+    m_version(ADDRLIB_VERSION),
+    m_pipes(0),
+    m_banks(0),
+    m_pipeInterleaveBytes(0),
+    m_rowSize(0),
+    m_minPitchAlignPixels(1),
+    m_maxSamples(8),
+    m_pElemLib(NULL)
+{
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::AddrLib
+*
+*   @brief
+*       Constructor for the AddrLib class with hClient as parameter
+*
+***************************************************************************************************
+*/
+AddrLib::AddrLib(const AddrClient* pClient) :
+    AddrObject(pClient),
+    m_class(BASE_ADDRLIB),
+    m_chipFamily(ADDR_CHIP_FAMILY_IVLD),
+    m_chipRevision(0),
+    m_version(ADDRLIB_VERSION),
+    m_pipes(0),
+    m_banks(0),
+    m_pipeInterleaveBytes(0),
+    m_rowSize(0),
+    m_minPitchAlignPixels(1),
+    m_maxSamples(8),
+    m_pElemLib(NULL)
+{
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::~AddrLib
+*
+*   @brief
+*       Destructor for the AddrLib class
+*
+***************************************************************************************************
+*/
+AddrLib::~AddrLib()
+{
+    if (m_pElemLib)
+    {
+        delete m_pElemLib;
+    }
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Initialization/Helper
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::Create
+*
+*   @brief
+*       Creates and initializes AddrLib object.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Create(
+    const ADDR_CREATE_INPUT* pCreateIn,     ///< [in] pointer to ADDR_CREATE_INPUT
+    ADDR_CREATE_OUTPUT*      pCreateOut)    ///< [out] pointer to ADDR_CREATE_OUTPUT
+{
+    AddrLib* pLib = NULL;
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pCreateIn->createFlags.fillSizeFields == TRUE)
+    {
+        if ((pCreateIn->size != sizeof(ADDR_CREATE_INPUT)) ||
+            (pCreateOut->size != sizeof(ADDR_CREATE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if ((returnCode == ADDR_OK)                    &&
+        (pCreateIn->callbacks.allocSysMem != NULL) &&
+        (pCreateIn->callbacks.freeSysMem != NULL))
+    {
+        AddrClient client = {
+            pCreateIn->hClient,
+            pCreateIn->callbacks
+        };
+
+        switch (pCreateIn->chipEngine)
+        {
+            case CIASICIDGFXENGINE_SOUTHERNISLAND:
+                switch (pCreateIn->chipFamily)
+                {
+                    case FAMILY_SI:
+                        pLib = AddrSIHwlInit(&client);
+                        break;
+                    case FAMILY_VI:
+                    case FAMILY_CZ: // VI based fusion(carrizo)
+                    case FAMILY_CI:
+                    case FAMILY_KV: // CI based fusion
+                        pLib = AddrCIHwlInit(&client);
+                        break;
+                    default:
+                        ADDR_ASSERT_ALWAYS();
+                        break;
+                }
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+    }
+
+    if ((pLib != NULL))
+    {
+        BOOL_32 initValid;
+
+        // Pass createFlags to configFlags first since these flags may be overwritten
+        pLib->m_configFlags.noCubeMipSlicesPad  = pCreateIn->createFlags.noCubeMipSlicesPad;
+        pLib->m_configFlags.fillSizeFields      = pCreateIn->createFlags.fillSizeFields;
+        pLib->m_configFlags.useTileIndex        = pCreateIn->createFlags.useTileIndex;
+        pLib->m_configFlags.useCombinedSwizzle  = pCreateIn->createFlags.useCombinedSwizzle;
+        pLib->m_configFlags.checkLast2DLevel    = pCreateIn->createFlags.checkLast2DLevel;
+        pLib->m_configFlags.useHtileSliceAlign  = pCreateIn->createFlags.useHtileSliceAlign;
+        pLib->m_configFlags.degradeBaseLevel    = pCreateIn->createFlags.degradeBaseLevel;
+        pLib->m_configFlags.allowLargeThickTile = pCreateIn->createFlags.allowLargeThickTile;
+
+        pLib->SetAddrChipFamily(pCreateIn->chipFamily, pCreateIn->chipRevision);
+
+        pLib->SetMinPitchAlignPixels(pCreateIn->minPitchAlignPixels);
+
+        // Global parameters initialized and remaining configFlags bits are set as well
+        initValid = pLib->HwlInitGlobalParams(pCreateIn);
+
+        if (initValid)
+        {
+            pLib->m_pElemLib = AddrElemLib::Create(pLib);
+        }
+        else
+        {
+            pLib->m_pElemLib = NULL; // Don't go on allocating element lib
+            returnCode = ADDR_INVALIDGBREGVALUES;
+        }
+
+        if (pLib->m_pElemLib == NULL)
+        {
+            delete pLib;
+            pLib = NULL;
+            ADDR_ASSERT_ALWAYS();
+        }
+        else
+        {
+            pLib->m_pElemLib->SetConfigFlags(pLib->m_configFlags);
+        }
+    }
+
+    pCreateOut->hLib = pLib;
+
+    if ((pLib == NULL) &&
+        (returnCode == ADDR_OK))
+    {
+        // Unknown failures, we return the general error code
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::SetAddrChipFamily
+*
+*   @brief
+*       Convert familyID defined in atiid.h to AddrChipFamily and set m_chipFamily/m_chipRevision
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::SetAddrChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family defined in atiih.h
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    AddrChipFamily family = ADDR_CHIP_FAMILY_IVLD;
+
+    family = HwlConvertChipFamily(uChipFamily, uChipRevision);
+
+    ADDR_ASSERT(family != ADDR_CHIP_FAMILY_IVLD);
+
+    m_chipFamily    = family;
+    m_chipRevision  = uChipRevision;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::SetMinPitchAlignPixels
+*
+*   @brief
+*       Set m_minPitchAlignPixels with input param
+*
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::SetMinPitchAlignPixels(
+    UINT_32 minPitchAlignPixels)    ///< [in] minmum pitch alignment in pixels
+{
+    m_minPitchAlignPixels = (minPitchAlignPixels == 0)? 1 : minPitchAlignPixels;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::GetAddrLib
+*
+*   @brief
+*       Get AddrLib pointer
+*
+*   @return
+*      An AddrLib class pointer
+***************************************************************************************************
+*/
+AddrLib * AddrLib::GetAddrLib(
+    ADDR_HANDLE hLib)   ///< [in] handle of ADDR_HANDLE
+{
+    return static_cast<AddrLib *>(hLib);
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Surface Methods
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeSurfaceInfo.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceInfo(
+     const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,    ///< [in] input structure
+     ADDR_COMPUTE_SURFACE_INFO_OUTPUT*      pOut    ///< [out] output structure
+     ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    // We suggest client do sanity check but a check here is also good
+    if (pIn->bpp > 128)
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    // Thick modes don't support multisample
+    if (ComputeSurfaceThickness(pIn->tileMode) > 1 && pIn->numSamples > 1)
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        // Get a local copy of input structure and only reference pIn for unadjusted values
+        ADDR_COMPUTE_SURFACE_INFO_INPUT localIn = *pIn;
+        ADDR_TILEINFO tileInfoNull = {0};
+
+        if (UseTileInfo())
+        {
+            // If the original input has a valid ADDR_TILEINFO pointer then copy its contents.
+            // Otherwise the default 0's in tileInfoNull are used.
+            if (pIn->pTileInfo)
+            {
+                tileInfoNull = *pIn->pTileInfo;
+            }
+            localIn.pTileInfo  = &tileInfoNull;
+        }
+
+        localIn.numSamples = pIn->numSamples == 0 ? 1 : pIn->numSamples;
+
+        // Do mipmap check first
+        // If format is BCn, pre-pad dimension to power-of-two according to HWL
+        ComputeMipLevel(&localIn);
+
+        if (m_configFlags.checkLast2DLevel)
+        {
+            // Save this level's original height in pixels
+            pOut->height = pIn->height;
+        }
+
+        UINT_32 expandX = 1;
+        UINT_32 expandY = 1;
+        AddrElemMode elemMode;
+
+        // Save outputs that may not go through HWL
+        pOut->pixelBits = localIn.bpp;
+        pOut->numSamples = localIn.numSamples;
+        pOut->last2DLevel = FALSE;
+
+#if !ALT_TEST
+        if (localIn.numSamples > 1)
+        {
+            ADDR_ASSERT(localIn.mipLevel == 0);
+        }
+#endif
+
+        if (localIn.format != ADDR_FMT_INVALID) // Set format to INVALID will skip this conversion
+        {
+            // Get compression/expansion factors and element mode
+            // (which indicates compression/expansion
+            localIn.bpp = GetElemLib()->GetBitsPerPixel(localIn.format,
+                                                        &elemMode,
+                                                        &expandX,
+                                                        &expandY);
+
+            // Special flag for 96 bit surface. 96 (or 48 if we support) bit surface's width is
+            // pre-multiplied by 3 and bpp is divided by 3. So pitch alignment for linear-
+            // aligned does not meet 64-pixel in real. We keep special handling in hwl since hw
+            // restrictions are different.
+            // Also Mip 1+ needs an element pitch of 32 bits so we do not need this workaround
+            // but we use this flag to skip RestoreSurfaceInfo below
+
+            if ((elemMode == ADDR_EXPANDED) &&
+                (expandX > 1))
+            {
+                ADDR_ASSERT(localIn.tileMode == ADDR_TM_LINEAR_ALIGNED || localIn.height == 1);
+            }
+
+            GetElemLib()->AdjustSurfaceInfo(elemMode,
+                                            expandX,
+                                            expandY,
+                                            &localIn.bpp,
+                                            &localIn.basePitch,
+                                            &localIn.width,
+                                            &localIn.height);
+
+            // Overwrite these parameters if we have a valid format
+        }
+        else if (localIn.bpp != 0)
+        {
+            localIn.width  = (localIn.width != 0) ? localIn.width : 1;
+            localIn.height = (localIn.height != 0) ? localIn.height : 1;
+        }
+        else // Rule out some invalid parameters
+        {
+            ADDR_ASSERT_ALWAYS();
+
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+
+        // Check mipmap after surface expansion
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = PostComputeMipLevel(&localIn, pOut);
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (UseTileIndex(localIn.tileIndex))
+            {
+                // Make sure pTileInfo is not NULL
+                ADDR_ASSERT(localIn.pTileInfo);
+
+                UINT_32 numSamples = GetNumFragments(localIn.numSamples, localIn.numFrags);
+
+                INT_32 macroModeIndex = TileIndexNoMacroIndex;
+
+                if (localIn.tileIndex != TileIndexLinearGeneral)
+                {
+                    // Try finding a macroModeIndex
+                    macroModeIndex = HwlComputeMacroModeIndex(localIn.tileIndex,
+                                                              localIn.flags,
+                                                              localIn.bpp,
+                                                              numSamples,
+                                                              localIn.pTileInfo,
+                                                              &localIn.tileMode,
+                                                              &localIn.tileType);
+                }
+
+                // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+                if (macroModeIndex == TileIndexNoMacroIndex)
+                {
+                    returnCode = HwlSetupTileCfg(localIn.tileIndex, macroModeIndex,
+                                                 localIn.pTileInfo,
+                                                 &localIn.tileMode, &localIn.tileType);
+                }
+                // If macroModeIndex is invalid, then assert this is not macro tiled
+                else if (macroModeIndex == TileIndexInvalid)
+                {
+                    ADDR_ASSERT(!IsMacroTiled(localIn.tileMode));
+                }
+            }
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            AddrTileMode tileMode = localIn.tileMode;
+            AddrTileType tileType = localIn.tileType;
+
+            // HWL layer may override tile mode if necessary
+            if (HwlOverrideTileMode(&localIn, &tileMode, &tileType))
+            {
+                localIn.tileMode = tileMode;
+                localIn.tileType = tileType;
+            }
+            // Degrade base level if applicable
+            if (DegradeBaseLevel(&localIn, &tileMode))
+            {
+                localIn.tileMode = tileMode;
+            }
+        }
+
+        // Call main function to compute surface info
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceInfo(&localIn, pOut);
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            // Since bpp might be changed we just pass it through
+            pOut->bpp  = localIn.bpp;
+
+            // Also original width/height/bpp
+            pOut->pixelPitch    = pOut->pitch;
+            pOut->pixelHeight   = pOut->height;
+
+#if DEBUG
+            if (localIn.flags.display)
+            {
+                ADDR_ASSERT((pOut->pitchAlign % 32) == 0);
+            }
+#endif //DEBUG
+
+            if (localIn.format != ADDR_FMT_INVALID)
+            {
+                //
+                // 96 bits surface of level 1+ requires element pitch of 32 bits instead
+                // In hwl function we skip multiplication of 3 then we should skip division of 3
+                // We keep pitch that represents 32 bit element instead of 96 bits since we
+                // will get an odd number if divided by 3.
+                //
+                if (!((expandX == 3) && (localIn.mipLevel > 0)))
+                {
+
+                    GetElemLib()->RestoreSurfaceInfo(elemMode,
+                                                     expandX,
+                                                     expandY,
+                                                     &localIn.bpp,
+                                                     &pOut->pixelPitch,
+                                                     &pOut->pixelHeight);
+                }
+            }
+
+            if (localIn.flags.qbStereo)
+            {
+                if (pOut->pStereoInfo)
+                {
+                    ComputeQbStereoInfo(pOut);
+                }
+            }
+
+            if (localIn.flags.volume) // For volume sliceSize equals to all z-slices
+            {
+                pOut->sliceSize = pOut->surfSize;
+            }
+            else // For array: sliceSize is likely to have slice-padding (the last one)
+            {
+                pOut->sliceSize = pOut->surfSize / pOut->depth;
+
+                // array or cubemap
+                if (pIn->numSlices > 1)
+                {
+                    // If this is the last slice then add the padding size to this slice
+                    if (pIn->slice == (pIn->numSlices - 1))
+                    {
+                        pOut->sliceSize += pOut->sliceSize * (pOut->depth - pIn->numSlices);
+                    }
+                    else if (m_configFlags.checkLast2DLevel)
+                    {
+                        // Reset last2DLevel flag if this is not the last array slice
+                        pOut->last2DLevel = FALSE;
+                    }
+                }
+            }
+
+            pOut->pitchTileMax = pOut->pitch / 8 - 1;
+            pOut->heightTileMax = pOut->height / 8 - 1;
+            pOut->sliceTileMax = pOut->pitch * pOut->height / 64 - 1;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeSurfaceInfo.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            const ADDR_SURFACE_FLAGS flags = {{0}};
+            UINT_32 numSamples = GetNumFragments(pIn->numSamples, pIn->numFrags);
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(input.tileIndex,
+                                                             flags,
+                                                             input.bpp,
+                                                             numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode,
+                                                             &input.tileType);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode, &input.tileType);
+            }
+            // If macroModeIndex is invalid, then assert this is not macro tiled
+            else if (macroModeIndex == TileIndexInvalid)
+            {
+                ADDR_ASSERT(!IsMacroTiled(input.tileMode));
+            }
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceAddrFromCoord(pIn, pOut);
+
+            if (returnCode == ADDR_OK)
+            {
+                pOut->prtBlockIndex = static_cast<UINT_32>(pOut->addr / (64 * 1024));
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Interface function stub of ComputeSurfaceCoordFromAddr.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            const ADDR_SURFACE_FLAGS flags = {{0}};
+            UINT_32 numSamples = GetNumFragments(pIn->numSamples, pIn->numFrags);
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(input.tileIndex,
+                                                             flags,
+                                                             input.bpp,
+                                                             numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode,
+                                                             &input.tileType);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode, &input.tileType);
+            }
+            // If macroModeIndex is invalid, then assert this is not macro tiled
+            else if (macroModeIndex == TileIndexInvalid)
+            {
+                ADDR_ASSERT(!IsMacroTiled(input.tileMode));
+            }
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceCoordFromAddr(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSliceTileSwizzle
+*
+*   @brief
+*       Interface function stub of ComputeSliceTileSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSliceTileSwizzle(
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SLICESWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SLICESWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_SLICESWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex,
+                                         input.pTileInfo, &input.tileMode);
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSliceTileSwizzle(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ExtractBankPipeSwizzle
+*
+*   @brief
+*       Interface function stub of AddrExtractBankPipeSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ExtractBankPipeSwizzle(
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlExtractBankPipeSwizzle(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::CombineBankPipeSwizzle
+*
+*   @brief
+*       Interface function stub of AddrCombineBankPipeSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::CombineBankPipeSwizzle(
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlCombineBankPipeSwizzle(pIn->bankSwizzle,
+                                                   pIn->pipeSwizzle,
+                                                   pIn->pTileInfo,
+                                                   pIn->baseAddr,
+                                                   &pOut->tileSwizzle);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeBaseSwizzle
+*
+*   @brief
+*       Interface function stub of AddrCompueBaseSwizzle.
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeBaseSwizzle(
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_BASE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (IsMacroTiled(pIn->tileMode))
+            {
+                returnCode = HwlComputeBaseSwizzle(pIn, pOut);
+            }
+            else
+            {
+                pOut->tileSwizzle = 0;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskInfo
+*
+*   @brief
+*       Interface function stub of ComputeFmaskInfo.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut    ///< [out] output structure
+    )
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    // No thick MSAA
+    if (ComputeSurfaceThickness(pIn->tileMode) > 1)
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_FMASK_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+
+            if (pOut->pTileInfo)
+            {
+                // Use temp tile info for calcalation
+                input.pTileInfo = pOut->pTileInfo;
+            }
+            else
+            {
+                input.pTileInfo = &tileInfoNull;
+            }
+
+            ADDR_SURFACE_FLAGS flags = {{0}};
+            flags.fmask = 1;
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(pIn->tileIndex,
+                                                             flags,
+                                                             HwlComputeFmaskBits(pIn, NULL),
+                                                             pIn->numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode);
+            }
+
+            ADDR_ASSERT(macroModeIndex != TileIndexInvalid);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (pIn->numSamples > 1)
+            {
+                returnCode = HwlComputeFmaskInfo(pIn, pOut);
+            }
+            else
+            {
+                memset(pOut, 0, sizeof(ADDR_COMPUTE_FMASK_INFO_OUTPUT));
+
+                returnCode = ADDR_INVALIDPARAMS;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Interface function stub of ComputeFmaskAddrFromCoord.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_ASSERT(pIn->numSamples > 1);
+
+        if (pIn->numSamples > 1)
+        {
+            returnCode = HwlComputeFmaskAddrFromCoord(pIn, pOut);
+        }
+        else
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Interface function stub of ComputeFmaskAddrFromCoord.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*  pIn,     ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut           ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_ASSERT(pIn->numSamples > 1);
+
+        if (pIn->numSamples > 1)
+        {
+            returnCode = HwlComputeFmaskCoordFromAddr(pIn, pOut);
+        }
+        else
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to HW register value in HW layer
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINFOTOHW_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINFOTOHW_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_CONVERT_TILEINFOTOHW_INPUT input;
+        // if pIn->reverse is TRUE, indices are ignored
+        if (pIn->reverse == FALSE && UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlConvertTileInfoToHW(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileIndex
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileIndex(
+    const ADDR_CONVERT_TILEINDEX_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINDEX_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINDEX_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+
+        returnCode = HwlSetupTileCfg(pIn->tileIndex, pIn->macroModeIndex,
+                                     pOut->pTileInfo, &pOut->tileMode, &pOut->tileType);
+
+        if (returnCode == ADDR_OK && pIn->tileInfoHw)
+        {
+            ADDR_CONVERT_TILEINFOTOHW_INPUT hwInput = {0};
+            ADDR_CONVERT_TILEINFOTOHW_OUTPUT hwOutput = {0};
+
+            hwInput.pTileInfo = pOut->pTileInfo;
+            hwInput.tileIndex = -1;
+            hwOutput.pTileInfo = pOut->pTileInfo;
+
+            returnCode = HwlConvertTileInfoToHW(&hwInput, &hwOutput);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileIndex1(
+    const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,   ///< [in] input structure
+    ADDR_CONVERT_TILEINDEX_OUTPUT* pOut         ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINDEX1_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_SURFACE_FLAGS flags = {{0}};
+
+        HwlComputeMacroModeIndex(pIn->tileIndex, flags, pIn->bpp, pIn->numSamples,
+                                 pOut->pTileInfo, &pOut->tileMode, &pOut->tileType);
+
+        if (pIn->tileInfoHw)
+        {
+            ADDR_CONVERT_TILEINFOTOHW_INPUT hwInput = {0};
+            ADDR_CONVERT_TILEINFOTOHW_OUTPUT hwOutput = {0};
+
+            hwInput.pTileInfo = pOut->pTileInfo;
+            hwInput.tileIndex = -1;
+            hwOutput.pTileInfo = pOut->pTileInfo;
+
+            returnCode = HwlConvertTileInfoToHW(&hwInput, &hwOutput);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::GetTileIndex
+*
+*   @brief
+*       Get tile index from tile mode/type/info
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::GetTileIndex(
+    const ADDR_GET_TILEINDEX_INPUT* pIn, ///< [in] input structure
+    ADDR_GET_TILEINDEX_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_GET_TILEINDEX_INPUT)) ||
+            (pOut->size != sizeof(ADDR_GET_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        returnCode = HwlGetTileIndex(pIn, pOut);
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceThickness
+*
+*   @brief
+*       Compute surface thickness
+*
+*   @return
+*       Surface thickness
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeSurfaceThickness(
+    AddrTileMode tileMode)    ///< [in] tile mode
+{
+    return m_modeFlags[tileMode].thickness;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               CMASK/HTILE
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeHtilenfo
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileInfo(
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    BOOL_32 isWidth8  = (pIn->blockWidth == 8) ? TRUE : FALSE;
+    BOOL_32 isHeight8 = (pIn->blockHeight == 8) ? TRUE : FALSE;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_HTILE_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            pOut->bpp = ComputeHtileInfo(pIn->flags,
+                                         pIn->pitch,
+                                         pIn->height,
+                                         pIn->numSlices,
+                                         pIn->isLinear,
+                                         isWidth8,
+                                         isHeight8,
+                                         pIn->pTileInfo,
+                                         &pOut->pitch,
+                                         &pOut->height,
+                                         &pOut->htileBytes,
+                                         &pOut->macroWidth,
+                                         &pOut->macroHeight,
+                                         &pOut->sliceSize,
+                                         &pOut->baseAlign);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeCmaskInfo
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskInfo(
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_CMASK_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_CMASK_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = ComputeCmaskInfo(pIn->flags,
+                                          pIn->pitch,
+                                          pIn->height,
+                                          pIn->numSlices,
+                                          pIn->isLinear,
+                                          pIn->pTileInfo,
+                                          &pOut->pitch,
+                                          &pOut->height,
+                                          &pOut->cmaskBytes,
+                                          &pOut->macroWidth,
+                                          &pOut->macroHeight,
+                                          &pOut->sliceSize,
+                                          &pOut->baseAlign,
+                                          &pOut->blockMax);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeDccInfo
+*
+*   @brief
+*       Interface function to compute DCC key info
+*
+*   @return
+*       return code of HwlComputeDccInfo
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeDccInfo(
+    const ADDR_COMPUTE_DCCINFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_DCCINFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE ret = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_DCCINFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT)))
+        {
+            ret = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (ret == ADDR_OK)
+    {
+        ADDR_COMPUTE_DCCINFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+
+            ret = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex,
+                                  &input.tileInfo, &input.tileMode);
+
+            pIn = &input;
+        }
+
+        if (ADDR_OK == ret)
+        {
+            ret = HwlComputeDccInfo(pIn, pOut);
+        }
+    }
+
+    return ret;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileAddrFromCoord
+*
+*   @brief
+*       Interface function stub of AddrComputeHtileAddrFromCoord
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileAddrFromCoord(
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    BOOL_32 isWidth8  = (pIn->blockWidth == 8) ? TRUE : FALSE;
+    BOOL_32 isHeight8 = (pIn->blockHeight == 8) ? TRUE : FALSE;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            pOut->addr = HwlComputeXmaskAddrFromCoord(pIn->pitch,
+                                                      pIn->height,
+                                                      pIn->x,
+                                                      pIn->y,
+                                                      pIn->slice,
+                                                      pIn->numSlices,
+                                                      1,
+                                                      pIn->isLinear,
+                                                      isWidth8,
+                                                      isHeight8,
+                                                      pIn->pTileInfo,
+                                                      &pOut->bitPosition);
+        }
+    }
+
+    return returnCode;
+
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileCoordFromAddr
+*
+*   @brief
+*       Interface function stub of AddrComputeHtileCoordFromAddr
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileCoordFromAddr(
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    BOOL_32 isWidth8  = (pIn->blockWidth == 8) ? TRUE : FALSE;
+    BOOL_32 isHeight8 = (pIn->blockHeight == 8) ? TRUE : FALSE;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            HwlComputeXmaskCoordFromAddr(pIn->addr,
+                                         pIn->bitPosition,
+                                         pIn->pitch,
+                                         pIn->height,
+                                         pIn->numSlices,
+                                         1,
+                                         pIn->isLinear,
+                                         isWidth8,
+                                         isHeight8,
+                                         pIn->pTileInfo,
+                                         &pOut->x,
+                                         &pOut->y,
+                                         &pOut->slice);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Interface function stub of AddrComputeCmaskAddrFromCoord
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskAddrFromCoord(
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (pIn->flags.tcCompatible == TRUE)
+            {
+                returnCode = HwlComputeCmaskAddrFromCoord(pIn, pOut);
+            }
+            else
+            {
+                pOut->addr = HwlComputeXmaskAddrFromCoord(pIn->pitch,
+                                                          pIn->height,
+                                                          pIn->x,
+                                                          pIn->y,
+                                                          pIn->slice,
+                                                          pIn->numSlices,
+                                                          2,
+                                                          pIn->isLinear,
+                                                          FALSE, //this is cmask, isWidth8 is not needed
+                                                          FALSE, //this is cmask, isHeight8 is not needed
+                                                          pIn->pTileInfo,
+                                                          &pOut->bitPosition);
+            }
+
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Interface function stub of AddrComputeCmaskCoordFromAddr
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskCoordFromAddr(
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull;
+        ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calcalation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Change the input structure
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            HwlComputeXmaskCoordFromAddr(pIn->addr,
+                                         pIn->bitPosition,
+                                         pIn->pitch,
+                                         pIn->height,
+                                         pIn->numSlices,
+                                         2,
+                                         pIn->isLinear,
+                                         FALSE,
+                                         FALSE,
+                                         pIn->pTileInfo,
+                                         &pOut->x,
+                                         &pOut->y,
+                                         &pOut->slice);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeTileDataWidthAndHeight
+*
+*   @brief
+*       Compute the squared cache shape for per-tile data (CMASK and HTILE)
+*
+*   @return
+*       N/A
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeTileDataWidthAndHeight(
+    UINT_32         bpp,             ///< [in] bits per pixel
+    UINT_32         cacheBits,       ///< [in] bits of cache
+    ADDR_TILEINFO*  pTileInfo,       ///< [in] Tile info
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight     ///< [out] macro tile height
+    ) const
+{
+    UINT_32 height = 1;
+    UINT_32 width  = cacheBits / bpp;
+    UINT_32 pipes  = HwlGetPipes(pTileInfo);
+
+    // Double height until the macro-tile is close to square
+    // Height can only be doubled if width is even
+
+    while ((width > height * 2 * pipes) && !(width & 1))
+    {
+        width  /= 2;
+        height *= 2;
+    }
+
+    *pMacroWidth  = 8 * width;
+    *pMacroHeight = 8 * height * pipes;
+
+    // Note: The above iterative comptuation is equivalent to the following
+    //
+    //int log2_height = ((log2(cacheBits)-log2(bpp)-log2(pipes))/2);
+    //int macroHeight = pow2( 3+log2(pipes)+log2_height );
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Compute the squared cache shape for per-tile data (CMASK and HTILE) for linear layout
+*
+*   @return
+*       N/A
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID AddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info
+    ) const
+{
+    ADDR_ASSERT(bpp != 4);              // Cmask does not support linear layout prior to SI
+    *pMacroWidth  = 8 * 512 / bpp;      // Align width to 512-bit memory accesses
+    *pMacroHeight = 8 * m_pipes;        // Align height to number of pipes
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileInfo
+*
+*   @brief
+*       Compute htile pitch,width, bytes per 2D slice
+*
+*   @return
+*       Htile bpp i.e. How many bits for an 8x8 tile
+*       Also returns by output parameters:
+*       *Htile pitch, height, total size in bytes, macro-tile dimensions and slice size*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeHtileInfo(
+    ADDR_HTILE_FLAGS flags,             ///< [in] htile flags
+    UINT_32          pitchIn,           ///< [in] pitch input
+    UINT_32          heightIn,          ///< [in] height input
+    UINT_32          numSlices,         ///< [in] number of slices
+    BOOL_32          isLinear,          ///< [in] if it is linear mode
+    BOOL_32          isWidth8,          ///< [in] if htile block width is 8
+    BOOL_32          isHeight8,         ///< [in] if htile block height is 8
+    ADDR_TILEINFO*   pTileInfo,         ///< [in] Tile info
+    UINT_32*         pPitchOut,         ///< [out] pitch output
+    UINT_32*         pHeightOut,        ///< [out] height output
+    UINT_64*         pHtileBytes,       ///< [out] bytes per 2D slice
+    UINT_32*         pMacroWidth,       ///< [out] macro-tile width in pixels
+    UINT_32*         pMacroHeight,      ///< [out] macro-tile width in pixels
+    UINT_64*         pSliceSize,        ///< [out] slice size in bytes
+    UINT_32*         pBaseAlign         ///< [out] base alignment
+    ) const
+{
+
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+    UINT_32 baseAlign;
+    UINT_64 surfBytes;
+    UINT_64 sliceBytes;
+
+    numSlices = Max(1u, numSlices);
+
+    const UINT_32 bpp = HwlComputeHtileBpp(isWidth8, isHeight8);
+    const UINT_32 cacheBits = HtileCacheBits;
+
+    if (isLinear)
+    {
+        HwlComputeTileDataWidthAndHeightLinear(&macroWidth,
+                                               &macroHeight,
+                                               bpp,
+                                               pTileInfo);
+    }
+    else
+    {
+        ComputeTileDataWidthAndHeight(bpp,
+                                      cacheBits,
+                                      pTileInfo,
+                                      &macroWidth,
+                                      &macroHeight);
+    }
+
+    *pPitchOut = PowTwoAlign(pitchIn,  macroWidth);
+    *pHeightOut = PowTwoAlign(heightIn,  macroHeight);
+
+    baseAlign = HwlComputeHtileBaseAlign(flags.tcCompatible, isLinear, pTileInfo);
+
+    surfBytes = HwlComputeHtileBytes(*pPitchOut,
+                                     *pHeightOut,
+                                     bpp,
+                                     isLinear,
+                                     numSlices,
+                                     &sliceBytes,
+                                     baseAlign);
+
+    *pHtileBytes = surfBytes;
+
+    //
+    // Use SafeAssign since they are optional
+    //
+    SafeAssign(pMacroWidth, macroWidth);
+
+    SafeAssign(pMacroHeight, macroHeight);
+
+    SafeAssign(pSliceSize,  sliceBytes);
+
+    SafeAssign(pBaseAlign, baseAlign);
+
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskBaseAlign
+*
+*   @brief
+*       Compute cmask base alignment
+*
+*   @return
+*       Cmask base alignment
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeCmaskBaseAlign(
+    ADDR_CMASK_FLAGS flags,           ///< [in] Cmask flags
+    ADDR_TILEINFO*   pTileInfo        ///< [in] Tile info
+    ) const
+{
+    UINT_32 baseAlign = m_pipeInterleaveBytes * HwlGetPipes(pTileInfo);
+
+    if (flags.tcCompatible)
+    {
+        ADDR_ASSERT(pTileInfo != NULL);
+        if (pTileInfo)
+        {
+            baseAlign *= pTileInfo->banks;
+        }
+    }
+
+    return baseAlign;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskBytes
+*
+*   @brief
+*       Compute cmask size in bytes
+*
+*   @return
+*       Cmask size in bytes
+***************************************************************************************************
+*/
+UINT_64 AddrLib::ComputeCmaskBytes(
+    UINT_32 pitch,        ///< [in] pitch
+    UINT_32 height,       ///< [in] height
+    UINT_32 numSlices     ///< [in] number of slices
+    ) const
+{
+    return BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * numSlices * CmaskElemBits) /
+        MicroTilePixels;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskInfo
+*
+*   @brief
+*       Compute cmask pitch,width, bytes per 2D slice
+*
+*   @return
+*       BlockMax. Also by output parameters: Cmask pitch,height, total size in bytes,
+*       macro-tile dimensions
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskInfo(
+    ADDR_CMASK_FLAGS flags,            ///< [in] cmask flags
+    UINT_32          pitchIn,           ///< [in] pitch input
+    UINT_32          heightIn,          ///< [in] height input
+    UINT_32          numSlices,         ///< [in] number of slices
+    BOOL_32          isLinear,          ///< [in] is linear mode
+    ADDR_TILEINFO*   pTileInfo,         ///< [in] Tile info
+    UINT_32*         pPitchOut,         ///< [out] pitch output
+    UINT_32*         pHeightOut,        ///< [out] height output
+    UINT_64*         pCmaskBytes,       ///< [out] bytes per 2D slice
+    UINT_32*         pMacroWidth,       ///< [out] macro-tile width in pixels
+    UINT_32*         pMacroHeight,      ///< [out] macro-tile width in pixels
+    UINT_64*         pSliceSize,        ///< [out] slice size in bytes
+    UINT_32*         pBaseAlign,        ///< [out] base alignment
+    UINT_32*         pBlockMax          ///< [out] block max == slice / 128 / 128 - 1
+    ) const
+{
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+    UINT_32 baseAlign;
+    UINT_64 surfBytes;
+    UINT_64 sliceBytes;
+
+    numSlices = Max(1u, numSlices);
+
+    const UINT_32 bpp = CmaskElemBits;
+    const UINT_32 cacheBits = CmaskCacheBits;
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (isLinear)
+    {
+        HwlComputeTileDataWidthAndHeightLinear(&macroWidth,
+                                               &macroHeight,
+                                               bpp,
+                                               pTileInfo);
+    }
+    else
+    {
+        ComputeTileDataWidthAndHeight(bpp,
+                                      cacheBits,
+                                      pTileInfo,
+                                      &macroWidth,
+                                      &macroHeight);
+    }
+
+    *pPitchOut = (pitchIn + macroWidth - 1) & ~(macroWidth - 1);
+    *pHeightOut = (heightIn + macroHeight - 1) & ~(macroHeight - 1);
+
+
+    sliceBytes = ComputeCmaskBytes(*pPitchOut,
+                                   *pHeightOut,
+                                   1);
+
+    baseAlign = ComputeCmaskBaseAlign(flags, pTileInfo);
+
+    while (sliceBytes % baseAlign)
+    {
+        *pHeightOut += macroHeight;
+
+        sliceBytes = ComputeCmaskBytes(*pPitchOut,
+                                       *pHeightOut,
+                                       1);
+    }
+
+    surfBytes = sliceBytes * numSlices;
+
+    *pCmaskBytes = surfBytes;
+
+    //
+    // Use SafeAssign since they are optional
+    //
+    SafeAssign(pMacroWidth, macroWidth);
+
+    SafeAssign(pMacroHeight, macroHeight);
+
+    SafeAssign(pBaseAlign, baseAlign);
+
+    SafeAssign(pSliceSize, sliceBytes);
+
+    UINT_32 slice = (*pPitchOut) * (*pHeightOut);
+    UINT_32 blockMax = slice / 128 / 128 - 1;
+
+#if DEBUG
+    if (slice % (64*256) != 0)
+    {
+        ADDR_ASSERT_ALWAYS();
+    }
+#endif //DEBUG
+
+    UINT_32 maxBlockMax = HwlGetMaxCmaskBlockMax();
+
+    if (blockMax > maxBlockMax)
+    {
+        blockMax = maxBlockMax;
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    SafeAssign(pBlockMax, blockMax);
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeXmaskCoordYFromPipe
+*
+*   @brief
+*       Compute the Y coord from pipe number for cmask/htile
+*
+*   @return
+*       Y coordinate
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeXmaskCoordYFromPipe(
+    UINT_32         pipe,       ///< [in] pipe number
+    UINT_32         x           ///< [in] x coordinate
+    ) const
+{
+    UINT_32 pipeBit0;
+    UINT_32 pipeBit1;
+    UINT_32 xBit0;
+    UINT_32 xBit1;
+    UINT_32 yBit0;
+    UINT_32 yBit1;
+
+    UINT_32 y = 0;
+
+    UINT_32 numPipes = m_pipes; // SI has its implementation
+    //
+    // Convert pipe + x to y coordinate.
+    //
+    switch (numPipes)
+    {
+        case 1:
+            //
+            // 1 pipe
+            //
+            // p0 = 0
+            //
+            y = 0;
+            break;
+        case 2:
+            //
+            // 2 pipes
+            //
+            // p0 = x0 ^ y0
+            //
+            // y0 = p0 ^ x0
+            //
+            pipeBit0 = pipe & 0x1;
+
+            xBit0 = x & 0x1;
+
+            yBit0 = pipeBit0 ^ xBit0;
+
+            y = yBit0;
+            break;
+        case 4:
+            //
+            // 4 pipes
+            //
+            // p0 = x1 ^ y0
+            // p1 = x0 ^ y1
+            //
+            // y0 = p0 ^ x1
+            // y1 = p1 ^ x0
+            //
+            pipeBit0 =  pipe & 0x1;
+            pipeBit1 = (pipe & 0x2) >> 1;
+
+            xBit0 =  x & 0x1;
+            xBit1 = (x & 0x2) >> 1;
+
+            yBit0 = pipeBit0 ^ xBit1;
+            yBit1 = pipeBit1 ^ xBit0;
+
+            y = (yBit0 |
+                 (yBit1 << 1));
+            break;
+        case 8:
+            //
+            // 8 pipes
+            //
+            // r600 and r800 have different method
+            //
+            y = HwlComputeXmaskCoordYFrom8Pipe(pipe, x);
+            break;
+        default:
+            break;
+    }
+    return y;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeXmaskCoordFromAddr
+*
+*   @brief
+*       Compute the coord from an address of a cmask/htile
+*
+*   @return
+*       N/A
+*
+*   @note
+*       This method is reused by htile, so rename to Xmask
+***************************************************************************************************
+*/
+VOID AddrLib::HwlComputeXmaskCoordFromAddr(
+    UINT_64         addr,           ///< [in] address
+    UINT_32         bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32         pitch,          ///< [in] pitch
+    UINT_32         height,         ///< [in] height
+    UINT_32         numSlices,      ///< [in] number of slices
+    UINT_32         factor,         ///< [in] factor that indicates cmask or htile
+    BOOL_32         isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32         isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    BOOL_32         isHeight8,      ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] Tile info
+    UINT_32*        pX,             ///< [out] x coord
+    UINT_32*        pY,             ///< [out] y coord
+    UINT_32*        pSlice          ///< [out] slice index
+    ) const
+{
+    UINT_32 pipe;
+    UINT_32 numPipes;
+    UINT_32 numPipeBits;
+    UINT_32 macroTilePitch;
+    UINT_32 macroTileHeight;
+
+    UINT_64 bitAddr;
+
+    UINT_32 microTileCoordY;
+
+    UINT_32 elemBits;
+
+    UINT_32 pitchAligned = pitch;
+    UINT_32 heightAligned = height;
+    UINT_64 totalBytes;
+
+    UINT_64 elemOffset;
+
+    UINT_64 macroIndex;
+    UINT_32 microIndex;
+
+    UINT_64 macroNumber;
+    UINT_32 microNumber;
+
+    UINT_32 macroX;
+    UINT_32 macroY;
+    UINT_32 macroZ;
+
+    UINT_32 microX;
+    UINT_32 microY;
+
+    UINT_32 tilesPerMacro;
+    UINT_32 macrosPerPitch;
+    UINT_32 macrosPerSlice;
+
+    //
+    // Extract pipe.
+    //
+    numPipes = HwlGetPipes(pTileInfo);
+    pipe = ComputePipeFromAddr(addr, numPipes);
+
+    //
+    // Compute the number of group and pipe bits.
+    //
+    numPipeBits  = Log2(numPipes);
+
+    UINT_32 groupBits = 8 * m_pipeInterleaveBytes;
+    UINT_32 pipes = numPipes;
+
+
+    //
+    // Compute the micro tile size, in bits. And macro tile pitch and height.
+    //
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        elemBits = CmaskElemBits;
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &pitchAligned,
+                         &heightAligned,
+                         &totalBytes,
+                         &macroTilePitch,
+                         &macroTileHeight);
+    }
+    else  //HTILE
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        if (factor != 1)
+        {
+            factor = 1;
+        }
+
+        elemBits = HwlComputeHtileBpp(isWidth8, isHeight8);
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         isWidth8,
+                         isHeight8,
+                         pTileInfo,
+                         &pitchAligned,
+                         &heightAligned,
+                         &totalBytes,
+                         &macroTilePitch,
+                         &macroTileHeight);
+    }
+
+    // Should use aligned dims
+    //
+    pitch = pitchAligned;
+    height = heightAligned;
+
+
+    //
+    // Convert byte address to bit address.
+    //
+    bitAddr = BYTES_TO_BITS(addr) + bitPosition;
+
+
+    //
+    // Remove pipe bits from address.
+    //
+
+    bitAddr = (bitAddr % groupBits) + ((bitAddr/groupBits/pipes)*groupBits);
+
+
+    elemOffset = bitAddr / elemBits;
+
+    tilesPerMacro = (macroTilePitch/factor) * macroTileHeight / MicroTilePixels >> numPipeBits;
+
+    macrosPerPitch = pitch / (macroTilePitch/factor);
+    macrosPerSlice = macrosPerPitch * height / macroTileHeight;
+
+    macroIndex = elemOffset / factor / tilesPerMacro;
+    microIndex = static_cast<UINT_32>(elemOffset % (tilesPerMacro * factor));
+
+    macroNumber = macroIndex * factor + microIndex % factor;
+    microNumber = microIndex / factor;
+
+    macroX = static_cast<UINT_32>((macroNumber % macrosPerPitch));
+    macroY = static_cast<UINT_32>((macroNumber % macrosPerSlice) / macrosPerPitch);
+    macroZ = static_cast<UINT_32>((macroNumber / macrosPerSlice));
+
+
+    microX = microNumber % (macroTilePitch / factor / MicroTileWidth);
+    microY = (microNumber / (macroTilePitch / factor / MicroTileHeight));
+
+    *pX = macroX * (macroTilePitch/factor) + microX * MicroTileWidth;
+    *pY = macroY * macroTileHeight + (microY * MicroTileHeight << numPipeBits);
+    *pSlice = macroZ;
+
+    microTileCoordY = ComputeXmaskCoordYFromPipe(pipe,
+                                                 *pX/MicroTileWidth);
+
+
+    //
+    // Assemble final coordinates.
+    //
+    *pY += microTileCoordY * MicroTileHeight;
+
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeXmaskAddrFromCoord
+*
+*   @brief
+*       Compute the address from an address of cmask (prior to si)
+*
+*   @return
+*       Address in bytes
+*
+***************************************************************************************************
+*/
+UINT_64 AddrLib::HwlComputeXmaskAddrFromCoord(
+    UINT_32        pitch,          ///< [in] pitch
+    UINT_32        height,         ///< [in] height
+    UINT_32        x,              ///< [in] x coord
+    UINT_32        y,              ///< [in] y coord
+    UINT_32        slice,          ///< [in] slice/depth index
+    UINT_32        numSlices,      ///< [in] number of slices
+    UINT_32        factor,         ///< [in] factor that indicates cmask(2) or htile(1)
+    BOOL_32        isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32        isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    BOOL_32        isHeight8,      ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    ADDR_TILEINFO* pTileInfo,      ///< [in] Tile info
+    UINT_32*       pBitPosition    ///< [out] bit position inside a byte
+    ) const
+{
+    UINT_64 addr;
+    UINT_32 numGroupBits;
+    UINT_32 numPipeBits;
+    UINT_32 newPitch = 0;
+    UINT_32 newHeight = 0;
+    UINT_64 sliceBytes = 0;
+    UINT_64 totalBytes = 0;
+    UINT_64 sliceOffset;
+    UINT_32 pipe;
+    UINT_32 macroTileWidth;
+    UINT_32 macroTileHeight;
+    UINT_32 macroTilesPerRow;
+    UINT_32 macroTileBytes;
+    UINT_32 macroTileIndexX;
+    UINT_32 macroTileIndexY;
+    UINT_64 macroTileOffset;
+    UINT_32 pixelBytesPerRow;
+    UINT_32 pixelOffsetX;
+    UINT_32 pixelOffsetY;
+    UINT_32 pixelOffset;
+    UINT_64 totalOffset;
+    UINT_64 offsetLo;
+    UINT_64 offsetHi;
+    UINT_64 groupMask;
+
+
+    UINT_32 elemBits = 0;
+
+    UINT_32 numPipes = m_pipes; // This function is accessed prior to si only
+
+    if (factor == 2) //CMASK
+    {
+        elemBits = CmaskElemBits;
+
+        // For asics before SI, cmask is always tiled
+        isLinear = FALSE;
+    }
+    else //HTILE
+    {
+        if (factor != 1) // Fix compile warning
+        {
+            factor = 1;
+        }
+
+        elemBits = HwlComputeHtileBpp(isWidth8, isHeight8);
+    }
+
+    //
+    // Compute the number of group bits and pipe bits.
+    //
+    numGroupBits = Log2(m_pipeInterleaveBytes);
+    numPipeBits  = Log2(numPipes);
+
+    //
+    // Compute macro tile dimensions.
+    //
+    if (factor == 2) // CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroTileWidth,
+                         &macroTileHeight);
+
+        sliceBytes = totalBytes / numSlices;
+    }
+    else // HTILE
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         isWidth8,
+                         isHeight8,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroTileWidth,
+                         &macroTileHeight,
+                         &sliceBytes);
+    }
+
+    sliceOffset = slice * sliceBytes;
+
+    //
+    // Get the pipe.  Note that neither slice rotation nor pipe swizzling apply for CMASK.
+    //
+    pipe = ComputePipeFromCoord(x,
+                                y,
+                                0,
+                                ADDR_TM_2D_TILED_THIN1,
+                                0,
+                                FALSE,
+                                pTileInfo);
+
+    //
+    // Compute the number of macro tiles per row.
+    //
+    macroTilesPerRow = newPitch / macroTileWidth;
+
+    //
+    // Compute the number of bytes per macro tile.
+    //
+    macroTileBytes = BITS_TO_BYTES((macroTileWidth * macroTileHeight * elemBits) / MicroTilePixels);
+
+    //
+    // Compute the offset to the macro tile containing the specified coordinate.
+    //
+    macroTileIndexX = x / macroTileWidth;
+    macroTileIndexY = y / macroTileHeight;
+    macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
+
+    //
+    // Compute the pixel offset within the macro tile.
+    //
+    pixelBytesPerRow = BITS_TO_BYTES(macroTileWidth * elemBits) / MicroTileWidth;
+
+    //
+    // The nibbles are interleaved (see below), so the part of the offset relative to the x
+    // coordinate repeats halfway across the row. (Not for HTILE)
+    //
+    if (factor == 2)
+    {
+        pixelOffsetX = (x % (macroTileWidth / 2)) / MicroTileWidth;
+    }
+    else
+    {
+        pixelOffsetX = (x % (macroTileWidth)) / MicroTileWidth * BITS_TO_BYTES(elemBits);
+    }
+
+    //
+    // Compute the y offset within the macro tile.
+    //
+    pixelOffsetY = (((y % macroTileHeight) / MicroTileHeight) / numPipes) * pixelBytesPerRow;
+
+    pixelOffset = pixelOffsetX + pixelOffsetY;
+
+    //
+    // Combine the slice offset and macro tile offset with the pixel offset, accounting for the
+    // pipe bits in the middle of the address.
+    //
+    totalOffset = ((sliceOffset + macroTileOffset) >> numPipeBits) + pixelOffset;
+
+    //
+    // Split the offset to put some bits below the pipe bits and some above.
+    //
+    groupMask = (1 << numGroupBits) - 1;
+    offsetLo  = totalOffset &  groupMask;
+    offsetHi  = (totalOffset & ~groupMask) << numPipeBits;
+
+    //
+    // Assemble the address from its components.
+    //
+    addr  = offsetLo;
+    addr |= offsetHi;
+    // This is to remove warning with /analyze option
+    UINT_32 pipeBits = pipe << numGroupBits;
+    addr |= pipeBits;
+
+    //
+    // Compute the bit position.  The lower nibble is used when the x coordinate within the macro
+    // tile is less than half of the macro tile width, and the upper nibble is used when the x
+    // coordinate within the macro tile is greater than or equal to half the macro tile width.
+    //
+    *pBitPosition = ((x % macroTileWidth) < (macroTileWidth / factor)) ? 0 : 4;
+
+    return addr;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Surface Addressing Shared
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceAddrFromCoordLinear
+*
+*   @brief
+*       Compute address from coord for linear surface
+*
+*   @return
+*       Address in bytes
+*
+***************************************************************************************************
+*/
+UINT_64 AddrLib::ComputeSurfaceAddrFromCoordLinear(
+    UINT_32  x,              ///< [in] x coord
+    UINT_32  y,              ///< [in] y coord
+    UINT_32  slice,          ///< [in] slice/depth index
+    UINT_32  sample,         ///< [in] sample index
+    UINT_32  bpp,            ///< [in] bits per pixel
+    UINT_32  pitch,          ///< [in] pitch
+    UINT_32  height,         ///< [in] height
+    UINT_32  numSlices,      ///< [in] number of slices
+    UINT_32* pBitPosition    ///< [out] bit position inside a byte
+    ) const
+{
+    const UINT_64 sliceSize = static_cast<UINT_64>(pitch) * height;
+
+    UINT_64 sliceOffset = (slice + sample * numSlices)* sliceSize;
+    UINT_64 rowOffset   = static_cast<UINT_64>(y) * pitch;
+    UINT_64 pixOffset   = x;
+
+    UINT_64 addr = (sliceOffset + rowOffset + pixOffset) * bpp;
+
+    *pBitPosition = static_cast<UINT_32>(addr % 8);
+    addr /= 8;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddrLinear
+*
+*   @brief
+*       Compute the coord from an address of a linear surface
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeSurfaceCoordFromAddrLinear(
+    UINT_64  addr,           ///< [in] address
+    UINT_32  bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32  bpp,            ///< [in] bits per pixel
+    UINT_32  pitch,          ///< [in] pitch
+    UINT_32  height,         ///< [in] height
+    UINT_32  numSlices,      ///< [in] number of slices
+    UINT_32* pX,             ///< [out] x coord
+    UINT_32* pY,             ///< [out] y coord
+    UINT_32* pSlice,         ///< [out] slice/depth index
+    UINT_32* pSample         ///< [out] sample index
+    ) const
+{
+    const UINT_64 sliceSize = static_cast<UINT_64>(pitch) * height;
+    const UINT_64 linearOffset = (BYTES_TO_BITS(addr) + bitPosition) / bpp;
+
+    *pX = static_cast<UINT_32>((linearOffset % sliceSize) % pitch);
+    *pY = static_cast<UINT_32>((linearOffset % sliceSize) / pitch % height);
+    *pSlice  = static_cast<UINT_32>((linearOffset / sliceSize) % numSlices);
+    *pSample = static_cast<UINT_32>((linearOffset / sliceSize) / numSlices);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddrMicroTiled
+*
+*   @brief
+*       Compute the coord from an address of a micro tiled surface
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeSurfaceCoordFromAddrMicroTiled(
+    UINT_64         addr,               ///< [in] address
+    UINT_32         bitPosition,        ///< [in] bitPosition in a byte
+    UINT_32         bpp,                ///< [in] bits per pixel
+    UINT_32         pitch,              ///< [in] pitch
+    UINT_32         height,             ///< [in] height
+    UINT_32         numSamples,         ///< [in] number of samples
+    AddrTileMode    tileMode,           ///< [in] tile mode
+    UINT_32         tileBase,           ///< [in] base offset within a tile
+    UINT_32         compBits,           ///< [in] component bits actually needed(for planar surface)
+    UINT_32*        pX,                 ///< [out] x coord
+    UINT_32*        pY,                 ///< [out] y coord
+    UINT_32*        pSlice,             ///< [out] slice/depth index
+    UINT_32*        pSample,            ///< [out] sample index,
+    AddrTileType    microTileType,      ///< [in] micro tiling order
+    BOOL_32         isDepthSampleOrder  ///< [in] TRUE if in depth sample order
+    ) const
+{
+    UINT_64 bitAddr;
+    UINT_32 microTileThickness;
+    UINT_32 microTileBits;
+    UINT_64 sliceBits;
+    UINT_64 rowBits;
+    UINT_32 sliceIndex;
+    UINT_32 microTileCoordX;
+    UINT_32 microTileCoordY;
+    UINT_32 pixelOffset;
+    UINT_32 pixelCoordX = 0;
+    UINT_32 pixelCoordY = 0;
+    UINT_32 pixelCoordZ = 0;
+    UINT_32 pixelCoordS = 0;
+
+    //
+    // Convert byte address to bit address.
+    //
+    bitAddr = BYTES_TO_BITS(addr) + bitPosition;
+
+    //
+    // Compute the micro tile size, in bits.
+    //
+    switch (tileMode)
+    {
+        case ADDR_TM_1D_TILED_THICK:
+            microTileThickness = ThickTileThickness;
+            break;
+        default:
+            microTileThickness = 1;
+            break;
+    }
+
+    microTileBits = MicroTilePixels * microTileThickness * bpp * numSamples;
+
+    //
+    // Compute number of bits per slice and number of bits per row of micro tiles.
+    //
+    sliceBits = static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples;
+
+    rowBits   = (pitch / MicroTileWidth) * microTileBits;
+
+    //
+    // Extract the slice index.
+    //
+    sliceIndex = static_cast<UINT_32>(bitAddr / sliceBits);
+    bitAddr -= sliceIndex * sliceBits;
+
+    //
+    // Extract the y coordinate of the micro tile.
+    //
+    microTileCoordY = static_cast<UINT_32>(bitAddr / rowBits) * MicroTileHeight;
+    bitAddr -= (microTileCoordY / MicroTileHeight) * rowBits;
+
+    //
+    // Extract the x coordinate of the micro tile.
+    //
+    microTileCoordX = static_cast<UINT_32>(bitAddr / microTileBits) * MicroTileWidth;
+
+    //
+    // Compute the pixel offset within the micro tile.
+    //
+    pixelOffset = static_cast<UINT_32>(bitAddr % microTileBits);
+
+    //
+    // Extract pixel coordinates from the offset.
+    //
+    HwlComputePixelCoordFromOffset(pixelOffset,
+                                   bpp,
+                                   numSamples,
+                                   tileMode,
+                                   tileBase,
+                                   compBits,
+                                   &pixelCoordX,
+                                   &pixelCoordY,
+                                   &pixelCoordZ,
+                                   &pixelCoordS,
+                                   microTileType,
+                                   isDepthSampleOrder);
+
+    //
+    // Assemble final coordinates.
+    //
+    *pX     = microTileCoordX + pixelCoordX;
+    *pY     = microTileCoordY + pixelCoordY;
+    *pSlice = (sliceIndex * microTileThickness) + pixelCoordZ;
+    *pSample = pixelCoordS;
+
+    if (microTileThickness > 1)
+    {
+        *pSample = 0;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePipeFromAddr
+*
+*   @brief
+*       Compute the pipe number from an address
+*
+*   @return
+*       Pipe number
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputePipeFromAddr(
+    UINT_64 addr,        ///< [in] address
+    UINT_32 numPipes     ///< [in] number of banks
+    ) const
+{
+    UINT_32 pipe;
+
+    UINT_32 groupBytes = m_pipeInterleaveBytes; //just different terms
+
+    // R600
+    // The LSBs of the address are arranged as follows:
+    //   bank | pipe | group
+    //
+    // To get the pipe number, shift off the group bits and mask the pipe bits.
+    //
+
+    // R800
+    // The LSBs of the address are arranged as follows:
+    //   bank | bankInterleave | pipe | pipeInterleave
+    //
+    // To get the pipe number, shift off the pipe interleave bits and mask the pipe bits.
+    //
+
+    pipe = static_cast<UINT_32>(addr >> Log2(groupBytes)) & (numPipes - 1);
+
+    return pipe;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePixelIndexWithinMicroTile
+*
+*   @brief
+*       Compute the pixel index inside a micro tile of surface
+*
+*   @return
+*       Pixel index
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputePixelIndexWithinMicroTile(
+    UINT_32         x,              ///< [in] x coord
+    UINT_32         y,              ///< [in] y coord
+    UINT_32         z,              ///< [in] slice/depth index
+    UINT_32         bpp,            ///< [in] bits per pixel
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    AddrTileType    microTileType   ///< [in] pixel order in display/non-display mode
+    ) const
+{
+    UINT_32 pixelBit0 = 0;
+    UINT_32 pixelBit1 = 0;
+    UINT_32 pixelBit2 = 0;
+    UINT_32 pixelBit3 = 0;
+    UINT_32 pixelBit4 = 0;
+    UINT_32 pixelBit5 = 0;
+    UINT_32 pixelBit6 = 0;
+    UINT_32 pixelBit7 = 0;
+    UINT_32 pixelBit8 = 0;
+    UINT_32 pixelNumber;
+
+    UINT_32 x0 = _BIT(x, 0);
+    UINT_32 x1 = _BIT(x, 1);
+    UINT_32 x2 = _BIT(x, 2);
+    UINT_32 y0 = _BIT(y, 0);
+    UINT_32 y1 = _BIT(y, 1);
+    UINT_32 y2 = _BIT(y, 2);
+    UINT_32 z0 = _BIT(z, 0);
+    UINT_32 z1 = _BIT(z, 1);
+    UINT_32 z2 = _BIT(z, 2);
+
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    // Compute the pixel number within the micro tile.
+
+    if (microTileType != ADDR_THICK)
+    {
+        if (microTileType == ADDR_DISPLAYABLE)
+        {
+            switch (bpp)
+            {
+                case 8:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = x2;
+                    pixelBit3 = y1;
+                    pixelBit4 = y0;
+                    pixelBit5 = y2;
+                    break;
+                case 16:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = x2;
+                    pixelBit3 = y0;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 32:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = y0;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 64:
+                    pixelBit0 = x0;
+                    pixelBit1 = y0;
+                    pixelBit2 = x1;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 128:
+                    pixelBit0 = y0;
+                    pixelBit1 = x0;
+                    pixelBit2 = x1;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+        else if (microTileType == ADDR_NON_DISPLAYABLE || microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {
+            pixelBit0 = x0;
+            pixelBit1 = y0;
+            pixelBit2 = x1;
+            pixelBit3 = y1;
+            pixelBit4 = x2;
+            pixelBit5 = y2;
+        }
+        else if (microTileType == ADDR_ROTATED)
+        {
+            ADDR_ASSERT(thickness == 1);
+
+            switch (bpp)
+            {
+                case 8:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = y2;
+                    pixelBit3 = x1;
+                    pixelBit4 = x0;
+                    pixelBit5 = x2;
+                    break;
+                case 16:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = y2;
+                    pixelBit3 = x0;
+                    pixelBit4 = x1;
+                    pixelBit5 = x2;
+                    break;
+                case 32:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = x0;
+                    pixelBit3 = y2;
+                    pixelBit4 = x1;
+                    pixelBit5 = x2;
+                    break;
+                case 64:
+                    pixelBit0 = y0;
+                    pixelBit1 = x0;
+                    pixelBit2 = y1;
+                    pixelBit3 = x1;
+                    pixelBit4 = x2;
+                    pixelBit5 = y2;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+
+        if (thickness > 1)
+        {
+            pixelBit6 = z0;
+            pixelBit7 = z1;
+        }
+    }
+    else // ADDR_THICK
+    {
+        ADDR_ASSERT(thickness > 1);
+
+        switch (bpp)
+        {
+            case 8:
+            case 16:
+                pixelBit0 = x0;
+                pixelBit1 = y0;
+                pixelBit2 = x1;
+                pixelBit3 = y1;
+                pixelBit4 = z0;
+                pixelBit5 = z1;
+                break;
+            case 32:
+                pixelBit0 = x0;
+                pixelBit1 = y0;
+                pixelBit2 = x1;
+                pixelBit3 = z0;
+                pixelBit4 = y1;
+                pixelBit5 = z1;
+                break;
+            case 64:
+            case 128:
+                pixelBit0 = y0;
+                pixelBit1 = x0;
+                pixelBit2 = z0;
+                pixelBit3 = x1;
+                pixelBit4 = y1;
+                pixelBit5 = z1;
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        pixelBit6 = x2;
+        pixelBit7 = y2;
+    }
+
+    if (thickness == 8)
+    {
+        pixelBit8 = z2;
+    }
+
+    pixelNumber = ((pixelBit0     ) |
+                   (pixelBit1 << 1) |
+                   (pixelBit2 << 2) |
+                   (pixelBit3 << 3) |
+                   (pixelBit4 << 4) |
+                   (pixelBit5 << 5) |
+                   (pixelBit6 << 6) |
+                   (pixelBit7 << 7) |
+                   (pixelBit8 << 8));
+
+    return pixelNumber;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::AdjustPitchAlignment
+*
+*   @brief
+*       Adjusts pitch alignment for flipping surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrLib::AdjustPitchAlignment(
+    ADDR_SURFACE_FLAGS  flags,      ///< [in] Surface flags
+    UINT_32*            pPitchAlign ///< [out] Pointer to pitch alignment
+    ) const
+{
+    // Display engine hardwires lower 5 bit of GRPH_PITCH to ZERO which means 32 pixel alignment
+    // Maybe it will be fixed in future but let's make it general for now.
+    if (flags.display || flags.overlay)
+    {
+        *pPitchAlign = PowTwoAlign(*pPitchAlign, 32);
+
+        if(flags.display)
+        {
+            *pPitchAlign = Max(m_minPitchAlignPixels, *pPitchAlign);
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::PadDimensions
+*
+*   @brief
+*       Helper function to pad dimensions
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrLib::PadDimensions(
+    AddrTileMode        tileMode,    ///< [in] tile mode
+    UINT_32             bpp,         ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,       ///< [in] surface flags
+    UINT_32             numSamples,  ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,   ///< [in/out] bank structure.
+    UINT_32             padDims,     ///< [in] Dimensions to pad valid value 1,2,3
+    UINT_32             mipLevel,    ///< [in] MipLevel
+    UINT_32*            pPitch,      ///< [in/out] pitch in pixels
+    UINT_32             pitchAlign,  ///< [in] pitch alignment
+    UINT_32*            pHeight,     ///< [in/out] height in pixels
+    UINT_32             heightAlign, ///< [in] height alignment
+    UINT_32*            pSlices,     ///< [in/out] number of slices
+    UINT_32             sliceAlign   ///< [in] number of slice alignment
+    ) const
+{
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    ADDR_ASSERT(padDims <= 3);
+
+    //
+    // Override padding for mip levels
+    //
+    if (mipLevel > 0)
+    {
+        if (flags.cube)
+        {
+            // for cubemap, we only pad when client call with 6 faces as an identity
+            if (*pSlices > 1)
+            {
+                padDims = 3; // we should pad cubemap sub levels when we treat it as 3d texture
+            }
+            else
+            {
+                padDims = 2;
+            }
+        }
+    }
+
+    // Any possibilities that padDims is 0?
+    if (padDims == 0)
+    {
+        padDims = 3;
+    }
+
+    if (IsPow2(pitchAlign))
+    {
+        *pPitch = PowTwoAlign((*pPitch), pitchAlign);
+    }
+    else // add this code to pass unit test, r600 linear mode is not align bpp to pow2 for linear
+    {
+        *pPitch += pitchAlign - 1;
+        *pPitch /= pitchAlign;
+        *pPitch *= pitchAlign;
+    }
+
+    if (padDims > 1)
+    {
+        *pHeight = PowTwoAlign((*pHeight), heightAlign);
+    }
+
+    if (padDims > 2 || thickness > 1)
+    {
+        // for cubemap single face, we do not pad slices.
+        // if we pad it, the slice number should be set to 6 and current mip level > 1
+        if (flags.cube && (!m_configFlags.noCubeMipSlicesPad || flags.cubeAsArray))
+        {
+            *pSlices = NextPow2(*pSlices);
+        }
+
+        // normal 3D texture or arrays or cubemap has a thick mode? (Just pass unit test)
+        if (thickness > 1)
+        {
+            *pSlices = PowTwoAlign((*pSlices), sliceAlign);
+        }
+
+    }
+
+    HwlPadDimensions(tileMode,
+                     bpp,
+                     flags,
+                     numSamples,
+                     pTileInfo,
+                     padDims,
+                     mipLevel,
+                     pPitch,
+                     pitchAlign,
+                     pHeight,
+                     heightAlign,
+                     pSlices,
+                     sliceAlign);
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlPreHandleBaseLvl3xPitch
+*
+*   @brief
+*       Pre-handler of 3x pitch (96 bit) adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlPreHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    ADDR_ASSERT(pIn->width == expPitch);
+    //
+    // If pitch is pre-multiplied by 3, we retrieve original one here to get correct miplevel size
+    //
+    if (AddrElemLib::IsExpand3x(pIn->format) &&
+        pIn->mipLevel == 0 &&
+        pIn->tileMode == ADDR_TM_LINEAR_ALIGNED)
+    {
+        expPitch /= 3;
+        expPitch = NextPow2(expPitch);
+    }
+
+    return expPitch;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlPostHandleBaseLvl3xPitch
+*
+*   @brief
+*       Post-handler of 3x pitch adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlPostHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    //
+    // 96 bits surface of sub levels require element pitch of 32 bits instead
+    // So we just return pitch in 32 bit pixels without timing 3
+    //
+    if (AddrElemLib::IsExpand3x(pIn->format) &&
+        pIn->mipLevel == 0 &&
+        pIn->tileMode == ADDR_TM_LINEAR_ALIGNED)
+    {
+        expPitch *= 3;
+    }
+
+    return expPitch;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMacroTiled
+*
+*   @brief
+*       Check if the tile mode is macro tiled
+*
+*   @return
+*       TRUE if it is macro tiled (2D/2B/3D/3B)
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMacroTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode
+{
+   return m_modeFlags[tileMode].isMacro;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMacro3dTiled
+*
+*   @brief
+*       Check if the tile mode is 3D macro tiled
+*
+*   @return
+*       TRUE if it is 3D macro tiled
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMacro3dTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode
+{
+    return m_modeFlags[tileMode].isMacro3d;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMicroTiled
+*
+*   @brief
+*       Check if the tile mode is micro tiled
+*
+*   @return
+*       TRUE if micro tiled
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMicroTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode
+{
+    return m_modeFlags[tileMode].isMicro;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsLinear
+*
+*   @brief
+*       Check if the tile mode is linear
+*
+*   @return
+*       TRUE if linear
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsLinear(
+    AddrTileMode tileMode)  ///< [in] tile mode
+{
+    return m_modeFlags[tileMode].isLinear;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsPrtNoRotationTileMode
+*
+*   @brief
+*       Return TRUE if it is prt tile without rotation
+*   @note
+*       This function just used by CI
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsPrtNoRotationTileMode(
+    AddrTileMode tileMode)
+{
+    return m_modeFlags[tileMode].isPrtNoRotation;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsPrtTileMode
+*
+*   @brief
+*       Return TRUE if it is prt tile
+*   @note
+*       This function just used by CI
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsPrtTileMode(
+    AddrTileMode tileMode)
+{
+    return m_modeFlags[tileMode].isPrt;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::Bits2Number
+*
+*   @brief
+*       Cat a array of binary bit to a number
+*
+*   @return
+*       The number combined with the array of bits
+***************************************************************************************************
+*/
+UINT_32 AddrLib::Bits2Number(
+    UINT_32 bitNum,     ///< [in] how many bits
+    ...)                ///< [in] varaible bits value starting from MSB
+{
+    UINT_32 number = 0;
+    UINT_32 i;
+    va_list bits_ptr;
+
+    va_start(bits_ptr, bitNum);
+
+    for(i = 0; i < bitNum; i++)
+    {
+        number |= va_arg(bits_ptr, UINT_32);
+        number <<= 1;
+    }
+
+    number>>=1;
+
+    va_end(bits_ptr);
+
+    return number;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeMipLevel
+*
+*   @brief
+*       Compute mipmap level width/height/slices
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn ///< [in/out] Input structure
+    ) const
+{
+    if (AddrElemLib::IsBlockCompressed(pIn->format))
+    {
+        if (pIn->mipLevel == 0)
+        {
+            // DXTn's level 0 must be multiple of 4
+            // But there are exceptions:
+            // 1. Internal surface creation in hostblt/vsblt/etc...
+            // 2. Runtime doesn't reject ATI1/ATI2 whose width/height are not multiple of 4
+            pIn->width = PowTwoAlign(pIn->width, 4);
+            pIn->height = PowTwoAlign(pIn->height, 4);
+        }
+    }
+
+    HwlComputeMipLevel(pIn);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::DegradeBaseLevel
+*
+*   @brief
+*       Check if base level's tile mode can be degraded
+*   @return
+*       TRUE if degraded, also returns degraded tile mode (unchanged if not degraded)
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::DegradeBaseLevel(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure for surface info
+    AddrTileMode*                           pTileMode   ///< [out] Degraded tile mode
+    ) const
+{
+    BOOL_32 degraded = FALSE;
+    AddrTileMode tileMode = pIn->tileMode;
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    if (m_configFlags.degradeBaseLevel) // This is a global setting
+    {
+        if (pIn->flags.degrade4Space        && // Degradation per surface
+            pIn->mipLevel == 0              &&
+            pIn->numSamples == 1            &&
+            IsMacroTiled(tileMode))
+        {
+            if (HwlDegradeBaseLevel(pIn))
+            {
+                *pTileMode = thickness == 1 ? ADDR_TM_1D_TILED_THIN1 : ADDR_TM_1D_TILED_THICK;
+                degraded = TRUE;
+            }
+            else if (thickness > 1)
+            {
+                // As in the following HwlComputeSurfaceInfo, thick modes may be degraded to
+                // thinner modes, we should re-evaluate whether the corresponding thinner modes
+                // need to be degraded. If so, we choose 1D thick mode instead.
+                tileMode = DegradeLargeThickTile(pIn->tileMode, pIn->bpp);
+                if (tileMode != pIn->tileMode)
+                {
+                    ADDR_COMPUTE_SURFACE_INFO_INPUT input = *pIn;
+                    input.tileMode = tileMode;
+                    if (HwlDegradeBaseLevel(&input))
+                    {
+                        *pTileMode = ADDR_TM_1D_TILED_THICK;
+                        degraded = TRUE;
+                    }
+                }
+            }
+        }
+    }
+
+    return degraded;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::DegradeLargeThickTile
+*
+*   @brief
+*       Check if the thickness needs to be reduced if a tile is too large
+*   @return
+*       The degraded tile mode (unchanged if not degraded)
+***************************************************************************************************
+*/
+AddrTileMode AddrLib::DegradeLargeThickTile(
+    AddrTileMode tileMode,
+    UINT_32 bpp) const
+{
+    // Override tilemode
+    // When tile_width (8) * tile_height (8) * thickness * element_bytes is > row_size,
+    // it is better to just use THIN mode in this case
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    if (thickness > 1 && m_configFlags.allowLargeThickTile == 0)
+    {
+        UINT_32 tileSize = MicroTilePixels * thickness * (bpp >> 3);
+
+        if (tileSize > m_rowSize)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_2D_TILED_XTHICK:
+                    if ((tileSize >> 1) <= m_rowSize)
+                    {
+                        tileMode = ADDR_TM_2D_TILED_THICK;
+                        break;
+                    }
+                    // else fall through
+                case ADDR_TM_2D_TILED_THICK:
+                    tileMode    = ADDR_TM_2D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_3D_TILED_XTHICK:
+                    if ((tileSize >> 1) <= m_rowSize)
+                    {
+                        tileMode = ADDR_TM_3D_TILED_THICK;
+                        break;
+                    }
+                    // else fall through
+                case ADDR_TM_3D_TILED_THICK:
+                    tileMode    = ADDR_TM_3D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_2D_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_2D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_3D_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_3D_TILED_THIN1;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+
+    return tileMode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::PostComputeMipLevel
+*   @brief
+*       Compute MipLevel info (including level 0) after surface adjustment
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::PostComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*    pIn,   ///< [in/out] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut   ///< [out] Output structure
+    ) const
+{
+    // Mipmap including level 0 must be pow2 padded since either SI hw expects so or it is
+    // required by CFX  for Hw Compatibility between NI and SI. Otherwise it is only needed for
+    // mipLevel > 0. Any h/w has different requirement should implement its own virtual function
+
+    if (pIn->flags.pow2Pad)
+    {
+        pIn->width      = NextPow2(pIn->width);
+        pIn->height     = NextPow2(pIn->height);
+        pIn->numSlices  = NextPow2(pIn->numSlices);
+    }
+    else if (pIn->mipLevel > 0)
+    {
+        pIn->width      = NextPow2(pIn->width);
+        pIn->height     = NextPow2(pIn->height);
+
+        if (!pIn->flags.cube)
+        {
+            pIn->numSlices = NextPow2(pIn->numSlices);
+        }
+
+        // for cubemap, we keep its value at first
+    }
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting.
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::HwlSetupTileCfg(
+    INT_32          index,            ///< [in] Tile index
+    INT_32          macroModeIndex,   ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,            ///< [out] Tile Info
+    AddrTileMode*   pMode,            ///< [out] Tile mode
+    AddrTileType*   pType             ///< [out] Tile type
+    ) const
+{
+    return ADDR_NOTSUPPORTED;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlGetPipes
+*
+*   @brief
+*       Get number pipes
+*   @return
+*       num pipes
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlGetPipes(
+    const ADDR_TILEINFO* pTileInfo    ///< [in] Tile info
+    ) const
+{
+    //pTileInfo can be NULL when asic is 6xx and 8xx.
+    return m_pipes;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeQbStereoInfo
+*
+*   @brief
+*       Get quad buffer stereo information
+*   @return
+*       TRUE if no error
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::ComputeQbStereoInfo(
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [in/out] updated pOut+pStereoInfo
+    ) const
+{
+    BOOL_32 success = FALSE;
+
+    if (pOut->pStereoInfo)
+    {
+        ADDR_ASSERT(pOut->bpp >= 8);
+        ADDR_ASSERT((pOut->surfSize % pOut->baseAlign) == 0);
+
+        // Save original height
+        pOut->pStereoInfo->eyeHeight = pOut->height;
+
+        // Right offset
+        pOut->pStereoInfo->rightOffset = static_cast<UINT_32>(pOut->surfSize);
+
+        pOut->pStereoInfo->rightSwizzle = HwlComputeQbStereoRightSwizzle(pOut);
+        // Double height
+        pOut->height <<= 1;
+        pOut->pixelHeight <<= 1;
+
+        // Double size
+        pOut->surfSize <<= 1;
+
+        // Right start address meets the base align since it is guaranteed by AddrLib
+
+        // 1D surface on SI may break this rule, but we can force it to meet by checking .qbStereo.
+        success = TRUE;
+    }
+
+    return success;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Element lib
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+***************************************************************************************************
+*   AddrLib::Flt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Flt32ToDepthPixel(
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ELEM_FLT32TODEPTHPIXEL_INPUT)) ||
+            (pOut->size != sizeof(ELEM_FLT32TODEPTHPIXEL_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        GetElemLib()->Flt32ToDepthPixel(pIn->format,
+                                        pIn->comps,
+                                        pOut->pPixel);
+        UINT_32 depthBase = 0;
+        UINT_32 stencilBase = 0;
+        UINT_32 depthBits = 0;
+        UINT_32 stencilBits = 0;
+
+        switch (pIn->format)
+        {
+            case ADDR_DEPTH_16:
+                depthBits = 16;
+                break;
+            case ADDR_DEPTH_X8_24:
+            case ADDR_DEPTH_8_24:
+            case ADDR_DEPTH_X8_24_FLOAT:
+            case ADDR_DEPTH_8_24_FLOAT:
+                depthBase = 8;
+                depthBits = 24;
+                stencilBits = 8;
+                break;
+            case ADDR_DEPTH_32_FLOAT:
+                depthBits = 32;
+                break;
+            case ADDR_DEPTH_X24_8_32_FLOAT:
+                depthBase = 8;
+                depthBits = 32;
+                stencilBits = 8;
+                break;
+            default:
+                break;
+        }
+
+        // Overwrite base since R800 has no "tileBase"
+        if (GetElemLib()->IsDepthStencilTilePlanar() == FALSE)
+        {
+            depthBase = 0;
+            stencilBase = 0;
+        }
+
+        depthBase *= 64;
+        stencilBase *= 64;
+
+        pOut->stencilBase = stencilBase;
+        pOut->depthBase = depthBase;
+        pOut->depthBits = depthBits;
+        pOut->stencilBits = stencilBits;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::Flt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Flt32ToColorPixel(
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ELEM_FLT32TOCOLORPIXEL_INPUT)) ||
+            (pOut->size != sizeof(ELEM_FLT32TOCOLORPIXEL_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        GetElemLib()->Flt32ToColorPixel(pIn->format,
+                                        pIn->surfNum,
+                                        pIn->surfSwap,
+                                        pIn->comps,
+                                        pOut->pPixel);
+    }
+
+    return returnCode;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::GetExportNorm
+*
+*   @brief
+*       Check one format can be EXPORT_NUM
+*   @return
+*       TRUE if EXPORT_NORM can be used
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::GetExportNorm(
+    const ELEM_GETEXPORTNORM_INPUT* pIn) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    BOOL_32 enabled = FALSE;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if (pIn->size != sizeof(ELEM_GETEXPORTNORM_INPUT))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        enabled = GetElemLib()->PixGetExportNorm(pIn->format,
+                                                 pIn->num,
+                                                 pIn->swap);
+    }
+
+    return enabled;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePrtInfo
+*
+*   @brief
+*       Compute prt surface related info
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputePrtInfo(
+    const ADDR_PRT_INFO_INPUT*  pIn,
+    ADDR_PRT_INFO_OUTPUT*       pOut) const
+{
+    ADDR_ASSERT(pOut != NULL);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    UINT_32     expandX = 1;
+    UINT_32     expandY = 1;
+    AddrElemMode elemMode;
+
+    UINT_32     bpp = GetElemLib()->GetBitsPerPixel(pIn->format,
+                                                &elemMode,
+                                                &expandX,
+                                                &expandY);
+
+    if (bpp <8 || bpp == 24 || bpp == 48 || bpp == 96 )
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    UINT_32     numFrags = pIn->numFrags;
+    ADDR_ASSERT(numFrags <= 8);
+
+    UINT_32     tileWidth = 0;
+    UINT_32     tileHeight = 0;
+    if (returnCode == ADDR_OK)
+    {
+        // 3D texture without depth or 2d texture
+        if (pIn->baseMipDepth > 1 || pIn->baseMipHeight > 1)
+        {
+            if (bpp == 8)
+            {
+                tileWidth = 256;
+                tileHeight = 256;
+            }
+            else if (bpp == 16)
+            {
+                tileWidth = 256;
+                tileHeight = 128;
+            }
+            else if (bpp == 32)
+            {
+                tileWidth = 128;
+                tileHeight = 128;
+            }
+            else if (bpp == 64)
+            {
+                // assume it is BC1/4
+                tileWidth = 512;
+                tileHeight = 256;
+
+                if (elemMode == ADDR_UNCOMPRESSED)
+                {
+                    tileWidth = 128;
+                    tileHeight = 64;
+                }
+            }
+            else if (bpp == 128)
+            {
+                // assume it is BC2/3/5/6H/7
+                tileWidth = 256;
+                tileHeight = 256;
+
+                if (elemMode == ADDR_UNCOMPRESSED)
+                {
+                    tileWidth = 64;
+                    tileHeight = 64;
+                }
+            }
+
+            if (numFrags == 2)
+            {
+                tileWidth = tileWidth / 2;
+            }
+            else if (numFrags == 4)
+            {
+                tileWidth = tileWidth / 2;
+                tileHeight = tileHeight / 2;
+            }
+            else if (numFrags == 8)
+            {
+                tileWidth = tileWidth / 4;
+                tileHeight = tileHeight / 2;
+            }
+        }
+        else    // 1d
+        {
+            tileHeight = 1;
+            if (bpp == 8)
+            {
+                tileWidth = 65536;
+            }
+            else if (bpp == 16)
+            {
+                tileWidth = 32768;
+            }
+            else if (bpp == 32)
+            {
+                tileWidth = 16384;
+            }
+            else if (bpp == 64)
+            {
+                tileWidth = 8192;
+            }
+            else if (bpp == 128)
+            {
+                tileWidth = 4096;
+            }
+        }
+    }
+
+    pOut->prtTileWidth = tileWidth;
+    pOut->prtTileHeight = tileHeight;
+
+    return returnCode;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h
new file mode 100644
index 00000000000..43c55ff32ff
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h
@@ -0,0 +1,695 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrlib.h
+* @brief Contains the AddrLib base class definition.
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_LIB_H__
+#define __ADDR_LIB_H__
+
+
+#include "addrinterface.h"
+#include "addrobject.h"
+#include "addrelemlib.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "atiid.h"
+#endif
+
+#ifndef CIASICIDGFXENGINE_R600
+#define CIASICIDGFXENGINE_R600 0x00000006
+#endif
+
+#ifndef CIASICIDGFXENGINE_R800
+#define CIASICIDGFXENGINE_R800 0x00000008
+#endif
+
+#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
+#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
+#endif
+
+#ifndef CIASICIDGFXENGINE_SEAISLAND
+#define CIASICIDGFXENGINE_SEAISLAND 0x0000000B
+#endif
+/**
+***************************************************************************************************
+* @brief Neutral enums that define pipeinterleave
+***************************************************************************************************
+*/
+enum AddrPipeInterleave
+{
+    ADDR_PIPEINTERLEAVE_256B = 256,
+    ADDR_PIPEINTERLEAVE_512B = 512,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define DRAM row size
+***************************************************************************************************
+*/
+enum AddrRowSize
+{
+    ADDR_ROWSIZE_1KB = 1024,
+    ADDR_ROWSIZE_2KB = 2048,
+    ADDR_ROWSIZE_4KB = 4096,
+    ADDR_ROWSIZE_8KB = 8192,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define bank interleave
+***************************************************************************************************
+*/
+enum AddrBankInterleave
+{
+    ADDR_BANKINTERLEAVE_1 = 1,
+    ADDR_BANKINTERLEAVE_2 = 2,
+    ADDR_BANKINTERLEAVE_4 = 4,
+    ADDR_BANKINTERLEAVE_8 = 8,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define MGPU chip tile size
+***************************************************************************************************
+*/
+enum AddrChipTileSize
+{
+    ADDR_CHIPTILESIZE_16 = 16,
+    ADDR_CHIPTILESIZE_32 = 32,
+    ADDR_CHIPTILESIZE_64 = 64,
+    ADDR_CHIPTILESIZE_128 = 128,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define shader engine tile size
+***************************************************************************************************
+*/
+enum AddrEngTileSize
+{
+    ADDR_SE_TILESIZE_16 = 16,
+    ADDR_SE_TILESIZE_32 = 32,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define bank swap size
+***************************************************************************************************
+*/
+enum AddrBankSwapSize
+{
+    ADDR_BANKSWAP_128B = 128,
+    ADDR_BANKSWAP_256B = 256,
+    ADDR_BANKSWAP_512B = 512,
+    ADDR_BANKSWAP_1KB = 1024,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define bank swap size
+***************************************************************************************************
+*/
+enum AddrSampleSplitSize
+{
+    ADDR_SAMPLESPLIT_1KB = 1024,
+    ADDR_SAMPLESPLIT_2KB = 2048,
+    ADDR_SAMPLESPLIT_4KB = 4096,
+    ADDR_SAMPLESPLIT_8KB = 8192,
+};
+
+/**
+***************************************************************************************************
+* @brief Flags for AddrTileMode
+***************************************************************************************************
+*/
+struct AddrTileModeFlags
+{
+    UINT_32 thickness       : 4;
+    UINT_32 isLinear        : 1;
+    UINT_32 isMicro         : 1;
+    UINT_32 isMacro         : 1;
+    UINT_32 isMacro3d       : 1;
+    UINT_32 isPrt           : 1;
+    UINT_32 isPrtNoRotation : 1;
+    UINT_32 isBankSwapped   : 1;
+};
+
+/**
+***************************************************************************************************
+* @brief This class contains asic independent address lib functionalities
+***************************************************************************************************
+*/
+class AddrLib : public AddrObject
+{
+public:
+    virtual ~AddrLib();
+
+    static ADDR_E_RETURNCODE Create(
+        const ADDR_CREATE_INPUT* pCreateInfo, ADDR_CREATE_OUTPUT* pCreateOut);
+
+    /// Pair of Create
+    VOID Destroy()
+    {
+        delete this;
+    }
+
+    static AddrLib* GetAddrLib(
+        ADDR_HANDLE hLib);
+
+    /// Returns AddrLib version (from compiled binary instead include file)
+    UINT_32 GetVersion()
+    {
+        return m_version;
+    }
+
+    /// Returns asic chip family name defined by AddrLib
+    AddrChipFamily GetAddrChipFamily()
+    {
+        return m_chipFamily;
+    }
+
+    /// Returns tileIndex support
+    BOOL_32 UseTileIndex(INT_32 index) const
+    {
+        return m_configFlags.useTileIndex && (index != TileIndexInvalid);
+    }
+
+    /// Returns combined swizzle support
+    BOOL_32 UseCombinedSwizzle() const
+    {
+        return m_configFlags.useCombinedSwizzle;
+    }
+
+    //
+    // Interface stubs
+    //
+    ADDR_E_RETURNCODE ComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE CombineBankPipeSwizzle(
+        const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+        ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    ADDR_E_RETURNCODE ComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileIndex(
+        const ADDR_CONVERT_TILEINDEX_INPUT* pIn,
+        ADDR_CONVERT_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileIndex1(
+        const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,
+        ADDR_CONVERT_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE GetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileInfo(
+        const ADDR_COMPUTE_HTILE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_HTILE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskInfo(
+        const ADDR_COMPUTE_CMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileAddrFromCoord(
+        const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileCoordFromAddr(
+        const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskCoordFromAddr(
+        const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputePrtInfo(
+        const ADDR_PRT_INFO_INPUT*  pIn,
+        ADDR_PRT_INFO_OUTPUT*       pOut) const;
+
+    ADDR_E_RETURNCODE Flt32ToDepthPixel(
+        const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+        ELEM_FLT32TODEPTHPIXEL_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE Flt32ToColorPixel(
+        const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+        ELEM_FLT32TOCOLORPIXEL_OUTPUT* pOut) const;
+
+    BOOL_32 GetExportNorm(
+        const ELEM_GETEXPORTNORM_INPUT* pIn) const;
+
+protected:
+    AddrLib();  // Constructor is protected
+    AddrLib(const AddrClient* pClient);
+
+    /// Pure Virtual function for Hwl computing surface info
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface address from coord
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface coord from address
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface tile swizzle
+    virtual ADDR_E_RETURNCODE HwlComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl extracting bank/pipe swizzle from base256b
+    virtual ADDR_E_RETURNCODE HwlExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl combining bank/pipe swizzle
+    virtual ADDR_E_RETURNCODE HwlCombineBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, ADDR_TILEINFO*  pTileInfo,
+        UINT_64 baseAddr, UINT_32* pTileSwizzle) const = 0;
+
+    /// Pure Virtual function for Hwl computing base swizzle
+    virtual ADDR_E_RETURNCODE HwlComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE base align
+    virtual UINT_32 HwlComputeHtileBaseAlign(
+        BOOL_32 isTcCompatible, BOOL_32 isLinear, ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE bpp
+    virtual UINT_32 HwlComputeHtileBpp(
+        BOOL_32 isWidth8, BOOL_32 isHeight8) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE bytes
+    virtual UINT_64 HwlComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* pSliceBytes, UINT_32 baseAlign) const = 0;
+
+    /// Pure Virtual function for Hwl computing FMASK info
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut) = 0;
+
+    /// Pure Virtual function for Hwl FMASK address from coord
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl FMASK coord from address
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl convert tile info from real value to HW value
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl compute mipmap info
+    virtual BOOL_32 HwlComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const = 0;
+
+    /// Pure Virtual function for Hwl compute max cmask blockMax value
+    virtual BOOL_32 HwlGetMaxCmaskBlockMax() const = 0;
+
+    /// Pure Virtual function for Hwl compute fmask bits
+    virtual UINT_32 HwlComputeFmaskBits(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        UINT_32* pNumSamples) const = 0;
+
+    /// Virtual function to get index (not pure then no need to implement this in all hwls
+    virtual ADDR_E_RETURNCODE HwlGetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT*      pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+
+    /// Virtual function for Hwl to compute Dcc info
+    virtual ADDR_E_RETURNCODE HwlComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+
+    /// Virtual function to get cmask address for tc compatible cmask
+    virtual ADDR_E_RETURNCODE HwlComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+    // Compute attributes
+
+    // HTILE
+    UINT_32    ComputeHtileInfo(
+        ADDR_HTILE_FLAGS flags,
+        UINT_32 pitchIn, UINT_32 heightIn, UINT_32 numSlices,
+        BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO*  pTileInfo,
+        UINT_32* pPitchOut, UINT_32* pHeightOut, UINT_64* pHtileBytes,
+        UINT_32* pMacroWidth = NULL, UINT_32* pMacroHeight = NULL,
+        UINT_64* pSliceSize = NULL, UINT_32* pBaseAlign = NULL) const;
+
+    // CMASK
+    ADDR_E_RETURNCODE ComputeCmaskInfo(
+        ADDR_CMASK_FLAGS flags,
+        UINT_32 pitchIn, UINT_32 heightIn, UINT_32 numSlices, BOOL_32 isLinear,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pPitchOut, UINT_32* pHeightOut, UINT_64* pCmaskBytes,
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight, UINT_64* pSliceSize = NULL,
+        UINT_32* pBaseAlign = NULL, UINT_32* pBlockMax = NULL) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    // CMASK & HTILE addressing
+    virtual UINT_64 HwlComputeXmaskAddrFromCoord(
+        UINT_32 pitch, UINT_32 height, UINT_32 x, UINT_32 y, UINT_32 slice,
+        UINT_32 numSlices, UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8,
+        BOOL_32 isHeight8, ADDR_TILEINFO* pTileInfo,
+        UINT_32* bitPosition) const;
+
+    virtual VOID HwlComputeXmaskCoordFromAddr(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pX, UINT_32* pY, UINT_32* pSlice) const;
+
+    // Surface mipmap
+    VOID    ComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    /// Pure Virtual function for Hwl checking degrade for base level
+    virtual BOOL_32 HwlDegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const = 0;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const
+    {
+        // not supported in hwl layer, FALSE for not-overrided
+        return FALSE;
+    }
+
+    AddrTileMode DegradeLargeThickTile(AddrTileMode tileMode, UINT_32 bpp) const;
+
+    VOID PadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const;
+
+    virtual VOID HwlPadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const
+    {
+    }
+
+    //
+    // Addressing shared for linear/1D tiling
+    //
+    UINT_64 ComputeSurfaceAddrFromCoordLinear(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32* pBitPosition) const;
+
+    VOID    ComputeSurfaceCoordFromAddrLinear(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 bpp,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample) const;
+
+    VOID    ComputeSurfaceCoordFromAddrMicroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const;
+
+    UINT_32 ComputePixelIndexWithinMicroTile(
+        UINT_32 x, UINT_32 y, UINT_32 z,
+        UINT_32 bpp, AddrTileMode tileMode, AddrTileType microTileType) const;
+
+    /// Pure Virtual function for Hwl computing coord from offset inside micro tile
+    virtual VOID HwlComputePixelCoordFromOffset(
+        UINT_32 offset, UINT_32 bpp, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const = 0;
+
+    //
+    // Addressing shared by all
+    //
+    virtual UINT_32 HwlGetPipes(
+        const ADDR_TILEINFO* pTileInfo) const;
+
+    UINT_32 ComputePipeFromAddr(
+        UINT_64 addr, UINT_32 numPipes) const;
+
+    /// Pure Virtual function for Hwl computing pipe from coord
+    virtual UINT_32 ComputePipeFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice, AddrTileMode tileMode,
+        UINT_32 pipeSwizzle, BOOL_32 flags, ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure Virtual function for Hwl computing coord Y for 8 pipe cmask/htile
+    virtual UINT_32 HwlComputeXmaskCoordYFrom8Pipe(
+        UINT_32 pipe, UINT_32 x) const = 0;
+
+    //
+    // Initialization
+    //
+    /// Pure Virtual function for Hwl computing internal global parameters from h/w registers
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn) = 0;
+
+    /// Pure Virtual function for Hwl converting chip family
+    virtual AddrChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision) = 0;
+
+    //
+    // Misc helper
+    //
+    static const AddrTileModeFlags m_modeFlags[ADDR_TM_COUNT];
+
+    static UINT_32 ComputeSurfaceThickness(
+        AddrTileMode tileMode);
+
+    // Checking tile mode
+    static BOOL_32 IsMacroTiled(AddrTileMode tileMode);
+    static BOOL_32 IsMacro3dTiled(AddrTileMode tileMode);
+    static BOOL_32 IsLinear(AddrTileMode tileMode);
+    static BOOL_32 IsMicroTiled(AddrTileMode tileMode);
+    static BOOL_32 IsPrtTileMode(AddrTileMode tileMode);
+    static BOOL_32 IsPrtNoRotationTileMode(AddrTileMode tileMode);
+
+    static UINT_32 Bits2Number(UINT_32 bitNum,...);
+
+    static UINT_32 GetNumFragments(UINT_32 numSamples, UINT_32 numFrags)
+    {
+        return numFrags != 0 ? numFrags : Max(1u, numSamples);
+    }
+
+    /// Returns pointer of AddrElemLib
+    AddrElemLib* GetElemLib() const
+    {
+        return m_pElemLib;
+    }
+
+    /// Return TRUE if tile info is needed
+    BOOL_32 UseTileInfo() const
+    {
+        return !m_configFlags.ignoreTileInfo;
+    }
+
+    /// Returns fillSizeFields flag
+    UINT_32 GetFillSizeFieldsFlags() const
+    {
+        return m_configFlags.fillSizeFields;
+    }
+
+    /// Adjusts pitch alignment for flipping surface
+    VOID    AdjustPitchAlignment(
+        ADDR_SURFACE_FLAGS flags, UINT_32* pPitchAlign) const;
+
+    /// Overwrite tile config according to tile index
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex,
+        ADDR_TILEINFO* pInfo, AddrTileMode* mode = NULL, AddrTileType* type = NULL) const;
+
+    /// Overwrite macro tile config according to tile index
+    virtual INT_32 HwlComputeMacroModeIndex(
+        INT_32 index, ADDR_SURFACE_FLAGS flags, UINT_32 bpp, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo, AddrTileMode *pTileMode = NULL, AddrTileType *pTileType = NULL
+        ) const
+    {
+        return TileIndexNoMacroIndex;
+    }
+
+    /// Pre-handler of 3x pitch (96 bit) adjustment
+    virtual UINT_32 HwlPreHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Post-handler of 3x pitch adjustment
+    virtual UINT_32 HwlPostHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Check miplevel after surface adjustment
+    ADDR_E_RETURNCODE PostComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    /// Quad buffer stereo support, has its implementation in ind. layer
+    virtual BOOL_32 ComputeQbStereoInfo(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    /// Pure virutual function to compute stereo bank swizzle for right eye
+    virtual UINT_32 HwlComputeQbStereoRightSwizzle(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+private:
+    // Disallow the copy constructor
+    AddrLib(const AddrLib& a);
+
+    // Disallow the assignment operator
+    AddrLib& operator=(const AddrLib& a);
+
+    VOID SetAddrChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    UINT_32 ComputeCmaskBaseAlign(
+        ADDR_CMASK_FLAGS flags, ADDR_TILEINFO*  pTileInfo) const;
+
+    UINT_64 ComputeCmaskBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices) const;
+
+    //
+    // CMASK/HTILE shared methods
+    //
+    VOID    ComputeTileDataWidthAndHeight(
+        UINT_32 bpp, UINT_32 cacheBits, ADDR_TILEINFO* pTileInfo,
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight) const;
+
+    UINT_32 ComputeXmaskCoordYFromPipe(
+        UINT_32 pipe, UINT_32 x) const;
+
+    VOID SetMinPitchAlignPixels(UINT_32 minPitchAlignPixels);
+
+    BOOL_32 DegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, AddrTileMode* pTileMode) const;
+
+protected:
+    AddrLibClass        m_class;        ///< Store class type (HWL type)
+
+    AddrChipFamily      m_chipFamily;   ///< Chip family translated from the one in atiid.h
+
+    UINT_32             m_chipRevision; ///< Revision id from xxx_id.h
+
+    UINT_32             m_version;      ///< Current version
+
+    //
+    // Global parameters
+    //
+    ADDR_CONFIG_FLAGS   m_configFlags;  ///< Global configuration flags. Note this is setup by
+                                        ///  AddrLib instead of Client except forceLinearAligned
+
+    UINT_32             m_pipes;        ///< Number of pipes
+    UINT_32             m_banks;        ///< Number of banks
+                                        ///  For r800 this is MC_ARB_RAMCFG.NOOFBANK
+                                        ///  Keep it here to do default parameter calculation
+
+    UINT_32             m_pipeInterleaveBytes;
+                                        ///< Specifies the size of contiguous address space
+                                        ///  within each tiling pipe when making linear
+                                        ///  accesses. (Formerly Group Size)
+
+    UINT_32             m_rowSize;      ///< DRAM row size, in bytes
+
+    UINT_32             m_minPitchAlignPixels; ///< Minimum pitch alignment in pixels
+    UINT_32             m_maxSamples;   ///< Max numSamples
+private:
+    AddrElemLib*        m_pElemLib;     ///< Element Lib pointer
+};
+
+AddrLib* AddrSIHwlInit  (const AddrClient* pClient);
+AddrLib* AddrCIHwlInit  (const AddrClient* pClient);
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp
new file mode 100644
index 00000000000..863a252fcf1
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrobject.cpp
+* @brief Contains the AddrObject base class implementation.
+***************************************************************************************************
+*/
+
+#include "addrinterface.h"
+#include "addrobject.h"
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrObject
+*
+*   @brief
+*       Constructor for the AddrObject class.
+***************************************************************************************************
+*/
+AddrObject::AddrObject()
+{
+    m_client.handle = NULL;
+    m_client.callbacks.allocSysMem = NULL;
+    m_client.callbacks.freeSysMem = NULL;
+    m_client.callbacks.debugPrint = NULL;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrObject
+*
+*   @brief
+*       Constructor for the AddrObject class.
+***************************************************************************************************
+*/
+AddrObject::AddrObject(const AddrClient* pClient)
+{
+    m_client = *pClient;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::~AddrObject
+*
+*   @brief
+*       Destructor for the AddrObject class.
+***************************************************************************************************
+*/
+AddrObject::~AddrObject()
+{
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::ClientAlloc
+*
+*   @brief
+*       Calls instanced allocSysMem inside AddrClient
+***************************************************************************************************
+*/
+VOID* AddrObject::ClientAlloc(
+    size_t             objSize,    ///< [in] Size to allocate
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    VOID* pObjMem = NULL;
+
+    if (pClient->callbacks.allocSysMem != NULL)
+    {
+        ADDR_ALLOCSYSMEM_INPUT allocInput = {0};
+
+        allocInput.size        = sizeof(ADDR_ALLOCSYSMEM_INPUT);
+        allocInput.flags.value = 0;
+        allocInput.sizeInBytes = static_cast<UINT_32>(objSize);
+        allocInput.hClient     = pClient->handle;
+
+        pObjMem = pClient->callbacks.allocSysMem(&allocInput);
+    }
+
+    return pObjMem;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrMalloc
+*
+*   @brief
+*       A wrapper of ClientAlloc
+***************************************************************************************************
+*/
+VOID* AddrObject::AddrMalloc(
+    size_t objSize) const   ///< [in] Size to allocate
+{
+    return ClientAlloc(objSize, &m_client);;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::ClientFree
+*
+*   @brief
+*       Calls freeSysMem inside AddrClient
+***************************************************************************************************
+*/
+VOID AddrObject::ClientFree(
+    VOID*              pObjMem,    ///< [in] User virtual address to free.
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    if (pClient->callbacks.freeSysMem != NULL)
+    {
+        if (pObjMem != NULL)
+        {
+            ADDR_FREESYSMEM_INPUT freeInput = {0};
+
+            freeInput.size      = sizeof(ADDR_FREESYSMEM_INPUT);
+            freeInput.hClient   = pClient->handle;
+            freeInput.pVirtAddr = pObjMem;
+
+            pClient->callbacks.freeSysMem(&freeInput);
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrFree
+*
+*   @brief
+*       A wrapper of ClientFree
+***************************************************************************************************
+*/
+VOID AddrObject::AddrFree(
+    VOID* pObjMem) const                 ///< [in] User virtual address to free.
+{
+    ClientFree(pObjMem, &m_client);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::operator new
+*
+*   @brief
+*       Allocates memory needed for AddrObject object. (with ADDR_CLIENT_HANDLE)
+*
+*   @return
+*       Returns NULL if unsuccessful.
+***************************************************************************************************
+*/
+VOID* AddrObject::operator new(
+    size_t             objSize,    ///< [in] Size to allocate
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    return ClientAlloc(objSize, pClient);
+}
+
+
+/**
+***************************************************************************************************
+*   AddrObject::operator delete
+*
+*   @brief
+*       Frees AddrObject object memory.
+***************************************************************************************************
+*/
+VOID AddrObject::operator delete(
+    VOID* pObjMem,              ///< [in] User virtual address to free.
+    const AddrClient* pClient)  ///< [in] Client handle
+{
+    ClientFree(pObjMem, pClient);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::operator delete
+*
+*   @brief
+*       Frees AddrObject object memory.
+***************************************************************************************************
+*/
+VOID AddrObject::operator delete(
+    VOID* pObjMem)                  ///< [in] User virtual address to free.
+{
+    AddrObject* pObj = static_cast<AddrObject*>(pObjMem);
+    ClientFree(pObjMem, &pObj->m_client);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::DebugPrint
+*
+*   @brief
+*       Print debug message
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrObject::DebugPrint(
+    const CHAR* pDebugString,     ///< [in] Debug string
+    ...) const
+{
+#if DEBUG
+    if (m_client.callbacks.debugPrint != NULL)
+    {
+        va_list ap;
+
+        va_start(ap, pDebugString);
+
+        ADDR_DEBUGPRINT_INPUT debugPrintInput = {0};
+
+        debugPrintInput.size         = sizeof(ADDR_DEBUGPRINT_INPUT);
+        debugPrintInput.pDebugString = const_cast<CHAR*>(pDebugString);
+        debugPrintInput.hClient      = m_client.handle;
+        va_copy(debugPrintInput.ap, ap);
+
+        m_client.callbacks.debugPrint(&debugPrintInput);
+
+        va_end(ap);
+    }
+#endif
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h
new file mode 100644
index 00000000000..35400885afe
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrobject.h
+* @brief Contains the AddrObject base class definition.
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_OBJECT_H__
+#define __ADDR_OBJECT_H__
+
+#include "addrtypes.h"
+#include "addrcommon.h"
+
+/**
+***************************************************************************************************
+* @brief This structure contains client specific data
+***************************************************************************************************
+*/
+struct AddrClient
+{
+    ADDR_CLIENT_HANDLE  handle;
+    ADDR_CALLBACKS      callbacks;
+};
+/**
+***************************************************************************************************
+* @brief This class is the base class for all ADDR class objects.
+***************************************************************************************************
+*/
+class AddrObject
+{
+public:
+    AddrObject();
+    AddrObject(const AddrClient* pClient);
+    virtual ~AddrObject();
+
+    VOID* operator new(size_t size, const AddrClient* pClient);
+    VOID  operator delete(VOID* pObj, const AddrClient* pClient);
+    VOID  operator delete(VOID* pObj);
+    VOID* AddrMalloc(size_t size) const;
+    VOID  AddrFree(VOID* pObj) const;
+
+    VOID DebugPrint(
+        const CHAR* pDebugString,
+        ...) const;
+
+    const AddrClient* GetClient() const {return &m_client;}
+
+protected:
+    AddrClient m_client;
+
+private:
+    static VOID* ClientAlloc(size_t size, const AddrClient* pClient);
+    static VOID  ClientFree(VOID* pObj, const AddrClient* pClient);
+
+    // disallow the copy constructor
+    AddrObject(const AddrObject& a);
+
+    // disallow the assignment operator
+    AddrObject& operator=(const AddrObject& a);
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h b/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h
new file mode 100644
index 00000000000..cf67f602bdf
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h
@@ -0,0 +1,155 @@
+#if !defined (__SI_GB_REG_H__)
+#define __SI_GB_REG_H__
+
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+//
+// Make sure the necessary endian defines are there.
+//
+#if defined(LITTLEENDIAN_CPU)
+#elif defined(BIGENDIAN_CPU)
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+/*
+ * GB_ADDR_CONFIG struct
+ */
+
+#if     defined(LITTLEENDIAN_CPU)
+
+     typedef struct _GB_ADDR_CONFIG_T {
+          unsigned int num_pipes                      : 3;
+          unsigned int                                : 1;
+          unsigned int pipe_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int bank_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int num_shader_engines             : 2;
+          unsigned int                                : 2;
+          unsigned int shader_engine_tile_size        : 3;
+          unsigned int                                : 1;
+          unsigned int num_gpus                       : 3;
+          unsigned int                                : 1;
+          unsigned int multi_gpu_tile_size            : 2;
+          unsigned int                                : 2;
+          unsigned int row_size                       : 2;
+          unsigned int num_lower_pipes                : 1;
+          unsigned int                                : 1;
+     } GB_ADDR_CONFIG_T;
+
+#elif       defined(BIGENDIAN_CPU)
+
+     typedef struct _GB_ADDR_CONFIG_T {
+          unsigned int                                : 1;
+          unsigned int num_lower_pipes                : 1;
+          unsigned int row_size                       : 2;
+          unsigned int                                : 2;
+          unsigned int multi_gpu_tile_size            : 2;
+          unsigned int                                : 1;
+          unsigned int num_gpus                       : 3;
+          unsigned int                                : 1;
+          unsigned int shader_engine_tile_size        : 3;
+          unsigned int                                : 2;
+          unsigned int num_shader_engines             : 2;
+          unsigned int                                : 1;
+          unsigned int bank_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int pipe_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int num_pipes                      : 3;
+     } GB_ADDR_CONFIG_T;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     GB_ADDR_CONFIG_T f;
+} GB_ADDR_CONFIG;
+
+#if       defined(LITTLEENDIAN_CPU)
+
+     typedef struct _GB_TILE_MODE_T {
+          unsigned int micro_tile_mode                : 2;
+          unsigned int array_mode                     : 4;
+          unsigned int pipe_config                    : 5;
+          unsigned int tile_split                     : 3;
+          unsigned int bank_width                     : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int num_banks                      : 2;
+          unsigned int micro_tile_mode_new            : 3;
+          unsigned int sample_split                   : 2;
+          unsigned int                                : 5;
+     } GB_TILE_MODE_T;
+
+     typedef struct _GB_MACROTILE_MODE_T {
+          unsigned int bank_width                     : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int num_banks                      : 2;
+          unsigned int                                : 24;
+     } GB_MACROTILE_MODE_T;
+
+#elif          defined(BIGENDIAN_CPU)
+
+     typedef struct _GB_TILE_MODE_T {
+          unsigned int                                : 5;
+          unsigned int sample_split                   : 2;
+          unsigned int micro_tile_mode_new            : 3;
+          unsigned int num_banks                      : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int bank_width                     : 2;
+          unsigned int tile_split                     : 3;
+          unsigned int pipe_config                    : 5;
+          unsigned int array_mode                     : 4;
+          unsigned int micro_tile_mode                : 2;
+     } GB_TILE_MODE_T;
+
+     typedef struct _GB_MACROTILE_MODE_T {
+          unsigned int                                : 24;
+          unsigned int num_banks                      : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int bank_width                     : 2;
+     } GB_MACROTILE_MODE_T;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     GB_TILE_MODE_T f;
+} GB_TILE_MODE;
+
+typedef union {
+     unsigned int val : 32;
+     GB_MACROTILE_MODE_T f;
+} GB_MACROTILE_MODE;
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h b/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h
new file mode 100644
index 00000000000..61540f49b7e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+#ifndef _lnx_common_defs_h_
+#define _lnx_common_defs_h_
+
+#if DBG
+#include <stdarg.h>                         // We do not have any choice: need variable
+                                            // number of parameters support for debug
+                                            // build.
+#endif                                      // #if DBG
+
+//
+// --------------  External functions from Linux kernel driver ----------------
+//
+// Note: The definitions/declararions below must match the original ones.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned long __ke_size_t;              // as it is defined in firegl_public.h
+typedef int           __kernel_ptrdiff_t;       // as it is defined in posix_types.h
+
+
+#if !defined(ATI_API_CALL)
+#define ATI_API_CALL __attribute__((regparm(0)))
+#endif
+
+extern void * ATI_API_CALL __ke_memset(void* s, int c, __ke_size_t count);
+extern void * ATI_API_CALL __ke_memcpy(void* d, const void* s, __ke_size_t count);
+extern ATI_API_CALL __ke_size_t __ke_strlen(const char *s);
+extern char* ATI_API_CALL __ke_strcpy(char* d, const char* s);
+extern char* ATI_API_CALL __ke_strncpy(char* d, const char* s, __ke_size_t count);
+extern void __ke_printk(const char* fmt, ...);
+
+extern int ATI_API_CALL __ke_snprintf(char* buf, __ke_size_t size, const char* fmt, ...);
+extern int ATI_API_CALL KCL_CopyFromUserSpace(void* to, const void* from, __ke_size_t size);
+extern int ATI_API_CALL KCL_CopyToUserSpace(void* to, const void* from, __ke_size_t size);
+#define __ke_copy_from_user  KCL_CopyFromUserSpace
+#define __ke_copy_to_user    KCL_CopyToUserSpace
+extern int ATI_API_CALL __ke_verify_area(int type, const void * addr, unsigned long size);
+
+extern unsigned long ATI_API_CALL KAS_GetTickCounter(void);
+extern unsigned long ATI_API_CALL KAS_GetTicksPerSecond(void);
+
+
+#if DBG
+extern int ATI_API_CALL __ke_vsnprintf(char *buf, __ke_size_t size, const char *fmt, va_list ap);
+#define vsnprintf(_dst, _size, _fmt, varg)  __ke_snprintf(_dst, _size, _fmt, varg)
+#endif                                      // #if DBG
+
+
+// Note: This function is not defined in firegl_public.h.
+void    firegl_hardwareHangRecovery(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+// --------------------------  C/C++ standard typedefs ----------------------------
+//
+#ifdef __SIZE_TYPE__
+typedef __SIZE_TYPE__       size_t;
+#else                                       // #ifdef __SIZE_TYPE__
+typedef unsigned int        size_t;
+#endif                                      // #ifdef __SIZE_TYPE__
+
+#ifdef __PTRDIFF_TYPE__
+typedef __PTRDIFF_TYPE__    ptrdiff_t;
+#else                                       // #ifdef __PTRDIFF_TYPE__
+typedef int                 ptrdiff_t;
+#endif                                      // #ifdef __PTRDIFF_TYPE__
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL    __null
+#else
+#define NULL    ((void *)0)
+#endif
+#endif
+
+
+//
+// -------------------------  C/C++ standard macros ---------------------------
+//
+
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)  // as it is defined in stddef.h
+#define CHAR_BIT            8                                   // as it is defined in limits.h
+
+//
+// ---------------------------------  C RTL -----------------------------------
+//
+
+#define memset(_p, _v, _n)                  __ke_memset(_p, _v, _n)
+#define memcpy(_d, _s, _n)                  __ke_memcpy(_d, _s, _n)
+#define strlen(_s)                          __ke_strlen(_s)
+#define strcpy(_d, _s)                      __ke_strcpy(_d, _s)
+#define strncpy(_d, _s, _n)                 __ke_strncpy(_d, _s, _n)
+// Note: C99 supports macros with variable number of arguments. GCC also supports this C99 feature as
+//       C++ extension.
+#define snprintf(_dst, _size, _fmt, arg...) __ke_snprintf(_dst, _size, _fmt, ##arg)
+
+
+#endif                                      // #ifdef _lnx_common_defs_h_
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h
new file mode 100644
index 00000000000..5ed81add264
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+#if !defined (SI_CI_VI_MERGED_ENUM_HEADER)
+#define SI_CI_VI_MERGED_ENUM_HEADER
+
+typedef enum PipeInterleaveSize {
+ADDR_CONFIG_PIPE_INTERLEAVE_256B         = 0x00000000,
+ADDR_CONFIG_PIPE_INTERLEAVE_512B         = 0x00000001,
+} PipeInterleaveSize;
+
+typedef enum RowSize {
+ADDR_CONFIG_1KB_ROW                      = 0x00000000,
+ADDR_CONFIG_2KB_ROW                      = 0x00000001,
+ADDR_CONFIG_4KB_ROW                      = 0x00000002,
+} RowSize;
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
new file mode 100644
index 00000000000..264e2efb8b2
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -0,0 +1,1777 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  ciaddrlib.cpp
+* @brief Contains the implementation for the CIAddrLib class.
+***************************************************************************************************
+*/
+
+#include "ciaddrlib.h"
+
+#include "si_gb_reg.h"
+
+#include "si_ci_vi_merged_enum.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "ci_id.h"
+#include "kv_id.h"
+#include "vi_id.h"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrMask
+*
+*   @brief
+*       Gets a mask of "width"
+*   @return
+*       Bit mask
+***************************************************************************************************
+*/
+static UINT_64 AddrMask(
+    UINT_32 width)  ///< Width of bits
+{
+    UINT_64 ret;
+
+    if (width >= sizeof(UINT_64)*8)
+    {
+        ret = ~((UINT_64) 0);
+    }
+    else
+    {
+        return (((UINT_64) 1) << width) - 1;
+    }
+    return ret;
+}
+
+/**
+***************************************************************************************************
+*   AddrGetBits
+*
+*   @brief
+*       Gets bits within a range of [msb, lsb]
+*   @return
+*       Bits of this range
+***************************************************************************************************
+*/
+static UINT_64 AddrGetBits(
+    UINT_64 bits,   ///< Source bits
+    UINT_32 msb,    ///< Most signicant bit
+    UINT_32 lsb)    ///< Least signicant bit
+{
+    UINT_64 ret = 0;
+
+    if (msb >= lsb)
+    {
+        ret = (bits >> lsb) & (AddrMask(1 + msb - lsb));
+    }
+    return ret;
+}
+
+/**
+***************************************************************************************************
+*   AddrRemoveBits
+*
+*   @brief
+*       Removes bits within the range of [msb, lsb]
+*   @return
+*       Modified bits
+***************************************************************************************************
+*/
+static UINT_64 AddrRemoveBits(
+    UINT_64 bits,   ///< Source bits
+    UINT_32 msb,    ///< Most signicant bit
+    UINT_32 lsb)    ///< Least signicant bit
+{
+    UINT_64 ret = bits;
+
+    if (msb >= lsb)
+    {
+        ret = AddrGetBits(bits, lsb - 1, 0) // low bits
+            | (AddrGetBits(bits, 8 * sizeof(bits) - 1, msb + 1) << lsb); //high bits
+    }
+    return ret;
+}
+
+/**
+***************************************************************************************************
+*   AddrInsertBits
+*
+*   @brief
+*       Inserts new bits into the range of [msb, lsb]
+*   @return
+*       Modified bits
+***************************************************************************************************
+*/
+static UINT_64 AddrInsertBits(
+    UINT_64 bits,       ///< Source bits
+    UINT_64 newBits,    ///< New bits to be inserted
+    UINT_32 msb,        ///< Most signicant bit
+    UINT_32 lsb)        ///< Least signicant bit
+{
+    UINT_64 ret = bits;
+
+    if (msb >= lsb)
+    {
+        ret = AddrGetBits(bits, lsb - 1, 0) // old low bitss
+             | (AddrGetBits(newBits, msb - lsb, 0) << lsb) //new bits
+             | (AddrGetBits(bits, 8 * sizeof(bits) - 1, lsb) << (msb + 1)); //old high bits
+    }
+    return ret;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrCIHwlInit
+*
+*   @brief
+*       Creates an CIAddrLib object.
+*
+*   @return
+*       Returns an CIAddrLib object pointer.
+***************************************************************************************************
+*/
+AddrLib* AddrCIHwlInit(const AddrClient* pClient)
+{
+    return CIAddrLib::CreateObj(pClient);
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::CIAddrLib
+*
+*   @brief
+*       Constructor
+*
+***************************************************************************************************
+*/
+CIAddrLib::CIAddrLib(const AddrClient* pClient) :
+    SIAddrLib(pClient),
+    m_noOfMacroEntries(0),
+    m_allowNonDispThickModes(FALSE)
+{
+    m_class = CI_ADDRLIB;
+    memset(&m_settings, 0, sizeof(m_settings));
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::~CIAddrLib
+*
+*   @brief
+*       Destructor
+***************************************************************************************************
+*/
+CIAddrLib::~CIAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeDccInfo(
+    const ADDR_COMPUTE_DCCINFO_INPUT*  pIn,
+    ADDR_COMPUTE_DCCINFO_OUTPUT*       pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (m_settings.isVolcanicIslands && IsMacroTiled(pIn->tileMode))
+    {
+        UINT_64 dccFastClearSize = pIn->colorSurfSize >> 8;
+
+        ADDR_ASSERT(0 == (pIn->colorSurfSize & 0xff));
+
+        if (pIn->numSamples > 1)
+        {
+            UINT_32 tileSizePerSample = BITS_TO_BYTES(pIn->bpp * MicroTileWidth * MicroTileHeight);
+            UINT_32 samplesPerSplit  = pIn->tileInfo.tileSplitBytes / tileSizePerSample;
+
+            if (samplesPerSplit < pIn->numSamples)
+            {
+                UINT_32 numSplits = pIn->numSamples / samplesPerSplit;
+                UINT_32 fastClearBaseAlign = HwlGetPipes(&pIn->tileInfo) * m_pipeInterleaveBytes;
+
+                ADDR_ASSERT(IsPow2(fastClearBaseAlign));
+
+                dccFastClearSize /= numSplits;
+
+                if (0 != (dccFastClearSize & (fastClearBaseAlign - 1)))
+                {
+                    // Disable dcc fast clear
+                    // if key size of fisrt sample split is not pipe*interleave aligned
+                    dccFastClearSize = 0;
+                }
+            }
+        }
+
+        pOut->dccRamSize          = pIn->colorSurfSize >> 8;
+        pOut->dccRamBaseAlign     = pIn->tileInfo.banks *
+                                    HwlGetPipes(&pIn->tileInfo) *
+                                    m_pipeInterleaveBytes;
+        pOut->dccFastClearSize    = dccFastClearSize;
+
+        ADDR_ASSERT(IsPow2(pOut->dccRamBaseAlign));
+
+        if (0 == (pOut->dccRamSize & (pOut->dccRamBaseAlign - 1)))
+        {
+            pOut->subLvlCompressible = TRUE;
+        }
+        else
+        {
+            UINT_64 dccRamSizeAlign = HwlGetPipes(&pIn->tileInfo) * m_pipeInterleaveBytes;
+
+            if (pOut->dccRamSize == pOut->dccFastClearSize)
+            {
+                pOut->dccFastClearSize = PowTwoAlign(pOut->dccRamSize, dccRamSizeAlign);
+            }
+            pOut->dccRamSize          = PowTwoAlign(pOut->dccRamSize, dccRamSizeAlign);
+            pOut->subLvlCompressible  = FALSE;
+        }
+    }
+    else
+    {
+        returnCode = ADDR_NOTSUPPORTED;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute tc compatible Cmask address from fmask ram address
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeCmaskAddrFromCoord(
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*  pIn,  ///< [in] fmask addr/bpp/tile input
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*       pOut  ///< [out] cmask address
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_NOTSUPPORTED;
+
+    if ((m_settings.isVolcanicIslands == TRUE) &&
+        (pIn->flags.tcCompatible == TRUE))
+    {
+        UINT_32 numOfPipes   = HwlGetPipes(pIn->pTileInfo);
+        UINT_32 numOfBanks   = pIn->pTileInfo->banks;
+        UINT_64 fmaskAddress = pIn->fmaskAddr;
+        UINT_32 elemBits     = pIn->bpp;
+        UINT_32 blockByte    = 64 * elemBits / 8;
+        UINT_64 metaNibbleAddress = HwlComputeMetadataNibbleAddress(fmaskAddress,
+                                                                    0,
+                                                                    0,
+                                                                    4,
+                                                                    elemBits,
+                                                                    blockByte,
+                                                                    m_pipeInterleaveBytes,
+                                                                    numOfPipes,
+                                                                    numOfBanks,
+                                                                    1);
+        pOut->addr = (metaNibbleAddress >> 1);
+        pOut->bitPosition = (metaNibbleAddress % 2) ? 4 : 0;
+        returnCode = ADDR_OK;
+    }
+
+    return returnCode;
+}
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlConvertChipFamily
+*
+*   @brief
+*       Convert familyID defined in atiid.h to AddrChipFamily and set m_chipFamily/m_chipRevision
+*   @return
+*       AddrChipFamily
+***************************************************************************************************
+*/
+AddrChipFamily CIAddrLib::HwlConvertChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family defined in atiih.h
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    AddrChipFamily family = ADDR_CHIP_FAMILY_CI;
+
+    switch (uChipFamily)
+    {
+        case FAMILY_CI:
+            m_settings.isSeaIsland  = 1;
+            m_settings.isBonaire    = ASICREV_IS_BONAIRE_M(uChipRevision);
+            m_settings.isHawaii     = ASICREV_IS_HAWAII_P(uChipRevision);
+            break;
+        case FAMILY_KV:
+            m_settings.isKaveri     = 1;
+            m_settings.isSpectre    = ASICREV_IS_SPECTRE(uChipRevision);
+            m_settings.isSpooky     = ASICREV_IS_SPOOKY(uChipRevision);
+            m_settings.isKalindi    = ASICREV_IS_KALINDI(uChipRevision);
+            break;
+        case FAMILY_VI:
+            m_settings.isVolcanicIslands = 1;
+            m_settings.isIceland         = ASICREV_IS_ICELAND_M(uChipRevision);
+            m_settings.isTonga           = ASICREV_IS_TONGA_P(uChipRevision);
+            break;
+        case FAMILY_CZ:
+            m_settings.isCarrizo         = 1;
+            m_settings.isVolcanicIslands = 1;
+            break;
+        default:
+            ADDR_ASSERT(!"This should be a unexpected Fusion");
+            break;
+    }
+
+    return family;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlInitGlobalParams
+*
+*   @brief
+*       Initializes global parameters
+*
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlInitGlobalParams(
+    const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
+{
+    BOOL_32  valid = TRUE;
+
+    const ADDR_REGISTER_VALUE* pRegValue = &pCreateIn->regValue;
+
+    valid = DecodeGbRegs(pRegValue);
+
+    // The following assignments for m_pipes is only for fail-safe, InitTileSettingTable should
+    // read the correct pipes from tile mode table
+    if (m_settings.isHawaii)
+    {
+        // Hawaii has 16-pipe, see GFXIP_Config_Summary.xls
+        m_pipes = 16;
+    }
+    else if (m_settings.isBonaire || m_settings.isSpectre)
+    {
+        m_pipes = 4;
+    }
+    else // Treat other KV asics to be 2-pipe
+    {
+        m_pipes = 2;
+    }
+
+    // @todo: VI
+    // Move this to VI code path once created
+    if (m_settings.isTonga)
+    {
+        m_pipes = 8;
+    }
+    else if (m_settings.isIceland)
+    {
+        m_pipes = 2;
+    }
+
+    if (valid)
+    {
+        valid = InitTileSettingTable(pRegValue->pTileConfig, pRegValue->noOfEntries);
+    }
+    if (valid)
+    {
+        valid = InitMacroTileCfgTable(pRegValue->pMacroTileConfig, pRegValue->noOfMacroEntries);
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlPostCheckTileIndex
+*
+*   @brief
+*       Map a tile setting to index if curIndex is invalid, otherwise check if curIndex matches
+*       tile mode/type/info and change the index if needed
+*   @return
+*       Tile index.
+***************************************************************************************************
+*/
+INT_32 CIAddrLib::HwlPostCheckTileIndex(
+    const ADDR_TILEINFO* pInfo,     ///< [in] Tile Info
+    AddrTileMode         mode,      ///< [in] Tile mode
+    AddrTileType         type,      ///< [in] Tile type
+    INT                  curIndex   ///< [in] Current index assigned in HwlSetupTileInfo
+    ) const
+{
+    INT_32 index = curIndex;
+
+    if (mode == ADDR_TM_LINEAR_GENERAL)
+    {
+        index = TileIndexLinearGeneral;
+    }
+    else
+    {
+        BOOL_32 macroTiled = IsMacroTiled(mode);
+
+        // We need to find a new index if either of them is true
+        // 1. curIndex is invalid
+        // 2. tile mode is changed
+        // 3. tile info does not match for macro tiled
+        if ((index == TileIndexInvalid)         ||
+            (mode != m_tileTable[index].mode)   ||
+            (macroTiled && pInfo->pipeConfig != m_tileTable[index].info.pipeConfig))
+        {
+            for (index = 0; index < static_cast<INT_32>(m_noOfEntries); index++)
+            {
+                if (macroTiled)
+                {
+                    // macro tile modes need all to match
+                    if ((pInfo->pipeConfig == m_tileTable[index].info.pipeConfig) &&
+                        (mode == m_tileTable[index].mode) &&
+                        (type == m_tileTable[index].type))
+                    {
+                        // tileSplitBytes stored in m_tileTable is only valid for depth entries
+                        if (type == ADDR_DEPTH_SAMPLE_ORDER)
+                        {
+                            if (pInfo->tileSplitBytes == m_tileTable[index].info.tileSplitBytes)
+                            {
+                                break;
+                            }
+                        }
+                        else // other entries are determined by other 3 fields
+                        {
+                            break;
+                        }
+                    }
+                }
+                else if (mode == ADDR_TM_LINEAR_ALIGNED)
+                {
+                    // linear mode only needs tile mode to match
+                    if (mode == m_tileTable[index].mode)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    // micro tile modes only need tile mode and tile type to match
+                    if (mode == m_tileTable[index].mode &&
+                        type == m_tileTable[index].type)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    ADDR_ASSERT(index < static_cast<INT_32>(m_noOfEntries));
+
+    if (index >= static_cast<INT_32>(m_noOfEntries))
+    {
+        index = TileIndexInvalid;
+    }
+
+    return index;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting.
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlSetupTileCfg(
+    INT_32          index,          ///< [in] Tile index
+    INT_32          macroModeIndex, ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,          ///< [out] Tile Info
+    AddrTileMode*   pMode,          ///< [out] Tile mode
+    AddrTileType*   pType           ///< [out] Tile type
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    // Global flag to control usage of tileIndex
+    if (UseTileIndex(index))
+    {
+        if (static_cast<UINT_32>(index) >= m_noOfEntries)
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+        else
+        {
+            const ADDR_TILECONFIG* pCfgTable = GetTileSetting(index);
+
+            if (pInfo != NULL)
+            {
+                if (IsMacroTiled(pCfgTable->mode))
+                {
+                    ADDR_ASSERT(((macroModeIndex != TileIndexInvalid)
+                        && (macroModeIndex != TileIndexNoMacroIndex)));
+                    // Here we used tile_bytes to replace of tile_split
+                    // According info as below:
+                    // "tile_split_c = MIN(ROW_SIZE, tile_split)
+                    // "tile_bytes = MIN(tile_split_c, num_samples * tile_bytes_1x)
+                    // when using tile_bytes replacing of tile_split, the result of
+                    // alignment and others(such as slicesPerTile) are unaffected -
+                    // since if tile_split_c is larger, split won't happen, otherwise
+                    // (num_samples * tile_bytes_1x is larger), a correct tile_split is
+                    // returned.
+                    *pInfo = m_macroTileTable[macroModeIndex];
+
+                    if (pCfgTable->type == ADDR_DEPTH_SAMPLE_ORDER)
+                    {
+                        pInfo->tileSplitBytes = pCfgTable->info.tileSplitBytes;
+                    }
+                    pInfo->pipeConfig = pCfgTable->info.pipeConfig;
+                }
+                else // 1D and linear modes, we return default value stored in table
+                {
+                    *pInfo = pCfgTable->info;
+                }
+            }
+
+            if (pMode != NULL)
+            {
+                *pMode = pCfgTable->mode;
+            }
+
+            if (pType != NULL)
+            {
+                *pType = pCfgTable->type;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeSurfaceInfo
+*
+*   @brief
+*       Entry of ci's ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    // If tileIndex is invalid, force macroModeIndex to be invalid, too
+    if (pIn->tileIndex == TileIndexInvalid)
+    {
+        pOut->macroModeIndex = TileIndexInvalid;
+    }
+
+    ADDR_E_RETURNCODE retCode = SIAddrLib::HwlComputeSurfaceInfo(pIn,pOut);
+
+    if (pOut->macroModeIndex == TileIndexNoMacroIndex)
+    {
+        pOut->macroModeIndex = TileIndexInvalid;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlFmaskSurfaceInfo
+*   @brief
+*       Entry of r800's ComputeFmaskInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut   ///< [out] output structure
+    )
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    ADDR_TILEINFO tileInfo = {0};
+    ADDR_COMPUTE_FMASK_INFO_INPUT fmaskIn;
+    fmaskIn = *pIn;
+
+    AddrTileMode tileMode = pIn->tileMode;
+
+    // Use internal tile info if pOut does not have a valid pTileInfo
+    if (pOut->pTileInfo == NULL)
+    {
+        pOut->pTileInfo = &tileInfo;
+    }
+
+    ADDR_ASSERT(tileMode == ADDR_TM_2D_TILED_THIN1     ||
+                tileMode == ADDR_TM_3D_TILED_THIN1     ||
+                tileMode == ADDR_TM_PRT_TILED_THIN1    ||
+                tileMode == ADDR_TM_PRT_2D_TILED_THIN1 ||
+                tileMode == ADDR_TM_PRT_3D_TILED_THIN1);
+
+    ADDR_ASSERT(m_tileTable[14].mode == ADDR_TM_2D_TILED_THIN1);
+    ADDR_ASSERT(m_tileTable[15].mode == ADDR_TM_3D_TILED_THIN1);
+
+    // The only valid tile modes for fmask are 2D_THIN1 and 3D_THIN1 plus non-displayable
+    INT_32 tileIndex = tileMode == ADDR_TM_2D_TILED_THIN1 ? 14 : 15;
+    ADDR_SURFACE_FLAGS flags = {{0}};
+    flags.fmask = 1;
+
+    INT_32 macroModeIndex = TileIndexInvalid;
+
+    UINT_32 numSamples = pIn->numSamples;
+    UINT_32 numFrags = pIn->numFrags == 0 ? numSamples : pIn->numFrags;
+
+    UINT_32 bpp = QLog2(numFrags);
+
+    // EQAA needs one more bit
+    if (numSamples > numFrags)
+    {
+        bpp++;
+    }
+
+    if (bpp == 3)
+    {
+        bpp = 4;
+    }
+
+    bpp = Max(8u, bpp * numSamples);
+
+    macroModeIndex = HwlComputeMacroModeIndex(tileIndex, flags, bpp, numSamples, pOut->pTileInfo);
+
+    fmaskIn.tileIndex = tileIndex;
+    fmaskIn.pTileInfo = pOut->pTileInfo;
+    pOut->macroModeIndex = macroModeIndex;
+    pOut->tileIndex = tileIndex;
+
+    retCode = DispatchComputeFmaskInfo(&fmaskIn, pOut);
+
+    if (retCode == ADDR_OK)
+    {
+        pOut->tileIndex =
+            HwlPostCheckTileIndex(pOut->pTileInfo, pIn->tileMode, ADDR_NON_DISPLAYABLE,
+                                  pOut->tileIndex);
+    }
+
+    // Resets pTileInfo to NULL if the internal tile info is used
+    if (pOut->pTileInfo == &tileInfo)
+    {
+        pOut->pTileInfo = NULL;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlFmaskPreThunkSurfInfo
+*
+*   @brief
+*       Some preparation before thunking a ComputeSurfaceInfo call for Fmask
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlFmaskPreThunkSurfInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pFmaskIn,   ///< [in] Input of fmask info
+    const ADDR_COMPUTE_FMASK_INFO_OUTPUT*   pFmaskOut,  ///< [in] Output of fmask info
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*        pSurfIn,    ///< [out] Input of thunked surface info
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pSurfOut    ///< [out] Output of thunked surface info
+    ) const
+{
+    pSurfIn->tileIndex = pFmaskIn->tileIndex;
+    pSurfOut->macroModeIndex  = pFmaskOut->macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlFmaskPostThunkSurfInfo
+*
+*   @brief
+*       Copy hwl extra field after calling thunked ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlFmaskPostThunkSurfInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,   ///< [in] Output of surface info
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut           ///< [out] Output of fmask info
+    ) const
+{
+    pFmaskOut->tileIndex = pSurfOut->tileIndex;
+    pFmaskOut->macroModeIndex = pSurfOut->macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed
+*
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode CIAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32*            pBytesPerTile   ///< [in/out] pointer to bytes per slice
+    ) const
+{
+    return baseTileMode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlOverrideTileMode
+*
+*   @brief
+*       Override THICK to THIN, for specific formats on CI
+*
+*   @return
+*       Suitable tile mode
+*
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlOverrideTileMode(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,       ///< [in] input structure
+    AddrTileMode*                           pTileMode, ///< [in/out] pointer to the tile mode
+    AddrTileType*                           pTileType  ///< [in/out] pointer to the tile type
+    ) const
+{
+    BOOL_32 bOverrided = FALSE;
+    AddrTileMode tileMode = *pTileMode;
+
+    // currently, all CI/VI family do not
+    // support ADDR_TM_PRT_2D_TILED_THICK,ADDR_TM_PRT_3D_TILED_THICK and
+    // ADDR_TM_PRT_2D_TILED_THIN1, ADDR_TM_PRT_3D_TILED_THIN1
+    switch (tileMode)
+    {
+        case ADDR_TM_PRT_2D_TILED_THICK:
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            tileMode = ADDR_TM_PRT_TILED_THICK;
+            break;
+        case ADDR_TM_PRT_2D_TILED_THIN1:
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+            tileMode = ADDR_TM_PRT_TILED_THIN1;
+            break;
+        default:
+            break;
+    }
+
+    // UBTS#404321, we do not need such overriding, as THICK+THICK entries removed from the tile-mode table
+    if (!m_settings.isBonaire)
+    {
+        UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+        // tile_thickness = (array_mode == XTHICK) ? 8 : ((array_mode == THICK) ? 4 : 1)
+        if (thickness > 1)
+        {
+            switch (pIn->format)
+            {
+                // see //gfxip/gcB/devel/cds/src/verif/tc/models/csim/tcp.cpp
+                // tcpError("Thick micro tiling is not supported for format...
+                case ADDR_FMT_X24_8_32_FLOAT:
+                case ADDR_FMT_32_AS_8:
+                case ADDR_FMT_32_AS_8_8:
+                case ADDR_FMT_32_AS_32_32_32_32:
+
+                // packed formats
+                case ADDR_FMT_GB_GR:
+                case ADDR_FMT_BG_RG:
+                case ADDR_FMT_1_REVERSED:
+                case ADDR_FMT_1:
+                case ADDR_FMT_BC1:
+                case ADDR_FMT_BC2:
+                case ADDR_FMT_BC3:
+                case ADDR_FMT_BC4:
+                case ADDR_FMT_BC5:
+                case ADDR_FMT_BC6:
+                case ADDR_FMT_BC7:
+                    switch (tileMode)
+                    {
+                        case ADDR_TM_1D_TILED_THICK:
+                            tileMode    = ADDR_TM_1D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_2D_TILED_XTHICK:
+                        case ADDR_TM_2D_TILED_THICK:
+                            tileMode    = ADDR_TM_2D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_3D_TILED_XTHICK:
+                        case ADDR_TM_3D_TILED_THICK:
+                            tileMode    = ADDR_TM_3D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_TILED_THICK:
+                            tileMode    = ADDR_TM_PRT_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_2D_TILED_THICK:
+                            tileMode    = ADDR_TM_PRT_2D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_3D_TILED_THICK:
+                            tileMode    = ADDR_TM_PRT_3D_TILED_THIN1;
+                            break;
+
+                        default:
+                            break;
+
+                    }
+
+                    // Switch tile type from thick to thin
+                    if (tileMode != *pTileMode)
+                    {
+                        // see tileIndex: 13-18
+                        *pTileType = ADDR_NON_DISPLAYABLE;
+                    }
+
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (tileMode != *pTileMode)
+    {
+        *pTileMode = tileMode;
+        bOverrided = TRUE;
+    }
+
+    return bOverrided;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlSetupTileInfo
+*
+*   @brief
+*       Setup default value of tile info for SI
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlSetupTileInfo(
+    AddrTileMode                        tileMode,       ///< [in] Tile mode
+    ADDR_SURFACE_FLAGS                  flags,          ///< [in] Surface type flags
+    UINT_32                             bpp,            ///< [in] Bits per pixel
+    UINT_32                             pitch,          ///< [in] Pitch in pixels
+    UINT_32                             height,         ///< [in] Height in pixels
+    UINT_32                             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*                      pTileInfoIn,    ///< [in] Tile info input: NULL for default
+    ADDR_TILEINFO*                      pTileInfoOut,   ///< [out] Tile info output
+    AddrTileType                        inTileType,     ///< [in] Tile type
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut            ///< [out] Output
+    ) const
+{
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+    ADDR_TILEINFO* pTileInfo = pTileInfoOut;
+    INT index = TileIndexInvalid;
+    INT macroModeIndex = TileIndexInvalid;
+
+    // Fail-safe code
+    if (!IsLinear(tileMode))
+    {
+        // Thick tile modes must use thick micro tile mode but Bonaire does not support due to
+        // old derived netlists (UBTS 404321)
+        if (thickness > 1)
+        {
+            if (m_settings.isBonaire)
+            {
+                inTileType = ADDR_NON_DISPLAYABLE;
+            }
+            else if ((m_allowNonDispThickModes == FALSE) || (inTileType != ADDR_NON_DISPLAYABLE))
+            {
+                inTileType = ADDR_THICK;
+            }
+        }
+        // 128 bpp tiling must be non-displayable.
+        // Fmask reuse color buffer's entry but bank-height field can be from another entry
+        // To simplify the logic, fmask entry should be picked from non-displayable ones
+        else if (bpp == 128 || flags.fmask)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+        // These two modes only have non-disp entries though they can be other micro tile modes
+        else if (tileMode == ADDR_TM_3D_TILED_THIN1 || tileMode == ADDR_TM_PRT_3D_TILED_THIN1)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+
+        if (flags.depth || flags.stencil)
+        {
+            inTileType = ADDR_DEPTH_SAMPLE_ORDER;
+        }
+    }
+
+    if (IsTileInfoAllZero(pTileInfo))
+    {
+        // See table entries 0-4
+        if (flags.depth || flags.stencil)
+        {
+            if (flags.depth && flags.tcCompatible)
+            {
+                // tileSize = bpp * numSamples * 8 * 8 / 8
+                UINT_32 tileSize = bpp * numSamples * 8;
+
+                // Texure readable depth surface should not be split
+                switch (tileSize)
+                {
+                    case 128:
+                        index = 1;
+                        break;
+                    case 256:
+                        index = 2;
+                        break;
+                    case 512:
+                        index = 3;
+                        break;
+                    default:
+                        index = 4;
+                        break;
+                }
+            }
+            else
+            {
+                // Depth and stencil need to use the same index, thus the pre-defined tile_split
+                // can meet the requirement to choose the same macro mode index
+                // uncompressed depth/stencil are not supported for now
+                switch (numSamples)
+                {
+                    case 1:
+                        index = 0;
+                        break;
+                    case 2:
+                    case 4:
+                        index = 1;
+                        break;
+                    case 8:
+                        index = 2;
+                        break;
+                    default:
+                        break;
+                }
+            }
+        }
+
+        // See table entries 5-6
+        if (inTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 5;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 6;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 8-12
+        if (inTileType == ADDR_DISPLAYABLE)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 9;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 10;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 11;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 13-18
+        if (inTileType == ADDR_NON_DISPLAYABLE)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 13;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 14;
+                    break;
+                case ADDR_TM_3D_TILED_THIN1:
+                    index = 15;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 16;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 19-26
+        if (thickness > 1)
+        {
+            switch (tileMode)
+            {
+            case ADDR_TM_1D_TILED_THICK:
+                    //special check for bonaire, for the compatablity between old KMD and new UMD for bonaire
+                    index = ((inTileType == ADDR_THICK) || m_settings.isBonaire) ? 19 : 18;
+                    break;
+            case ADDR_TM_2D_TILED_THICK:
+                    // special check for bonaire, for the compatablity between old KMD and new UMD for bonaire
+                    index = ((inTileType == ADDR_THICK) || m_settings.isBonaire) ? 20 : 24;
+                    break;
+                case ADDR_TM_3D_TILED_THICK:
+                    index = 21;
+                    break;
+                case ADDR_TM_PRT_TILED_THICK:
+                    index = 22;
+                    break;
+                case ADDR_TM_2D_TILED_XTHICK:
+                    index = 25;
+                    break;
+                case ADDR_TM_3D_TILED_XTHICK:
+                    index = 26;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 27-30
+        if (inTileType == ADDR_ROTATED)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 27;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 28;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 29;
+                    break;
+                case ADDR_TM_PRT_2D_TILED_THIN1:
+                    index = 30;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        if (m_pipes >= 8)
+        {
+            ADDR_ASSERT((index + 1) < static_cast<INT_32>(m_noOfEntries));
+            // Only do this when tile mode table is updated.
+            if (((tileMode == ADDR_TM_PRT_TILED_THIN1) || (tileMode == ADDR_TM_PRT_TILED_THICK)) &&
+                (m_tileTable[index+1].mode == tileMode))
+            {
+                UINT_32 bytesXSamples = bpp * numSamples / 8;
+                UINT_32 bytesXThickness = bpp * thickness / 8;
+                UINT_32 switchP4Threshold = (m_pipes == 16) ? 8 : 32;
+
+                if ((bytesXSamples > switchP4Threshold) || (bytesXThickness > switchP4Threshold))
+                {
+                    // Pick next 4 pipe entry
+                    index += 1;
+                }
+            }
+        }
+    }
+    else
+    {
+        // A pre-filled tile info is ready
+        index = pOut->tileIndex;
+        macroModeIndex = pOut->macroModeIndex;
+
+        // pass tile type back for post tile index compute
+        pOut->tileType = inTileType;
+    }
+
+    // We only need to set up tile info if there is a valid index but macroModeIndex is invalid
+    if (index != TileIndexInvalid && macroModeIndex == TileIndexInvalid)
+    {
+        macroModeIndex = HwlComputeMacroModeIndex(index, flags, bpp, numSamples, pTileInfo);
+
+        /// Copy to pOut->tileType/tileIndex/macroModeIndex
+        pOut->tileIndex = index;
+        pOut->tileType = m_tileTable[index].type; // Or inTileType, the samea
+        pOut->macroModeIndex = macroModeIndex;
+    }
+    else if (tileMode == ADDR_TM_LINEAR_GENERAL)
+    {
+        pOut->tileIndex = TileIndexLinearGeneral;
+
+        // Copy linear-aligned entry??
+        *pTileInfo = m_tileTable[8].info;
+    }
+    else if (tileMode == ADDR_TM_LINEAR_ALIGNED)
+    {
+        pOut->tileIndex = 8;
+        *pTileInfo = m_tileTable[8].info;
+    }
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::ReadGbTileMode
+*
+*   @brief
+*       Convert GB_TILE_MODE HW value to ADDR_TILE_CONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID CIAddrLib::ReadGbTileMode(
+    UINT_32             regValue,   ///< [in] GB_TILE_MODE register
+    ADDR_TILECONFIG*    pCfg        ///< [out] output structure
+    ) const
+{
+    GB_TILE_MODE gbTileMode;
+    gbTileMode.val = regValue;
+
+    pCfg->type = static_cast<AddrTileType>(gbTileMode.f.micro_tile_mode_new);
+    pCfg->info.pipeConfig = static_cast<AddrPipeCfg>(gbTileMode.f.pipe_config + 1);
+
+    if (pCfg->type == ADDR_DEPTH_SAMPLE_ORDER)
+    {
+        pCfg->info.tileSplitBytes = 64 << gbTileMode.f.tile_split;
+    }
+    else
+    {
+        pCfg->info.tileSplitBytes = 1 << gbTileMode.f.sample_split;
+    }
+
+    UINT_32 regArrayMode = gbTileMode.f.array_mode;
+
+    pCfg->mode = static_cast<AddrTileMode>(regArrayMode);
+
+    switch (regArrayMode)
+    {
+        case 5:
+            pCfg->mode = ADDR_TM_PRT_TILED_THIN1;
+            break;
+        case 6:
+            pCfg->mode = ADDR_TM_PRT_2D_TILED_THIN1;
+            break;
+        case 8:
+            pCfg->mode = ADDR_TM_2D_TILED_XTHICK;
+            break;
+        case 9:
+            pCfg->mode = ADDR_TM_PRT_TILED_THICK;
+            break;
+        case 0xa:
+            pCfg->mode = ADDR_TM_PRT_2D_TILED_THICK;
+            break;
+        case 0xb:
+            pCfg->mode = ADDR_TM_PRT_3D_TILED_THIN1;
+            break;
+        case 0xe:
+            pCfg->mode = ADDR_TM_3D_TILED_XTHICK;
+            break;
+        case 0xf:
+            pCfg->mode = ADDR_TM_PRT_3D_TILED_THICK;
+            break;
+        default:
+            break;
+    }
+
+    // Fail-safe code for these always convert tile info, as the non-macro modes
+    // return the entry of tile mode table directly without looking up macro mode table
+    if (!IsMacroTiled(pCfg->mode))
+    {
+        pCfg->info.banks = 2;
+        pCfg->info.bankWidth = 1;
+        pCfg->info.bankHeight = 1;
+        pCfg->info.macroAspectRatio = 1;
+        pCfg->info.tileSplitBytes = 64;
+    }
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::InitTileSettingTable
+*
+*   @brief
+*       Initialize the ADDR_TILE_CONFIG table.
+*   @return
+*       TRUE if tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::InitTileSettingTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfEntries     ///< [in] Numbe of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfEntries <= TileTableSize);
+
+    memset(m_tileTable, 0, sizeof(m_tileTable));
+
+    if (noOfEntries != 0)
+    {
+        m_noOfEntries = noOfEntries;
+    }
+    else
+    {
+        m_noOfEntries = TileTableSize;
+    }
+
+    if (pCfg) // From Client
+    {
+        for (UINT_32 i = 0; i < m_noOfEntries; i++)
+        {
+            ReadGbTileMode(*(pCfg + i), &m_tileTable[i]);
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        initOk = FALSE;
+    }
+
+    if (initOk)
+    {
+        ADDR_ASSERT(m_tileTable[TILEINDEX_LINEAR_ALIGNED].mode == ADDR_TM_LINEAR_ALIGNED);
+
+        if (m_settings.isBonaire == FALSE)
+        {
+            // Check if entry 18 is "thick+thin" combination
+            if ((m_tileTable[18].mode == ADDR_TM_1D_TILED_THICK) &&
+                (m_tileTable[18].type == ADDR_NON_DISPLAYABLE))
+            {
+                m_allowNonDispThickModes = TRUE;
+                ADDR_ASSERT(m_tileTable[24].mode == ADDR_TM_2D_TILED_THICK);
+            }
+        }
+        else
+        {
+            m_allowNonDispThickModes = TRUE;
+        }
+
+        // Assume the first entry is always programmed with full pipes
+        m_pipes = HwlGetPipes(&m_tileTable[0].info);
+    }
+
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::ReadGbMacroTileCfg
+*
+*   @brief
+*       Convert GB_MACRO_TILE_CFG HW value to ADDR_TILE_CONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID CIAddrLib::ReadGbMacroTileCfg(
+    UINT_32             regValue,   ///< [in] GB_MACRO_TILE_MODE register
+    ADDR_TILEINFO*      pCfg        ///< [out] output structure
+    ) const
+{
+    GB_MACROTILE_MODE gbTileMode;
+    gbTileMode.val = regValue;
+
+    pCfg->bankHeight = 1 << gbTileMode.f.bank_height;
+    pCfg->bankWidth = 1 << gbTileMode.f.bank_width;
+    pCfg->banks = 1 << (gbTileMode.f.num_banks + 1);
+    pCfg->macroAspectRatio = 1 << gbTileMode.f.macro_tile_aspect;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::InitMacroTileCfgTable
+*
+*   @brief
+*       Initialize the ADDR_MACRO_TILE_CONFIG table.
+*   @return
+*       TRUE if macro tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::InitMacroTileCfgTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfMacroEntries     ///< [in] Numbe of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfMacroEntries <= MacroTileTableSize);
+
+    memset(m_macroTileTable, 0, sizeof(m_macroTileTable));
+
+    if (noOfMacroEntries != 0)
+    {
+        m_noOfMacroEntries = noOfMacroEntries;
+    }
+    else
+    {
+        m_noOfMacroEntries = MacroTileTableSize;
+    }
+
+    if (pCfg) // From Client
+    {
+        for (UINT_32 i = 0; i < m_noOfMacroEntries; i++)
+        {
+            ReadGbMacroTileCfg(*(pCfg + i), &m_macroTileTable[i]);
+
+            m_macroTileTable[i].tileSplitBytes = 64 << (i % 8);
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        initOk = FALSE;
+    }
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeMacroModeIndex
+*
+*   @brief
+*       Computes macro tile mode index
+*   @return
+*       TRUE if macro tile table is correctly initialized
+***************************************************************************************************
+*/
+INT_32 CIAddrLib::HwlComputeMacroModeIndex(
+    INT_32              tileIndex,      ///< [in] Tile mode index
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] Surface flags
+    UINT_32             bpp,            ///< [in] Bit per pixel
+    UINT_32             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*      pTileInfo,      ///< [out] Pointer to ADDR_TILEINFO
+    AddrTileMode*       pTileMode,      ///< [out] Pointer to AddrTileMode
+    AddrTileType*       pTileType       ///< [out] Pointer to AddrTileType
+    ) const
+{
+    INT_32 macroModeIndex = TileIndexInvalid;
+
+    if (flags.tcCompatible && flags.stencil)
+    {
+        // Don't compute macroModeIndex for tc compatible stencil surface
+        macroModeIndex = TileIndexNoMacroIndex;
+    }
+    else
+    {
+        AddrTileMode tileMode = m_tileTable[tileIndex].mode;
+        AddrTileType tileType = m_tileTable[tileIndex].type;
+        UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+        if (!IsMacroTiled(tileMode))
+        {
+            *pTileInfo = m_tileTable[tileIndex].info;
+            macroModeIndex = TileIndexNoMacroIndex;
+        }
+        else
+        {
+            UINT_32 tileBytes1x = BITS_TO_BYTES(bpp * MicroTilePixels * thickness);
+            UINT_32 tileSplit;
+
+            if (m_tileTable[tileIndex].type == ADDR_DEPTH_SAMPLE_ORDER)
+            {
+                // Depth entries store real tileSplitBytes
+                tileSplit = m_tileTable[tileIndex].info.tileSplitBytes;
+            }
+            else
+            {
+                // Non-depth entries store a split factor
+                UINT_32 sampleSplit = m_tileTable[tileIndex].info.tileSplitBytes;
+                UINT_32 colorTileSplit = Max(256u, sampleSplit * tileBytes1x);
+
+                tileSplit = colorTileSplit;
+            }
+
+            UINT_32 tileSplitC = Min(m_rowSize, tileSplit);
+            UINT_32 tileBytes;
+
+            if (flags.fmask)
+            {
+                tileBytes = Min(tileSplitC, tileBytes1x);
+            }
+            else
+            {
+                tileBytes = Min(tileSplitC, numSamples * tileBytes1x);
+            }
+
+            if (tileBytes < 64)
+            {
+                tileBytes = 64;
+            }
+
+            macroModeIndex = Log2(tileBytes / 64);
+
+            if (flags.prt || IsPrtTileMode(tileMode))
+            {
+                // Unknown - assume it is 1/2 of table size
+                const UINT_32 PrtMacroModeOffset = MacroTileTableSize / 2;
+
+                macroModeIndex += PrtMacroModeOffset;
+                *pTileInfo = m_macroTileTable[macroModeIndex];
+            }
+            else
+            {
+                *pTileInfo = m_macroTileTable[macroModeIndex];
+            }
+
+            pTileInfo->pipeConfig = m_tileTable[tileIndex].info.pipeConfig;
+
+            if (m_tileTable[tileIndex].type != ADDR_DEPTH_SAMPLE_ORDER)
+            {
+                pTileInfo->tileSplitBytes = tileSplitC;
+            }
+            else
+            {
+                pTileInfo->tileSplitBytes = m_tileTable[tileIndex].info.tileSplitBytes;
+            }
+        }
+
+        if (NULL != pTileMode)
+        {
+            *pTileMode = tileMode;
+        }
+
+        if (NULL != pTileType)
+        {
+            *pTileType = tileType;
+        }
+    }
+
+    return macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Compute the squared cache shape for per-tile data (CMASK and HTILE) for linear layout
+*
+*   @return
+*       N/A
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info
+    ) const
+{
+    ADDR_ASSERT(pTileInfo != NULL);
+
+    UINT_32 numTiles;
+
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P16_32x32_8x16:
+        case ADDR_PIPECFG_P16_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x64_32x32:
+        case ADDR_PIPECFG_P8_32x32_16x32:
+        case ADDR_PIPECFG_P8_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_8x16:
+        case ADDR_PIPECFG_P4_32x32:
+            numTiles = 8;
+            break;
+        default:
+            numTiles = 4;
+            break;
+    }
+
+    *pMacroWidth    = numTiles * MicroTileWidth;
+    *pMacroHeight   = numTiles * MicroTileHeight;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlStereoCheckRightOffsetPadding
+*
+*   @brief
+*       check if the height needs extra padding for stereo right eye offset, to avoid swizzling
+*
+*   @return
+*       TRUE is the extra padding is needed
+*
+*   @note
+*       Kalindi (Kabini) is the only one that needs this padding as there is a uncertain
+*       possible HW issue where the right eye displays incorrectly with some type of swizzles, if
+*       the right eye offset is not 64KB aligned - EPR#366461
+*       Other Kaveri APUs also need the padding according to DXX team's report otherwise
+*       corruption observed. - EPR#374788
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlStereoCheckRightOffsetPadding() const
+{
+    BOOL_32 bNeedPadding = FALSE;
+
+    if (m_settings.isKaveri)
+    {
+        bNeedPadding = TRUE;
+    }
+
+    return bNeedPadding;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeMetadataNibbleAddress
+*
+*   @brief
+*        calculate meta data address based on input information
+*
+*   &parameter
+*        uncompressedDataByteAddress - address of a pixel in color surface
+*        dataBaseByteAddress         - base address of color surface
+*        metadataBaseByteAddress     - base address of meta ram
+*        metadataBitSize             - meta key size, 8 for DCC, 4 for cmask
+*        elementBitSize              - element size of color surface
+*        blockByteSize               - compression block size, 256 for DCC
+*        pipeInterleaveBytes         - pipe interleave size
+*        numOfPipes                  - number of pipes
+*        numOfBanks                  - number of banks
+*        numOfSamplesPerSplit        - number of samples per tile split
+*   @return
+*        meta data nibble address (nibble address is used to support DCC compatible cmask)
+*
+***************************************************************************************************
+*/
+UINT_64 CIAddrLib::HwlComputeMetadataNibbleAddress(
+    UINT_64 uncompressedDataByteAddress,
+    UINT_64 dataBaseByteAddress,
+    UINT_64 metadataBaseByteAddress,
+    UINT_32 metadataBitSize,
+    UINT_32 elementBitSize,
+    UINT_32 blockByteSize,
+    UINT_32 pipeInterleaveBytes,
+    UINT_32 numOfPipes,
+    UINT_32 numOfBanks,
+    UINT_32 numOfSamplesPerSplit) const
+{
+    ///--------------------------------------------------------------------------------------------
+    /// Get pipe interleave, bank and pipe bits
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 pipeInterleaveBits  = Log2(pipeInterleaveBytes);
+    UINT_32 pipeBits            = Log2(numOfPipes);
+    UINT_32 bankBits            = Log2(numOfBanks);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Clear pipe and bank swizzles
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 dataMacrotileBits        = pipeInterleaveBits + pipeBits + bankBits;
+    UINT_32 metadataMacrotileBits    = pipeInterleaveBits + pipeBits + bankBits;
+
+    UINT_64 dataMacrotileClearMask     = ~((1L << dataMacrotileBits) - 1);
+    UINT_64 metadataMacrotileClearMask = ~((1L << metadataMacrotileBits) - 1);
+
+    UINT_64 dataBaseByteAddressNoSwizzle = dataBaseByteAddress & dataMacrotileClearMask;
+    UINT_64 metadataBaseByteAddressNoSwizzle = metadataBaseByteAddress & metadataMacrotileClearMask;
+
+    ///--------------------------------------------------------------------------------------------
+    /// Modify metadata base before adding in so that when final address is divided by data ratio,
+    /// the base address returns to where it should be
+    ///--------------------------------------------------------------------------------------------
+    ADDR_ASSERT((0 != metadataBitSize));
+    UINT_64 metadataBaseShifted = metadataBaseByteAddressNoSwizzle * blockByteSize * 8 /
+                                  metadataBitSize;
+    UINT_64 offset = uncompressedDataByteAddress -
+                     dataBaseByteAddressNoSwizzle +
+                     metadataBaseShifted;
+
+    ///--------------------------------------------------------------------------------------------
+    /// Save bank data bits
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 lsb = pipeBits + pipeInterleaveBits;
+    UINT_32 msb = bankBits - 1 + lsb;
+
+    UINT_64 bankDataBits = AddrGetBits(offset, msb, lsb);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Save pipe data bits
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits;
+    msb = pipeBits - 1 + lsb;
+
+    UINT_64 pipeDataBits = AddrGetBits(offset, msb, lsb);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Remove pipe and bank bits
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits;
+    msb = dataMacrotileBits - 1;
+
+    UINT_64 offsetWithoutPipeBankBits = AddrRemoveBits(offset, msb, lsb);
+
+    ADDR_ASSERT((0 != blockByteSize));
+    UINT_64 blockInBankpipe = offsetWithoutPipeBankBits / blockByteSize;
+
+    UINT_32 tileSize = 8 * 8 * elementBitSize/8 * numOfSamplesPerSplit;
+    UINT_32 blocksInTile = tileSize / blockByteSize;
+
+    if (0 == blocksInTile)
+    {
+        lsb = 0;
+    }
+    else
+    {
+        lsb = Log2(blocksInTile);
+    }
+    msb = bankBits - 1 + lsb;
+
+    UINT_64 blockInBankpipeWithBankBits = AddrInsertBits(blockInBankpipe, bankDataBits, msb, lsb);
+
+    /// NOTE *2 because we are converting to Nibble address in this step
+    UINT_64 metaAddressInPipe = blockInBankpipeWithBankBits * 2 * metadataBitSize / 8;
+
+
+    ///--------------------------------------------------------------------------------------------
+    /// Reinsert pipe bits back into the final address
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits + 1; ///<+1 due to Nibble address now gives interleave bits extra lsb.
+    msb = pipeBits - 1 + lsb;
+    UINT_64 metadataAddress = AddrInsertBits(metaAddressInPipe, pipeDataBits, msb, lsb);
+
+    return metadataAddress;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlPadDimensions
+*
+*   @brief
+*       Helper function to pad dimensions
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlPadDimensions(
+    AddrTileMode        tileMode,    ///< [in] tile mode
+    UINT_32             bpp,         ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,       ///< [in] surface flags
+    UINT_32             numSamples,  ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,   ///< [in/out] bank structure.
+    UINT_32             padDims,     ///< [in] Dimensions to pad valid value 1,2,3
+    UINT_32             mipLevel,    ///< [in] MipLevel
+    UINT_32*            pPitch,      ///< [in/out] pitch in pixels
+    UINT_32             pitchAlign,  ///< [in] pitch alignment
+    UINT_32*            pHeight,     ///< [in/out] height in pixels
+    UINT_32             heightAlign, ///< [in] height alignment
+    UINT_32*            pSlices,     ///< [in/out] number of slices
+    UINT_32             sliceAlign   ///< [in] number of slice alignment
+    ) const
+{
+    if (m_settings.isVolcanicIslands &&
+        flags.dccCompatible &&
+        (numSamples > 1) &&
+        (mipLevel == 0) &&
+        IsMacroTiled(tileMode))
+    {
+        UINT_32 tileSizePerSample = BITS_TO_BYTES(bpp * MicroTileWidth * MicroTileHeight);
+        UINT_32 samplesPerSplit  = pTileInfo->tileSplitBytes / tileSizePerSample;
+
+        if (samplesPerSplit < numSamples)
+        {
+            UINT_32 dccFastClearByteAlign = HwlGetPipes(pTileInfo) * m_pipeInterleaveBytes * 256;
+            UINT_32 bytesPerSplit = BITS_TO_BYTES((*pPitch) * (*pHeight) * bpp * samplesPerSplit);
+
+            ADDR_ASSERT(IsPow2(dccFastClearByteAlign));
+
+            if (0 != (bytesPerSplit & (dccFastClearByteAlign - 1)))
+            {
+                UINT_32 dccFastClearPixelAlign = dccFastClearByteAlign /
+                                                BITS_TO_BYTES(bpp) /
+                                                samplesPerSplit;
+                UINT_32 macroTilePixelAlign = pitchAlign * heightAlign;
+
+                if ((dccFastClearPixelAlign >= macroTilePixelAlign) &&
+                    ((dccFastClearPixelAlign % macroTilePixelAlign) == 0))
+                {
+                    UINT_32 dccFastClearPitchAlignInMacroTile =
+                        dccFastClearPixelAlign / macroTilePixelAlign;
+                    UINT_32 heightInMacroTile = *pHeight / heightAlign;
+                    UINT_32 dccFastClearPitchAlignInPixels;
+
+                    while ((heightInMacroTile > 1) &&
+                           ((heightInMacroTile % 2) == 0) &&
+                           (dccFastClearPitchAlignInMacroTile > 1) &&
+                           ((dccFastClearPitchAlignInMacroTile % 2) == 0))
+                    {
+                        heightInMacroTile >>= 1;
+                        dccFastClearPitchAlignInMacroTile >>= 1;
+                    }
+
+                    dccFastClearPitchAlignInPixels = pitchAlign * dccFastClearPitchAlignInMacroTile;
+
+                    if (IsPow2(dccFastClearPitchAlignInPixels))
+                    {
+                        *pPitch = PowTwoAlign((*pPitch), dccFastClearPitchAlignInPixels);
+                    }
+                    else
+                    {
+                        *pPitch += (dccFastClearPitchAlignInPixels - 1);
+                        *pPitch /= dccFastClearPitchAlignInPixels;
+                        *pPitch *= dccFastClearPitchAlignInPixels;
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
new file mode 100644
index 00000000000..0220736c7bc
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  ciaddrlib.h
+* @brief Contains the CIAddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __CI_ADDR_LIB_H__
+#define __CI_ADDR_LIB_H__
+
+#include "addrlib.h"
+#include "siaddrlib.h"
+
+/**
+***************************************************************************************************
+* @brief CI specific settings structure.
+***************************************************************************************************
+*/
+struct CIChipSettings
+{
+    struct
+    {
+        UINT_32 isSeaIsland : 1;
+        UINT_32 isBonaire   : 1;
+        UINT_32 isKaveri    : 1;
+        UINT_32 isSpectre   : 1;
+        UINT_32 isSpooky    : 1;
+        UINT_32 isKalindi   : 1;
+        // Hawaii is GFXIP 7.2, similar with CI (Bonaire)
+        UINT_32 isHawaii    : 1;
+
+        // VI
+        UINT_32 isVolcanicIslands : 1;
+        UINT_32 isIceland         : 1;
+        UINT_32 isTonga           : 1;
+        // VI fusion (Carrizo)
+        UINT_32 isCarrizo         : 1;
+    };
+};
+
+/**
+***************************************************************************************************
+* @brief This class is the CI specific address library
+*        function set.
+***************************************************************************************************
+*/
+class CIAddrLib : public SIAddrLib
+{
+public:
+    /// Creates CIAddrLib object
+    static AddrLib* CreateObj(const AddrClient* pClient)
+    {
+        return new(pClient) CIAddrLib(pClient);
+    }
+
+private:
+    CIAddrLib(const AddrClient* pClient);
+    virtual ~CIAddrLib();
+
+protected:
+
+    // Hwl interface - defined in AddrLib
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    virtual AddrChipFamily HwlConvertChipFamily(
+        UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn);
+
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex, ADDR_TILEINFO* pInfo,
+        AddrTileMode* pMode = 0, AddrTileType* pType = 0) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual INT_32 HwlComputeMacroModeIndex(
+        INT_32 tileIndex, ADDR_SURFACE_FLAGS flags, UINT_32 bpp, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo, AddrTileMode* pTileMode = NULL, AddrTileType* pTileType = NULL
+        ) const;
+
+    // Sub-hwl interface - defined in EgBasedAddrLib
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const;
+
+    virtual VOID   HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const;
+
+    virtual VOID   HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const;
+
+    virtual BOOL_32 HwlStereoCheckRightOffsetPadding() const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+protected:
+    virtual VOID HwlPadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const;
+
+private:
+    VOID ReadGbTileMode(
+        UINT_32 regValue, ADDR_TILECONFIG* pCfg) const;
+
+    VOID ReadGbMacroTileCfg(
+        UINT_32 regValue, ADDR_TILEINFO* pCfg) const;
+
+    BOOL_32 InitTileSettingTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    BOOL_32 InitMacroTileCfgTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    UINT_64 HwlComputeMetadataNibbleAddress(
+        UINT_64 uncompressedDataByteAddress,
+        UINT_64 dataBaseByteAddress,
+        UINT_64 metadataBaseByteAddress,
+        UINT_32 metadataBitSize,
+        UINT_32 elementBitSize,
+        UINT_32 blockByteSize,
+        UINT_32 pipeInterleaveBytes,
+        UINT_32 numOfPipes,
+        UINT_32 numOfBanks,
+        UINT_32 numOfSamplesPerSplit) const;
+
+    static const UINT_32    MacroTileTableSize = 16;
+    ADDR_TILEINFO           m_macroTileTable[MacroTileTableSize];
+    UINT_32                 m_noOfMacroEntries;
+    BOOL_32                 m_allowNonDispThickModes;
+
+    CIChipSettings          m_settings;
+};
+
+#endif
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp
new file mode 100644
index 00000000000..b1e008b8392
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp
@@ -0,0 +1,4575 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  egbaddrlib.cpp
+* @brief Contains the EgBasedAddrLib class implementation
+***************************************************************************************************
+*/
+
+#include "egbaddrlib.h"
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::EgBasedAddrLib
+*
+*   @brief
+*       Constructor
+*
+*   @note
+*
+***************************************************************************************************
+*/
+EgBasedAddrLib::EgBasedAddrLib(const AddrClient* pClient) :
+    AddrLib(pClient),
+    m_ranks(0),
+    m_logicalBanks(0),
+    m_bankInterleave(1)
+{
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::~EgBasedAddrLib
+*
+*   @brief
+*       Destructor
+***************************************************************************************************
+*/
+EgBasedAddrLib::~EgBasedAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceInfo
+*
+*   @brief
+*       Compute surface sizes include padded pitch,height,slices,total size in bytes,
+*       meanwhile output suitable tile mode and base alignment might be changed in this
+*       call as well. Results are returned through output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::DispatchComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    AddrTileMode        tileMode      = pIn->tileMode;
+    UINT_32             bpp           = pIn->bpp;
+    UINT_32             numSamples    = pIn->numSamples;
+    UINT_32             numFrags      = ((pIn->numFrags == 0) ? numSamples : pIn->numFrags);
+    UINT_32             pitch         = pIn->width;
+    UINT_32             height        = pIn->height;
+    UINT_32             numSlices     = pIn->numSlices;
+    UINT_32             mipLevel      = pIn->mipLevel;
+    ADDR_SURFACE_FLAGS  flags         = pIn->flags;
+
+    ADDR_TILEINFO       tileInfoDef   = {0};
+    ADDR_TILEINFO*      pTileInfo     = &tileInfoDef;
+
+    UINT_32             padDims = 0;
+    BOOL_32             valid;
+
+    tileMode = DegradeLargeThickTile(tileMode, bpp);
+
+    // Only override numSamples for NI above
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (numFrags != numSamples) // This means EQAA
+        {
+            // The real surface size needed is determined by number of fragments
+            numSamples = numFrags;
+        }
+
+        // Save altered numSamples in pOut
+        pOut->numSamples = numSamples;
+    }
+
+    // Caller makes sure pOut->pTileInfo is not NULL, see HwlComputeSurfaceInfo
+    ADDR_ASSERT(pOut->pTileInfo);
+
+    if (pOut->pTileInfo != NULL)
+    {
+        pTileInfo = pOut->pTileInfo;
+    }
+
+    // Set default values
+    if (pIn->pTileInfo != NULL)
+    {
+        if (pTileInfo != pIn->pTileInfo)
+        {
+            *pTileInfo = *pIn->pTileInfo;
+        }
+    }
+    else
+    {
+        memset(pTileInfo, 0, sizeof(ADDR_TILEINFO));
+    }
+
+    // For macro tile mode, we should calculate default tiling parameters
+    HwlSetupTileInfo(tileMode,
+                     flags,
+                     bpp,
+                     pitch,
+                     height,
+                     numSamples,
+                     pIn->pTileInfo,
+                     pTileInfo,
+                     pIn->tileType,
+                     pOut);
+
+    if (flags.cube)
+    {
+        if (mipLevel == 0)
+        {
+            padDims = 2;
+        }
+
+        if (numSlices == 1)
+        {
+            // This is calculating one face, remove cube flag
+            flags.cube = 0;
+        }
+    }
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL://fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            valid = ComputeSurfaceInfoLinear(pIn, pOut, padDims);
+            break;
+
+        case ADDR_TM_1D_TILED_THIN1://fall through
+        case ADDR_TM_1D_TILED_THICK:
+            valid = ComputeSurfaceInfoMicroTiled(pIn, pOut, padDims, tileMode);
+            break;
+
+        case ADDR_TM_2D_TILED_THIN1:    //fall through
+        case ADDR_TM_2D_TILED_THICK:    //fall through
+        case ADDR_TM_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_3D_TILED_THICK:    //fall through
+        case ADDR_TM_2D_TILED_XTHICK:   //fall through
+        case ADDR_TM_3D_TILED_XTHICK:   //fall through
+        case ADDR_TM_PRT_TILED_THIN1:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_TILED_THICK:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THICK://fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            valid = ComputeSurfaceInfoMacroTiled(pIn, pOut, padDims, tileMode);
+            break;
+
+        default:
+            valid = FALSE;
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoLinear
+*
+*   @brief
+*       Compute linear surface sizes include padded pitch, height, slices, total size in
+*       bytes, meanwhile alignments as well. Since it is linear mode, so output tile mode
+*       will not be changed here. Results are returned through output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoLinear(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,   ///< [out] Output structure
+    UINT_32                                 padDims ///< [in] Dimensions to padd
+    ) const
+{
+    UINT_32 expPitch = pIn->width;
+    UINT_32 expHeight = pIn->height;
+    UINT_32 expNumSlices = pIn->numSlices;
+
+    // No linear MSAA on real H/W, keep this for TGL
+    UINT_32 numSamples = pOut->numSamples;
+
+    const UINT_32 microTileThickness = 1;
+
+    //
+    // Compute the surface alignments.
+    //
+    ComputeSurfaceAlignmentsLinear(pIn->tileMode,
+                                   pIn->bpp,
+                                   pIn->flags,
+                                   &pOut->baseAlign,
+                                   &pOut->pitchAlign,
+                                   &pOut->heightAlign);
+
+    if ((pIn->tileMode == ADDR_TM_LINEAR_GENERAL) && pIn->flags.color && (pIn->height > 1))
+    {
+#if !ALT_TEST
+        // When linear_general surface is accessed in multiple lines, it requires 8 pixels in pitch
+        // alignment since PITCH_TILE_MAX is in unit of 8 pixels.
+        // It is OK if it is accessed per line.
+        ADDR_ASSERT((pIn->width % 8) == 0);
+#endif
+    }
+
+    pOut->depthAlign = microTileThickness;
+
+    expPitch = HwlPreHandleBaseLvl3xPitch(pIn, expPitch);
+
+    //
+    // Pad pitch and height to the required granularities.
+    //
+    PadDimensions(pIn->tileMode,
+                  pIn->bpp,
+                  pIn->flags,
+                  numSamples,
+                  pOut->pTileInfo,
+                  padDims,
+                  pIn->mipLevel,
+                  &expPitch, pOut->pitchAlign,
+                  &expHeight, pOut->heightAlign,
+                  &expNumSlices, microTileThickness);
+
+    expPitch = HwlPostHandleBaseLvl3xPitch(pIn, expPitch);
+
+    //
+    // Adjust per HWL
+    //
+
+    UINT_64 logicalSliceSize;
+
+    logicalSliceSize = HwlGetSizeAdjustmentLinear(pIn->tileMode,
+                                                  pIn->bpp,
+                                                  numSamples,
+                                                  pOut->baseAlign,
+                                                  pOut->pitchAlign,
+                                                  &expPitch,
+                                                  &expHeight,
+                                                  &pOut->heightAlign);
+
+
+    pOut->pitch = expPitch;
+    pOut->height = expHeight;
+    pOut->depth = expNumSlices;
+
+    pOut->surfSize = logicalSliceSize * expNumSlices;
+
+    pOut->tileMode = pIn->tileMode;
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoMicroTiled
+*
+*   @brief
+*       Compute 1D/Micro Tiled surface sizes include padded pitch, height, slices, total
+*       size in bytes, meanwhile alignments as well. Results are returned through output
+*       parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoMicroTiled(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,       ///< [out] Output structure
+    UINT_32                                 padDims,    ///< [in] Dimensions to padd
+    AddrTileMode                            expTileMode ///< [in] Expected tile mode
+    ) const
+{
+    BOOL_32 valid = TRUE;
+
+    UINT_32 microTileThickness;
+    UINT_32 expPitch = pIn->width;
+    UINT_32 expHeight = pIn->height;
+    UINT_32 expNumSlices = pIn->numSlices;
+
+    // No 1D MSAA on real H/W, keep this for TGL
+    UINT_32 numSamples = pOut->numSamples;
+
+    //
+    // Compute the micro tile thickness.
+    //
+    microTileThickness = ComputeSurfaceThickness(expTileMode);
+
+    //
+    // Extra override for mip levels
+    //
+    if (pIn->mipLevel > 0)
+    {
+        //
+        // Reduce tiling mode from thick to thin if the number of slices is less than the
+        // micro tile thickness.
+        //
+        if ((expTileMode == ADDR_TM_1D_TILED_THICK) &&
+            (expNumSlices < ThickTileThickness))
+        {
+            expTileMode = HwlDegradeThickTileMode(ADDR_TM_1D_TILED_THICK, expNumSlices, NULL);
+            if (expTileMode != ADDR_TM_1D_TILED_THICK)
+            {
+                microTileThickness = 1;
+            }
+        }
+    }
+
+    //
+    // Compute the surface restrictions.
+    //
+    ComputeSurfaceAlignmentsMicroTiled(expTileMode,
+                                       pIn->bpp,
+                                       pIn->flags,
+                                       numSamples,
+                                       &pOut->baseAlign,
+                                       &pOut->pitchAlign,
+                                       &pOut->heightAlign);
+
+    pOut->depthAlign = microTileThickness;
+
+    //
+    // Pad pitch and height to the required granularities.
+    // Compute surface size.
+    // Return parameters.
+    //
+    PadDimensions(expTileMode,
+                  pIn->bpp,
+                  pIn->flags,
+                  numSamples,
+                  pOut->pTileInfo,
+                  padDims,
+                  pIn->mipLevel,
+                  &expPitch, pOut->pitchAlign,
+                  &expHeight, pOut->heightAlign,
+                  &expNumSlices, microTileThickness);
+
+    //
+    // Get HWL specific pitch adjustment
+    //
+    UINT_64 logicalSliceSize = HwlGetSizeAdjustmentMicroTiled(microTileThickness,
+                                                              pIn->bpp,
+                                                              pIn->flags,
+                                                              numSamples,
+                                                              pOut->baseAlign,
+                                                              pOut->pitchAlign,
+                                                              &expPitch,
+                                                              &expHeight);
+
+
+    pOut->pitch = expPitch;
+    pOut->height = expHeight;
+    pOut->depth = expNumSlices;
+
+    pOut->surfSize = logicalSliceSize * expNumSlices;
+
+    pOut->tileMode = expTileMode;
+
+    return valid;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoMacroTiled
+*
+*   @brief
+*       Compute 2D/macro tiled surface sizes include padded pitch, height, slices, total
+*       size in bytes, meanwhile output suitable tile mode and alignments might be changed
+*       in this call as well. Results are returned through output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoMacroTiled(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,       ///< [out] Output structure
+    UINT_32                                 padDims,    ///< [in] Dimensions to padd
+    AddrTileMode                            expTileMode ///< [in] Expected tile mode
+    ) const
+{
+    BOOL_32 valid = TRUE;
+
+    AddrTileMode origTileMode = expTileMode;
+    UINT_32 microTileThickness;
+
+    UINT_32 paddedPitch;
+    UINT_32 paddedHeight;
+    UINT_64 bytesPerSlice;
+
+    UINT_32 expPitch     = pIn->width;
+    UINT_32 expHeight    = pIn->height;
+    UINT_32 expNumSlices = pIn->numSlices;
+
+    UINT_32 numSamples = pOut->numSamples;
+
+    //
+    // Compute the surface restrictions as base
+    // SanityCheckMacroTiled is called in ComputeSurfaceAlignmentsMacroTiled
+    //
+    valid = ComputeSurfaceAlignmentsMacroTiled(expTileMode,
+                                               pIn->bpp,
+                                               pIn->flags,
+                                               pIn->mipLevel,
+                                               numSamples,
+                                               pOut->pTileInfo,
+                                               &pOut->baseAlign,
+                                               &pOut->pitchAlign,
+                                               &pOut->heightAlign);
+
+    if (valid)
+    {
+        //
+        // Compute the micro tile thickness.
+        //
+        microTileThickness = ComputeSurfaceThickness(expTileMode);
+
+        //
+        // Find the correct tiling mode for mip levels
+        //
+        if (pIn->mipLevel > 0)
+        {
+            //
+            // Try valid tile mode
+            //
+            expTileMode = ComputeSurfaceMipLevelTileMode(expTileMode,
+                                                         pIn->bpp,
+                                                         expPitch,
+                                                         expHeight,
+                                                         expNumSlices,
+                                                         numSamples,
+                                                         pOut->pitchAlign,
+                                                         pOut->heightAlign,
+                                                         pOut->pTileInfo);
+
+            if (!IsMacroTiled(expTileMode)) // Downgraded to micro-tiled
+            {
+                return ComputeSurfaceInfoMicroTiled(pIn, pOut, padDims, expTileMode);
+            }
+            else
+            {
+                if (microTileThickness != ComputeSurfaceThickness(expTileMode))
+                {
+                    //
+                    // Re-compute if thickness changed since bank-height may be changed!
+                    //
+                    return ComputeSurfaceInfoMacroTiled(pIn, pOut, padDims, expTileMode);
+                }
+            }
+        }
+
+        paddedPitch     = expPitch;
+        paddedHeight    = expHeight;
+
+        //
+        // Re-cal alignment
+        //
+        if (expTileMode != origTileMode) // Tile mode is changed but still macro-tiled
+        {
+            valid = ComputeSurfaceAlignmentsMacroTiled(expTileMode,
+                                                       pIn->bpp,
+                                                       pIn->flags,
+                                                       pIn->mipLevel,
+                                                       numSamples,
+                                                       pOut->pTileInfo,
+                                                       &pOut->baseAlign,
+                                                       &pOut->pitchAlign,
+                                                       &pOut->heightAlign);
+        }
+
+        //
+        // Do padding
+        //
+        PadDimensions(expTileMode,
+                      pIn->bpp,
+                      pIn->flags,
+                      numSamples,
+                      pOut->pTileInfo,
+                      padDims,
+                      pIn->mipLevel,
+                      &paddedPitch, pOut->pitchAlign,
+                      &paddedHeight, pOut->heightAlign,
+                      &expNumSlices, microTileThickness);
+
+        if (pIn->flags.qbStereo &&
+            (pOut->pStereoInfo != NULL) &&
+            HwlStereoCheckRightOffsetPadding())
+        {
+            // Eye height's bank bits are different from y == 0?
+            // Since 3D rendering treats right eye buffer starting from y == "eye height" while
+            // display engine treats it to be 0, so the bank bits may be different, we pad
+            // more in height to make sure y == "eye height" has the same bank bits as y == 0.
+            UINT_32 checkMask = pOut->pTileInfo->banks - 1;
+            UINT_32 bankBits = 0;
+            do
+            {
+                bankBits = (paddedHeight / 8 / pOut->pTileInfo->bankHeight) & checkMask;
+
+                if (bankBits)
+                {
+                   paddedHeight += pOut->heightAlign;
+                }
+            } while (bankBits);
+        }
+
+        //
+        // Compute the size of a slice.
+        //
+        bytesPerSlice = BITS_TO_BYTES(static_cast<UINT_64>(paddedPitch) *
+                                      paddedHeight * NextPow2(pIn->bpp) * numSamples);
+
+        pOut->pitch = paddedPitch;
+        // Put this check right here to workaround special mipmap cases which the original height
+        // is needed.
+        // The original height is pre-stored in pOut->height in PostComputeMipLevel and
+        // pOut->pitch is needed in HwlCheckLastMacroTiledLvl, too.
+        if (m_configFlags.checkLast2DLevel && numSamples == 1) // Don't check MSAA
+        {
+            // Set a TRUE in pOut if next Level is the first 1D sub level
+            HwlCheckLastMacroTiledLvl(pIn, pOut);
+        }
+        pOut->height = paddedHeight;
+
+        pOut->depth = expNumSlices;
+
+        pOut->surfSize = bytesPerSlice * expNumSlices;
+
+        pOut->tileMode = expTileMode;
+
+        pOut->depthAlign = microTileThickness;
+
+    } // if (valid)
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsLinear
+*
+*   @brief
+*       Compute linear surface alignment, calculation results are returned through
+*       output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsLinear(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32*            pBaseAlign,        ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,       ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign       ///< [out] height alignment in pixels
+    ) const
+{
+    BOOL_32 valid = TRUE;
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL:
+            //
+            // The required base alignment and pitch and height granularities is to 1 element.
+            //
+            *pBaseAlign   = (bpp > 8) ? bpp / 8 : 1;
+            *pPitchAlign  = 1;
+            *pHeightAlign = 1;
+            break;
+        case ADDR_TM_LINEAR_ALIGNED:
+            //
+            // The required alignment for base is the pipe interleave size.
+            // The required granularity for pitch is hwl dependent.
+            // The required granularity for height is one row.
+            //
+            *pBaseAlign     = m_pipeInterleaveBytes;
+            *pPitchAlign    = HwlGetPitchAlignmentLinear(bpp, flags);
+            *pHeightAlign   = 1;
+            break;
+        default:
+            *pBaseAlign     = 1;
+            *pPitchAlign    = 1;
+            *pHeightAlign   = 1;
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    AdjustPitchAlignment(flags, pPitchAlign);
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface alignment, calculation results are returned through
+*       output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples,        ///< [in] number of samples
+    UINT_32*            pBaseAlign,        ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,       ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign       ///< [out] height alignment in pixels
+    ) const
+{
+    BOOL_32 valid = TRUE;
+
+    //
+    // The required alignment for base is the pipe interleave size.
+    //
+    *pBaseAlign   = m_pipeInterleaveBytes;
+
+    *pPitchAlign  = HwlGetPitchAlignmentMicroTiled(tileMode, bpp, flags, numSamples);
+
+    *pHeightAlign = MicroTileHeight;
+
+    AdjustPitchAlignment(flags, pPitchAlign);
+
+    // ECR#393489
+    // Workaround 2 for 1D tiling -  There is HW bug for Carrizo
+    // where it requires the following alignments for 1D tiling.
+    if (flags.czDispCompatible)
+    {
+        *pBaseAlign  = PowTwoAlign(*pBaseAlign, 4096);                         //Base address MOD 4096 = 0
+        *pPitchAlign = PowTwoAlign(*pPitchAlign, 512 >> (BITS_TO_BYTES(bpp))); //(8 lines * pitch * bytes per pixel) MOD 4096 = 0
+    }
+    // end Carrizo workaround for 1D tilling
+
+    return valid;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlReduceBankWidthHeight
+*
+*   @brief
+*       Additional checks, reduce bankHeight/bankWidth if needed and possible
+*       tileSize*BANK_WIDTH*BANK_HEIGHT <= ROW_SIZE
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlReduceBankWidthHeight(
+    UINT_32             tileSize,           ///< [in] tile size
+    UINT_32             bpp,                ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,              ///< [in] surface flags
+    UINT_32             numSamples,         ///< [in] number of samples
+    UINT_32             bankHeightAlign,    ///< [in] bank height alignment
+    UINT_32             pipes,              ///< [in] pipes
+    ADDR_TILEINFO*      pTileInfo           ///< [in/out] bank structure.
+    ) const
+{
+    UINT_32 macroAspectAlign;
+    BOOL_32 valid = TRUE;
+
+    if (tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize)
+    {
+        BOOL_32 stillGreater = TRUE;
+
+        // Try reducing bankWidth first
+        if (stillGreater && pTileInfo->bankWidth > 1)
+        {
+            while (stillGreater && pTileInfo->bankWidth > 0)
+            {
+                pTileInfo->bankWidth >>= 1;
+
+                if (pTileInfo->bankWidth == 0)
+                {
+                    pTileInfo->bankWidth = 1;
+                    break;
+                }
+
+                stillGreater =
+                    tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize;
+            }
+
+            // bankWidth is reduced above, so we need to recalculate bankHeight and ratio
+            bankHeightAlign = Max(1u,
+                                  m_pipeInterleaveBytes * m_bankInterleave /
+                                  (tileSize * pTileInfo->bankWidth)
+                                  );
+
+            // We cannot increase bankHeight so just assert this case.
+            ADDR_ASSERT((pTileInfo->bankHeight % bankHeightAlign) == 0);
+
+            if (numSamples == 1)
+            {
+                macroAspectAlign = Max(1u,
+                                   m_pipeInterleaveBytes * m_bankInterleave /
+                                   (tileSize * pipes * pTileInfo->bankWidth)
+                                   );
+                pTileInfo->macroAspectRatio = PowTwoAlign(pTileInfo->macroAspectRatio,
+                                                          macroAspectAlign);
+            }
+        }
+
+        // Early quit bank_height degradation for "64" bit z buffer
+        if (flags.depth && bpp >= 64)
+        {
+            stillGreater = FALSE;
+        }
+
+        // Then try reducing bankHeight
+        if (stillGreater && pTileInfo->bankHeight > bankHeightAlign)
+        {
+            while (stillGreater && pTileInfo->bankHeight > bankHeightAlign)
+            {
+                pTileInfo->bankHeight >>= 1;
+
+                if (pTileInfo->bankHeight < bankHeightAlign)
+                {
+                    pTileInfo->bankHeight = bankHeightAlign;
+                    break;
+                }
+
+                stillGreater =
+                    tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize;
+            }
+        }
+
+        valid = !stillGreater;
+
+        // Generate a warning if we still fail to meet this constraint
+        if (!valid)
+        {
+            ADDR_WARN(
+                0, ("TILE_SIZE(%d)*BANK_WIDTH(%d)*BANK_HEIGHT(%d) <= ROW_SIZE(%d)",
+                tileSize, pTileInfo->bankWidth, pTileInfo->bankHeight, m_rowSize));
+        }
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsMacroTiled
+*
+*   @brief
+*       Compute 2D tiled surface alignment, calculation results are returned through
+*       output parameters.
+*
+*   @return
+*       TRUE if no error occurs
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsMacroTiled(
+    AddrTileMode        tileMode,           ///< [in] tile mode
+    UINT_32             bpp,                ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,              ///< [in] surface flags
+    UINT_32             mipLevel,           ///< [in] mip level
+    UINT_32             numSamples,         ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,          ///< [in/out] bank structure.
+    UINT_32*            pBaseAlign,         ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,        ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign        ///< [out] height alignment in pixels
+    ) const
+{
+    BOOL_32 valid = SanityCheckMacroTiled(pTileInfo);
+
+    if (valid)
+    {
+        UINT_32 macroTileWidth;
+        UINT_32 macroTileHeight;
+
+        UINT_32 tileSize;
+        UINT_32 bankHeightAlign;
+        UINT_32 macroAspectAlign;
+
+        UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+        UINT_32 pipes = HwlGetPipes(pTileInfo);
+
+        //
+        // Align bank height first according to latest h/w spec
+        //
+
+        // tile_size = MIN(tile_split, 64 * tile_thickness * element_bytes * num_samples)
+        tileSize = Min(pTileInfo->tileSplitBytes,
+                       BITS_TO_BYTES(64 * thickness * bpp * numSamples));
+
+        // bank_height_align =
+        // MAX(1, (pipe_interleave_bytes * bank_interleave)/(tile_size*bank_width))
+        bankHeightAlign = Max(1u,
+                              m_pipeInterleaveBytes * m_bankInterleave /
+                              (tileSize * pTileInfo->bankWidth)
+                              );
+
+        pTileInfo->bankHeight = PowTwoAlign(pTileInfo->bankHeight, bankHeightAlign);
+
+        // num_pipes * bank_width * macro_tile_aspect >=
+        // (pipe_interleave_size * bank_interleave) / tile_size
+        if (numSamples == 1)
+        {
+            // this restriction is only for mipmap (mipmap's numSamples must be 1)
+            macroAspectAlign = Max(1u,
+                               m_pipeInterleaveBytes * m_bankInterleave /
+                               (tileSize * pipes * pTileInfo->bankWidth)
+                               );
+            pTileInfo->macroAspectRatio = PowTwoAlign(pTileInfo->macroAspectRatio, macroAspectAlign);
+        }
+
+        valid = HwlReduceBankWidthHeight(tileSize,
+                                      bpp,
+                                      flags,
+                                      numSamples,
+                                      bankHeightAlign,
+                                      pipes,
+                                      pTileInfo);
+
+        //
+        // The required granularity for pitch is the macro tile width.
+        //
+        macroTileWidth = MicroTileWidth * pTileInfo->bankWidth * pipes *
+            pTileInfo->macroAspectRatio;
+
+        *pPitchAlign = macroTileWidth;
+
+        AdjustPitchAlignment(flags, pPitchAlign);
+
+        //
+        // The required granularity for height is the macro tile height.
+        //
+        macroTileHeight = MicroTileHeight * pTileInfo->bankHeight * pTileInfo->banks /
+            pTileInfo->macroAspectRatio;
+
+        *pHeightAlign = macroTileHeight;
+
+        //
+        // Compute base alignment
+        //
+        *pBaseAlign = pipes *
+            pTileInfo->bankWidth * pTileInfo->banks * pTileInfo->bankHeight * tileSize;
+
+        if ((mipLevel == 0) && (flags.prt) && (m_chipFamily == ADDR_CHIP_FAMILY_SI))
+        {
+            static const UINT_32 PrtTileSize = 0x10000;
+
+            UINT_32 macroTileSize = macroTileWidth * macroTileHeight * numSamples * bpp / 8;
+
+            if (macroTileSize < PrtTileSize)
+            {
+                UINT_32 numMacroTiles = PrtTileSize / macroTileSize;
+
+                ADDR_ASSERT((PrtTileSize % macroTileSize) == 0);
+
+                *pPitchAlign *= numMacroTiles;
+                *pBaseAlign  *= numMacroTiles;
+            }
+        }
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::SanityCheckMacroTiled
+*
+*   @brief
+*       Check if macro-tiled parameters are valid
+*   @return
+*       TRUE if valid
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::SanityCheckMacroTiled(
+    ADDR_TILEINFO* pTileInfo   ///< [in] macro-tiled parameters
+    ) const
+{
+    BOOL_32 valid       = TRUE;
+    UINT_32 numPipes    = HwlGetPipes(pTileInfo);
+
+    switch (pTileInfo->banks)
+    {
+        case 2: //fall through
+        case 4: //fall through
+        case 8: //fall through
+        case 16:
+            break;
+        default:
+            valid = FALSE;
+            break;
+
+    }
+
+    if (valid)
+    {
+        switch (pTileInfo->bankWidth)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+
+    if (valid)
+    {
+        switch (pTileInfo->bankHeight)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+
+    if (valid)
+    {
+        switch (pTileInfo->macroAspectRatio)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+
+    if (valid)
+    {
+        if (pTileInfo->banks < pTileInfo->macroAspectRatio)
+        {
+            // This will generate macro tile height <= 1
+            valid = FALSE;
+        }
+    }
+
+    if (valid)
+    {
+        if (pTileInfo->tileSplitBytes > m_rowSize)
+        {
+            valid = FALSE;
+        }
+    }
+
+    if (valid)
+    {
+        valid = HwlSanityCheckMacroTiled(pTileInfo);
+    }
+
+    ADDR_ASSERT(valid == TRUE);
+
+    // Add this assert for guidance
+    ADDR_ASSERT(numPipes * pTileInfo->banks >= 4);
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceMipLevelTileMode
+*
+*   @brief
+*       Compute valid tile mode for surface mipmap sub-levels
+*
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode EgBasedAddrLib::ComputeSurfaceMipLevelTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             bpp,            ///< [in] bits per pixels
+    UINT_32             pitch,          ///< [in] current level pitch
+    UINT_32             height,         ///< [in] current level height
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32             heightAlign,    ///< [in] height alignment
+    ADDR_TILEINFO*      pTileInfo       ///< [in] ptr to bank structure
+    ) const
+{
+    UINT_32 bytesPerTile;
+
+    AddrTileMode expTileMode = baseTileMode;
+    UINT_32 microTileThickness = ComputeSurfaceThickness(expTileMode);
+    UINT_32 interleaveSize = m_pipeInterleaveBytes * m_bankInterleave;
+
+    //
+    // Compute the size of a slice.
+    //
+    bytesPerTile = BITS_TO_BYTES(MicroTilePixels * microTileThickness * NextPow2(bpp) * numSamples);
+
+    //
+    // Reduce tiling mode from thick to thin if the number of slices is less than the
+    // micro tile thickness.
+    //
+    if (numSlices < microTileThickness)
+    {
+        expTileMode = HwlDegradeThickTileMode(expTileMode, numSlices, &bytesPerTile);
+    }
+
+    if (bytesPerTile > pTileInfo->tileSplitBytes)
+    {
+        bytesPerTile = pTileInfo->tileSplitBytes;
+    }
+
+    UINT_32 threshold1 =
+        bytesPerTile * HwlGetPipes(pTileInfo) * pTileInfo->bankWidth * pTileInfo->macroAspectRatio;
+
+    UINT_32 threshold2 =
+        bytesPerTile * pTileInfo->bankWidth * pTileInfo->bankHeight;
+
+    //
+    // Reduce the tile mode from 2D/3D to 1D in following conditions
+    //
+    switch (expTileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1:
+        case ADDR_TM_PRT_TILED_THIN1:
+        case ADDR_TM_PRT_2D_TILED_THIN1:
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+            if ((pitch < pitchAlign) ||
+                (height < heightAlign) ||
+                (interleaveSize > threshold1) ||
+                (interleaveSize > threshold2))
+            {
+                expTileMode = ADDR_TM_1D_TILED_THIN1;
+            }
+            break;
+        case ADDR_TM_2D_TILED_THICK: //fall through
+        case ADDR_TM_3D_TILED_THICK:
+        case ADDR_TM_2D_TILED_XTHICK:
+        case ADDR_TM_3D_TILED_XTHICK:
+        case ADDR_TM_PRT_TILED_THICK:
+        case ADDR_TM_PRT_2D_TILED_THICK:
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            if ((pitch < pitchAlign) ||
+                (height < heightAlign))
+            {
+                expTileMode = ADDR_TM_1D_TILED_THICK;
+            }
+            break;
+        default:
+            break;
+    }
+
+    return expTileMode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlDegradeBaseLevel
+*   @brief
+*       Check if degrade is needed for base level
+*   @return
+*       TRUE if degrade is suggested
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlDegradeBaseLevel(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const
+{
+    BOOL_32 degrade = FALSE;
+    BOOL_32 valid = TRUE;
+
+    ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+
+    UINT_32 baseAlign;
+    UINT_32 pitchAlign;
+    UINT_32 heightAlign;
+
+    ADDR_ASSERT(pIn->pTileInfo);
+    ADDR_TILEINFO tileInfo = *pIn->pTileInfo;
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        out.tileIndex = pIn->tileIndex;
+        out.macroModeIndex = TileIndexInvalid;
+    }
+
+    HwlSetupTileInfo(pIn->tileMode,
+                     pIn->flags,
+                     pIn->bpp,
+                     pIn->width,
+                     pIn->height,
+                     pIn->numSamples,
+                     &tileInfo,
+                     &tileInfo,
+                     pIn->tileType,
+                     &out);
+
+    valid = ComputeSurfaceAlignmentsMacroTiled(pIn->tileMode,
+                                               pIn->bpp,
+                                               pIn->flags,
+                                               pIn->mipLevel,
+                                               pIn->numSamples,
+                                               &tileInfo,
+                                               &baseAlign,
+                                               &pitchAlign,
+                                               &heightAlign);
+
+    if (valid)
+    {
+        degrade = (pIn->width < pitchAlign || pIn->height < heightAlign);
+    }
+    else
+    {
+        degrade = TRUE;
+    }
+
+    return degrade;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed
+*
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode EgBasedAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32*            pBytesPerTile   ///< [in/out] pointer to bytes per slice
+    ) const
+{
+    ADDR_ASSERT(numSlices < ComputeSurfaceThickness(baseTileMode));
+    // if pBytesPerTile is NULL, this is a don't-care....
+    UINT_32 bytesPerTile = pBytesPerTile != NULL ? *pBytesPerTile : 64;
+
+    AddrTileMode expTileMode = baseTileMode;
+    switch (baseTileMode)
+    {
+        case ADDR_TM_1D_TILED_THICK:
+            expTileMode = ADDR_TM_1D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_2D_TILED_THICK:
+            expTileMode = ADDR_TM_2D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_3D_TILED_THICK:
+            expTileMode = ADDR_TM_3D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_2D_TILED_XTHICK:
+            if (numSlices < ThickTileThickness)
+            {
+                expTileMode = ADDR_TM_2D_TILED_THIN1;
+                bytesPerTile >>= 3;
+            }
+            else
+            {
+                expTileMode = ADDR_TM_2D_TILED_THICK;
+                bytesPerTile >>= 1;
+            }
+            break;
+        case ADDR_TM_3D_TILED_XTHICK:
+            if (numSlices < ThickTileThickness)
+            {
+                expTileMode = ADDR_TM_3D_TILED_THIN1;
+                bytesPerTile >>= 3;
+            }
+            else
+            {
+                expTileMode = ADDR_TM_3D_TILED_THICK;
+                bytesPerTile >>= 1;
+            }
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    if (pBytesPerTile != NULL)
+    {
+        *pBytesPerTile = bytesPerTile;
+    }
+
+    return expTileMode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address from given coord (x, y, slice,sample)
+*
+*   @return
+*       Address in bytes
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::DispatchComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    UINT_32             x                  = pIn->x;
+    UINT_32             y                  = pIn->y;
+    UINT_32             slice              = pIn->slice;
+    UINT_32             sample             = pIn->sample;
+    UINT_32             bpp                = pIn->bpp;
+    UINT_32             pitch              = pIn->pitch;
+    UINT_32             height             = pIn->height;
+    UINT_32             numSlices          = pIn->numSlices;
+    UINT_32             numSamples         = ((pIn->numSamples == 0) ? 1 : pIn->numSamples);
+    UINT_32             numFrags           = ((pIn->numFrags == 0) ? numSamples : pIn->numFrags);
+    AddrTileMode        tileMode           = pIn->tileMode;
+    AddrTileType        microTileType      = pIn->tileType;
+    BOOL_32             ignoreSE           = pIn->ignoreSE;
+    BOOL_32             isDepthSampleOrder = pIn->isDepth;
+    ADDR_TILEINFO*      pTileInfo          = pIn->pTileInfo;
+
+    UINT_32*            pBitPosition       = &pOut->bitPosition;
+    UINT_64             addr;
+
+#if ADDR_AM_BUILD
+    UINT_32             addr5Bit           = 0;
+    UINT_32             addr5Swizzle       = pIn->addr5Swizzle;
+    BOOL_32             is32ByteTile       = pIn->is32ByteTile;
+#endif
+
+    // ADDR_DEPTH_SAMPLE_ORDER = non-disp + depth-sample-order
+    if (microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+    {
+        isDepthSampleOrder = TRUE;
+    }
+
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (numFrags != numSamples)
+        {
+            numSamples = numFrags;
+            ADDR_ASSERT(sample < numSamples);
+        }
+
+        /// @note
+        /// 128 bit/thick tiled surface doesn't support display tiling and
+        /// mipmap chain must have the same tileType, so please fill tileType correctly
+        if (!IsLinear(pIn->tileMode))
+        {
+            if (bpp >= 128 || ComputeSurfaceThickness(tileMode) > 1)
+            {
+                ADDR_ASSERT(microTileType != ADDR_DISPLAYABLE);
+            }
+        }
+    }
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL://fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            addr = ComputeSurfaceAddrFromCoordLinear(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     bpp,
+                                                     pitch,
+                                                     height,
+                                                     numSlices,
+                                                     pBitPosition);
+            break;
+        case ADDR_TM_1D_TILED_THIN1://fall through
+        case ADDR_TM_1D_TILED_THICK:
+            addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                         y,
+                                                         slice,
+                                                         sample,
+                                                         bpp,
+                                                         pitch,
+                                                         height,
+                                                         numSamples,
+                                                         tileMode,
+                                                         microTileType,
+                                                         isDepthSampleOrder,
+                                                         pBitPosition);
+            break;
+        case ADDR_TM_2D_TILED_THIN1:    //fall through
+        case ADDR_TM_2D_TILED_THICK:    //fall through
+        case ADDR_TM_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_3D_TILED_THICK:    //fall through
+        case ADDR_TM_2D_TILED_XTHICK:   //fall through
+        case ADDR_TM_3D_TILED_XTHICK:   //fall through
+        case ADDR_TM_PRT_TILED_THIN1:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_TILED_THICK:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THICK://fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                         y,
+                                                         slice,
+                                                         sample,
+                                                         bpp,
+                                                         pitch,
+                                                         height,
+                                                         numSamples,
+                                                         tileMode,
+                                                         microTileType,
+                                                         ignoreSE,
+                                                         isDepthSampleOrder,
+                                                         pipeSwizzle,
+                                                         bankSwizzle,
+                                                         pTileInfo,
+                                                         pBitPosition);
+            break;
+        default:
+            addr = 0;
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+#if ADDR_AM_BUILD
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (addr5Swizzle && isDepthSampleOrder && is32ByteTile)
+        {
+            UINT_32 tx = x >> 3;
+            UINT_32 ty = y >> 3;
+            UINT_32 tileBits = ((ty&0x3) << 2) | (tx&0x3);
+
+            tileBits = tileBits & addr5Swizzle;
+            addr5Bit = XorReduce(tileBits, 4);
+
+            addr = addr | static_cast<UINT_64>(addr5Bit << 5);
+        }
+    }
+#endif
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAddrFromCoordMicroTiled
+*
+*   @brief
+*       Computes the surface address and bit position from a
+*       coordinate for 2D tilied (macro tiled)
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(
+    UINT_32             x,                      ///< [in] x coordinate
+    UINT_32             y,                      ///< [in] y coordinate
+    UINT_32             slice,                  ///< [in] slice index
+    UINT_32             sample,                 ///< [in] sample index
+    UINT_32             bpp,                    ///< [in] bits per pixel
+    UINT_32             pitch,                  ///< [in] surface pitch, in pixels
+    UINT_32             height,                 ///< [in] surface height, in pixels
+    UINT_32             numSamples,             ///< [in] number of samples
+    AddrTileMode        tileMode,               ///< [in] tile mode
+    AddrTileType        microTileType,          ///< [in] micro tiling type
+    BOOL_32             ignoreSE,               ///< [in] TRUE if shader enginers can be ignored
+    BOOL_32             isDepthSampleOrder,     ///< [in] TRUE if it depth sample ordering is used
+    UINT_32             pipeSwizzle,            ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,            ///< [in] bank swizzle
+    ADDR_TILEINFO*      pTileInfo,              ///< [in] bank structure
+                                                ///  **All fields to be valid on entry**
+    UINT_32*            pBitPosition            ///< [out] bit position, e.g. FMT_1 will use this
+    ) const
+{
+    UINT_64 addr;
+
+    UINT_32 microTileBytes;
+    UINT_32 microTileBits;
+    UINT_32 sampleOffset;
+    UINT_32 pixelIndex;
+    UINT_32 pixelOffset;
+    UINT_32 elementOffset;
+    UINT_32 tileSplitSlice;
+    UINT_32 pipe;
+    UINT_32 bank;
+    UINT_64 sliceBytes;
+    UINT_64 sliceOffset;
+    UINT_32 macroTilePitch;
+    UINT_32 macroTileHeight;
+    UINT_32 macroTilesPerRow;
+    UINT_32 macroTilesPerSlice;
+    UINT_64 macroTileBytes;
+    UINT_32 macroTileIndexX;
+    UINT_32 macroTileIndexY;
+    UINT_64 macroTileOffset;
+    UINT_64 totalOffset;
+    UINT_64 pipeInterleaveMask;
+    UINT_64 bankInterleaveMask;
+    UINT_64 pipeInterleaveOffset;
+    UINT_32 bankInterleaveOffset;
+    UINT_64 offset;
+    UINT_32 tileRowIndex;
+    UINT_32 tileColumnIndex;
+    UINT_32 tileIndex;
+    UINT_32 tileOffset;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    //
+    // Compute the number of group, pipe, and bank bits.
+    //
+    UINT_32 numPipes              = HwlGetPipes(pTileInfo);
+    UINT_32 numPipeInterleaveBits = Log2(m_pipeInterleaveBytes);
+    UINT_32 numPipeBits           = Log2(numPipes);
+    UINT_32 numBankInterleaveBits = Log2(m_bankInterleave);
+    UINT_32 numBankBits           = Log2(pTileInfo->banks);
+
+    //
+    // Compute the micro tile size.
+    //
+    microTileBits = MicroTilePixels * microTileThickness * bpp * numSamples;
+
+    microTileBytes = microTileBits / 8;
+    //
+    // Compute the pixel index within the micro tile.
+    //
+    pixelIndex = ComputePixelIndexWithinMicroTile(x,
+                                                  y,
+                                                  slice,
+                                                  bpp,
+                                                  tileMode,
+                                                  microTileType);
+
+    //
+    // Compute the sample offset and pixel offset.
+    //
+    if (isDepthSampleOrder)
+    {
+        //
+        // For depth surfaces, samples are stored contiguously for each element, so the sample
+        // offset is the sample number times the element size.
+        //
+        sampleOffset = sample * bpp;
+        pixelOffset  = pixelIndex * bpp * numSamples;
+    }
+    else
+    {
+        //
+        // For color surfaces, all elements for a particular sample are stored contiguously, so
+        // the sample offset is the sample number times the micro tile size divided yBit the number
+        // of samples.
+        //
+        sampleOffset = sample * (microTileBits / numSamples);
+        pixelOffset  = pixelIndex * bpp;
+    }
+
+    //
+    // Compute the element offset.
+    //
+    elementOffset = pixelOffset + sampleOffset;
+
+    *pBitPosition = static_cast<UINT_32>(elementOffset % 8);
+
+    elementOffset /= 8; //bit-to-byte
+
+    //
+    // Determine if tiles need to be split across slices.
+    //
+    // If the size of the micro tile is larger than the tile split size, then the tile will be
+    // split across multiple slices.
+    //
+    UINT_32 slicesPerTile = 1;
+
+    if ((microTileBytes > pTileInfo->tileSplitBytes) && (microTileThickness == 1))
+    {   //don't support for thick mode
+
+        //
+        // Compute the number of slices per tile.
+        //
+        slicesPerTile = microTileBytes / pTileInfo->tileSplitBytes;
+
+        //
+        // Compute the tile split slice number for use in rotating the bank.
+        //
+        tileSplitSlice = elementOffset / pTileInfo->tileSplitBytes;
+
+        //
+        // Adjust the element offset to account for the portion of the tile that is being moved to
+        // a new slice..
+        //
+        elementOffset %= pTileInfo->tileSplitBytes;
+
+        //
+        // Adjust the microTileBytes size to tileSplitBytes size since
+        // a new slice..
+        //
+        microTileBytes = pTileInfo->tileSplitBytes;
+    }
+    else
+    {
+        tileSplitSlice = 0;
+    }
+
+    //
+    // Compute macro tile pitch and height.
+    //
+    macroTilePitch  =
+        (MicroTileWidth  * pTileInfo->bankWidth  * numPipes) * pTileInfo->macroAspectRatio;
+    macroTileHeight =
+        (MicroTileHeight * pTileInfo->bankHeight * pTileInfo->banks) / pTileInfo->macroAspectRatio;
+
+    //
+    // Compute the number of bytes per macro tile. Note: bytes of the same bank/pipe actually
+    //
+    macroTileBytes =
+        static_cast<UINT_64>(microTileBytes) *
+        (macroTilePitch / MicroTileWidth) * (macroTileHeight / MicroTileHeight) /
+        (numPipes * pTileInfo->banks);
+
+    //
+    // Compute the number of macro tiles per row.
+    //
+    macroTilesPerRow = pitch / macroTilePitch;
+
+    //
+    // Compute the offset to the macro tile containing the specified coordinate.
+    //
+    macroTileIndexX = x / macroTilePitch;
+    macroTileIndexY = y / macroTileHeight;
+    macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
+
+    //
+    // Compute the number of macro tiles per slice.
+    //
+    macroTilesPerSlice = macroTilesPerRow  * (height / macroTileHeight);
+
+    //
+    // Compute the slice size.
+    //
+    sliceBytes = macroTilesPerSlice * macroTileBytes;
+
+    //
+    // Compute the slice offset.
+    //
+    sliceOffset = sliceBytes * (tileSplitSlice + slicesPerTile * (slice / microTileThickness));
+
+    //
+    // Compute tile offest
+    //
+    tileRowIndex    = (y / MicroTileHeight) % pTileInfo->bankHeight;
+    tileColumnIndex = ((x / MicroTileWidth) / numPipes) % pTileInfo->bankWidth;
+    tileIndex        = (tileRowIndex * pTileInfo->bankWidth) + tileColumnIndex;
+    tileOffset       = tileIndex * microTileBytes;
+
+    //
+    // Combine the slice offset and macro tile offset with the pixel and sample offsets, accounting
+    // for the pipe and bank bits in the middle of the address.
+    //
+    totalOffset = sliceOffset + macroTileOffset + elementOffset + tileOffset;
+
+    //
+    // Get the pipe and bank.
+    //
+
+    // when the tileMode is PRT type, then adjust x and y coordinates
+    if (IsPrtNoRotationTileMode(tileMode))
+    {
+        x = x % macroTilePitch;
+        y = y % macroTileHeight;
+    }
+
+    pipe = ComputePipeFromCoord(x,
+                                y,
+                                slice,
+                                tileMode,
+                                pipeSwizzle,
+                                ignoreSE,
+                                pTileInfo);
+
+    bank = ComputeBankFromCoord(x,
+                                y,
+                                slice,
+                                tileMode,
+                                bankSwizzle,
+                                tileSplitSlice,
+                                pTileInfo);
+
+
+    //
+    // Split the offset to put some bits below the pipe+bank bits and some above.
+    //
+    pipeInterleaveMask = (1 << numPipeInterleaveBits) - 1;
+    bankInterleaveMask = (1 << numBankInterleaveBits) - 1;
+    pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
+    bankInterleaveOffset = static_cast<UINT_32>((totalOffset >> numPipeInterleaveBits) &
+                                                bankInterleaveMask);
+    offset               =  totalOffset >> (numPipeInterleaveBits + numBankInterleaveBits);
+
+    //
+    // Assemble the address from its components.
+    //
+    addr  = pipeInterleaveOffset;
+    // This is to remove /analyze warnings
+    UINT_32 pipeBits            = pipe                 <<  numPipeInterleaveBits;
+    UINT_32 bankInterleaveBits  = bankInterleaveOffset << (numPipeInterleaveBits + numPipeBits);
+    UINT_32 bankBits            = bank                 << (numPipeInterleaveBits + numPipeBits +
+                                                           numBankInterleaveBits);
+    UINT_64 offsetBits          = offset               << (numPipeInterleaveBits + numPipeBits +
+                                                           numBankInterleaveBits + numBankBits);
+
+    addr |= pipeBits;
+    addr |= bankInterleaveBits;
+    addr |= bankBits;
+    addr |= offsetBits;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAddrFromCoordMicroTiled
+*
+*   @brief
+*       Computes the surface address and bit position from a coordinate for 1D tilied
+*       (micro tiled)
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeSurfaceAddrFromCoordMicroTiled(
+    UINT_32             x,                      ///< [in] x coordinate
+    UINT_32             y,                      ///< [in] y coordinate
+    UINT_32             slice,                  ///< [in] slice index
+    UINT_32             sample,                 ///< [in] sample index
+    UINT_32             bpp,                    ///< [in] bits per pixel
+    UINT_32             pitch,                  ///< [in] pitch, in pixels
+    UINT_32             height,                 ///< [in] height, in pixels
+    UINT_32             numSamples,             ///< [in] number of samples
+    AddrTileMode        tileMode,               ///< [in] tile mode
+    AddrTileType        microTileType,          ///< [in] micro tiling type
+    BOOL_32             isDepthSampleOrder,     ///< [in] TRUE if depth sample ordering is used
+    UINT_32*            pBitPosition            ///< [out] bit position, e.g. FMT_1 will use this
+    ) const
+{
+    UINT_64 addr = 0;
+
+    UINT_32 microTileBytes;
+    UINT_64 sliceBytes;
+    UINT_32 microTilesPerRow;
+    UINT_32 microTileIndexX;
+    UINT_32 microTileIndexY;
+    UINT_32 microTileIndexZ;
+    UINT_64 sliceOffset;
+    UINT_64 microTileOffset;
+    UINT_32 sampleOffset;
+    UINT_32 pixelIndex;
+    UINT_32 pixelOffset;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    //
+    // Compute the micro tile size.
+    //
+    microTileBytes = BITS_TO_BYTES(MicroTilePixels * microTileThickness * bpp * numSamples);
+
+    //
+    // Compute the slice size.
+    //
+    sliceBytes =
+        BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples);
+
+    //
+    // Compute the number of micro tiles per row.
+    //
+    microTilesPerRow = pitch / MicroTileWidth;
+
+    //
+    // Compute the micro tile index.
+    //
+    microTileIndexX = x     / MicroTileWidth;
+    microTileIndexY = y     / MicroTileHeight;
+    microTileIndexZ = slice / microTileThickness;
+
+    //
+    // Compute the slice offset.
+    //
+    sliceOffset = static_cast<UINT_64>(microTileIndexZ) * sliceBytes;
+
+    //
+    // Compute the offset to the micro tile containing the specified coordinate.
+    //
+    microTileOffset = (static_cast<UINT_64>(microTileIndexY) * microTilesPerRow + microTileIndexX) *
+        microTileBytes;
+
+    //
+    // Compute the pixel index within the micro tile.
+    //
+    pixelIndex = ComputePixelIndexWithinMicroTile(x,
+                                                  y,
+                                                  slice,
+                                                  bpp,
+                                                  tileMode,
+                                                  microTileType);
+
+    // Compute the sample offset.
+    //
+    if (isDepthSampleOrder)
+    {
+        //
+        // For depth surfaces, samples are stored contiguously for each element, so the sample
+        // offset is the sample number times the element size.
+        //
+        sampleOffset = sample * bpp;
+        pixelOffset = pixelIndex * bpp * numSamples;
+    }
+    else
+    {
+        //
+        // For color surfaces, all elements for a particular sample are stored contiguously, so
+        // the sample offset is the sample number times the micro tile size divided yBit the number
+        // of samples.
+        //
+        sampleOffset = sample * (microTileBytes*8 / numSamples);
+        pixelOffset = pixelIndex * bpp;
+    }
+
+    //
+    // Compute the bit position of the pixel.  Each element is stored with one bit per sample.
+    //
+
+    UINT_32 elemOffset = sampleOffset + pixelOffset;
+
+    *pBitPosition = elemOffset % 8;
+    elemOffset /= 8;
+
+    //
+    // Combine the slice offset, micro tile offset, sample offset, and pixel offsets.
+    //
+    addr = sliceOffset + microTileOffset + elemOffset;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputePixelCoordFromOffset
+*
+*   @brief
+*       Compute pixel coordinate from offset inside a micro tile
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::HwlComputePixelCoordFromOffset(
+    UINT_32         offset,             ///< [in] offset inside micro tile in bits
+    UINT_32         bpp,                ///< [in] bits per pixel
+    UINT_32         numSamples,         ///< [in] number of samples
+    AddrTileMode    tileMode,           ///< [in] tile mode
+    UINT_32         tileBase,           ///< [in] base offset within a tile
+    UINT_32         compBits,           ///< [in] component bits actually needed(for planar surface)
+    UINT_32*        pX,                 ///< [out] x coordinate
+    UINT_32*        pY,                 ///< [out] y coordinate
+    UINT_32*        pSlice,             ///< [out] slice index
+    UINT_32*        pSample,            ///< [out] sample index
+    AddrTileType    microTileType,      ///< [in] micro tiling type
+    BOOL_32         isDepthSampleOrder  ///< [in] TRUE if depth sample order in microtile is used
+    ) const
+{
+    UINT_32 x = 0;
+    UINT_32 y = 0;
+    UINT_32 z = 0;
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    // For planar surface, we adjust offset acoording to tile base
+    if ((bpp != compBits) && (compBits != 0) && isDepthSampleOrder)
+    {
+        offset -= tileBase;
+
+        ADDR_ASSERT(microTileType == ADDR_NON_DISPLAYABLE ||
+                    microTileType == ADDR_DEPTH_SAMPLE_ORDER);
+
+        bpp = compBits;
+    }
+
+    UINT_32 sampleTileBits;
+    UINT_32 samplePixelBits;
+    UINT_32 pixelIndex;
+
+    if (isDepthSampleOrder)
+    {
+        samplePixelBits = bpp * numSamples;
+        pixelIndex = offset / samplePixelBits;
+        *pSample = (offset % samplePixelBits) / bpp;
+    }
+    else
+    {
+        sampleTileBits = MicroTilePixels * bpp * thickness;
+        *pSample = offset / sampleTileBits;
+        pixelIndex = (offset % sampleTileBits) / bpp;
+    }
+
+    if (microTileType != ADDR_THICK)
+    {
+        if (microTileType == ADDR_DISPLAYABLE) // displayable
+        {
+            switch (bpp)
+            {
+                case 8:
+                    x = pixelIndex & 0x7;
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,4));
+                    break;
+                case 16:
+                    x = pixelIndex & 0x7;
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,3));
+                    break;
+                case 32:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,1),_BIT(pixelIndex,0));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,2));
+                    break;
+                case 64:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                    break;
+                case 128:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,2),_BIT(pixelIndex,1));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,0));
+                    break;
+                default:
+                    break;
+            }
+        }
+        else if (microTileType == ADDR_NON_DISPLAYABLE || microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {
+            x = Bits2Number(3, _BIT(pixelIndex,4),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+            y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+        }
+        else if (microTileType == ADDR_ROTATED)
+        {
+            /*
+                8-Bit Elements
+                element_index[5:0] = { x[2], x[0], x[1], y[2], y[1], y[0] }
+
+                16-Bit Elements
+                element_index[5:0] = { x[2], x[1], x[0], y[2], y[1], y[0] }
+
+                32-Bit Elements
+                element_index[5:0] = { x[2], x[1], y[2], x[0], y[1], y[0] }
+
+                64-Bit Elements
+                element_index[5:0] = { y[2], x[2], x[1], y[1], x[0], y[0] }
+            */
+            switch(bpp)
+            {
+                case 8:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,4));
+                    y = pixelIndex & 0x7;
+                    break;
+                case 16:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,3));
+                    y = pixelIndex & 0x7;
+                    break;
+                case 32:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,2));
+                    y = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,1),_BIT(pixelIndex,0));
+                    break;
+                case 64:
+                    x = Bits2Number(3, _BIT(pixelIndex,4),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+
+        if (thickness > 1) // thick
+        {
+            z = Bits2Number(3, _BIT(pixelIndex,8),_BIT(pixelIndex,7),_BIT(pixelIndex,6));
+        }
+    }
+    else
+    {
+        ADDR_ASSERT((m_chipFamily >= ADDR_CHIP_FAMILY_CI) && (thickness > 1));
+        /*
+            8-Bit Elements and 16-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], z[0], y[1], x[1], y[0], x[0] }
+
+            32-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], y[1], z[0], x[1], y[0], x[0] }
+
+            64-Bit Elements and 128-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], y[1], x[1], z[0], y[0], x[0] }
+
+            The equation to compute the element index for the extra thick tile:
+            element_index[8] = z[2]
+        */
+        switch (bpp)
+        {
+            case 8:
+            case 16: // fall-through
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,4));
+                break;
+            case 32:
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,3));
+                break;
+            case 64:
+            case 128: // fall-through
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,3),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,2));
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        if (thickness == 8)
+        {
+            z += Bits2Number(3,_BIT(pixelIndex,8),0,0);
+        }
+    }
+
+    *pX = x;
+    *pY = y;
+    *pSlice += z;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceCoordFromAddrDispatch
+*
+*   @brief
+*       Compute (x,y,slice,sample) coordinates from surface address
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::DispatchComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    UINT_64             addr               = pIn->addr;
+    UINT_32             bitPosition        = pIn->bitPosition;
+    UINT_32             bpp                = pIn->bpp;
+    UINT_32             pitch              = pIn->pitch;
+    UINT_32             height             = pIn->height;
+    UINT_32             numSlices          = pIn->numSlices;
+    UINT_32             numSamples         = ((pIn->numSamples == 0) ? 1 : pIn->numSamples);
+    UINT_32             numFrags           = ((pIn->numFrags == 0) ? numSamples : pIn->numFrags);
+    AddrTileMode        tileMode           = pIn->tileMode;
+    UINT_32             tileBase           = pIn->tileBase;
+    UINT_32             compBits           = pIn->compBits;
+    AddrTileType        microTileType      = pIn->tileType;
+    BOOL_32             ignoreSE           = pIn->ignoreSE;
+    BOOL_32             isDepthSampleOrder = pIn->isDepth;
+    ADDR_TILEINFO*      pTileInfo          = pIn->pTileInfo;
+
+    UINT_32*            pX                 = &pOut->x;
+    UINT_32*            pY                 = &pOut->y;
+    UINT_32*            pSlice             = &pOut->slice;
+    UINT_32*            pSample            = &pOut->sample;
+
+    if (microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+    {
+        isDepthSampleOrder = TRUE;
+    }
+
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (numFrags != numSamples)
+        {
+            numSamples = numFrags;
+        }
+
+        /// @note
+        /// 128 bit/thick tiled surface doesn't support display tiling and
+        /// mipmap chain must have the same tileType, so please fill tileType correctly
+        if (!IsLinear(pIn->tileMode))
+        {
+            if (bpp >= 128 || ComputeSurfaceThickness(tileMode) > 1)
+            {
+                ADDR_ASSERT(microTileType != ADDR_DISPLAYABLE);
+            }
+        }
+    }
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL://fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            ComputeSurfaceCoordFromAddrLinear(addr,
+                                              bitPosition,
+                                              bpp,
+                                              pitch,
+                                              height,
+                                              numSlices,
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pSample);
+            break;
+        case ADDR_TM_1D_TILED_THIN1://fall through
+        case ADDR_TM_1D_TILED_THICK:
+            ComputeSurfaceCoordFromAddrMicroTiled(addr,
+                                                  bitPosition,
+                                                  bpp,
+                                                  pitch,
+                                                  height,
+                                                  numSamples,
+                                                  tileMode,
+                                                  tileBase,
+                                                  compBits,
+                                                  pX,
+                                                  pY,
+                                                  pSlice,
+                                                  pSample,
+                                                  microTileType,
+                                                  isDepthSampleOrder);
+            break;
+        case ADDR_TM_2D_TILED_THIN1:    //fall through
+        case ADDR_TM_2D_TILED_THICK:    //fall through
+        case ADDR_TM_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_3D_TILED_THICK:    //fall through
+        case ADDR_TM_2D_TILED_XTHICK:   //fall through
+        case ADDR_TM_3D_TILED_XTHICK:   //fall through
+        case ADDR_TM_PRT_TILED_THIN1:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_TILED_THICK:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THICK://fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            ComputeSurfaceCoordFromAddrMacroTiled(addr,
+                                                  bitPosition,
+                                                  bpp,
+                                                  pitch,
+                                                  height,
+                                                  numSamples,
+                                                  tileMode,
+                                                  tileBase,
+                                                  compBits,
+                                                  microTileType,
+                                                  ignoreSE,
+                                                  isDepthSampleOrder,
+                                                  pipeSwizzle,
+                                                  bankSwizzle,
+                                                  pTileInfo,
+                                                  pX,
+                                                  pY,
+                                                  pSlice,
+                                                  pSample);
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+    }
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceCoordFromAddrMacroTiled
+*
+*   @brief
+*       Compute surface coordinates from address for macro tiled surface
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeSurfaceCoordFromAddrMacroTiled(
+    UINT_64             addr,               ///< [in] byte address
+    UINT_32             bitPosition,        ///< [in] bit position
+    UINT_32             bpp,                ///< [in] bits per pixel
+    UINT_32             pitch,              ///< [in] pitch in pixels
+    UINT_32             height,             ///< [in] height in pixels
+    UINT_32             numSamples,         ///< [in] number of samples
+    AddrTileMode        tileMode,           ///< [in] tile mode
+    UINT_32             tileBase,           ///< [in] tile base offset
+    UINT_32             compBits,           ///< [in] component bits (for planar surface)
+    AddrTileType        microTileType,      ///< [in] micro tiling type
+    BOOL_32             ignoreSE,           ///< [in] TRUE if shader engines can be ignored
+    BOOL_32             isDepthSampleOrder, ///< [in] TRUE if depth sample order is used
+    UINT_32             pipeSwizzle,        ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,        ///< [in] bank swizzle
+    ADDR_TILEINFO*      pTileInfo,          ///< [in] bank structure.
+                                            ///  **All fields to be valid on entry**
+    UINT_32*            pX,                 ///< [out] X coord
+    UINT_32*            pY,                 ///< [out] Y coord
+    UINT_32*            pSlice,             ///< [out] slice index
+    UINT_32*            pSample             ///< [out] sample index
+    ) const
+{
+    UINT_32 mx;
+    UINT_32 my;
+    UINT_64 tileBits;
+    UINT_64 macroTileBits;
+    UINT_32 slices;
+    UINT_32 tileSlices;
+    UINT_64 elementOffset;
+    UINT_64 macroTileIndex;
+    UINT_32 tileIndex;
+    UINT_64 totalOffset;
+
+
+    UINT_32 bank;
+    UINT_32 pipe;
+    UINT_32 groupBits = m_pipeInterleaveBytes << 3;
+    UINT_32 pipes = HwlGetPipes(pTileInfo);
+    UINT_32 banks = pTileInfo->banks;
+
+    UINT_32 bankInterleave = m_bankInterleave;
+
+    UINT_64 addrBits = BYTES_TO_BITS(addr) + bitPosition;
+
+    //
+    // remove bits for bank and pipe
+    //
+    totalOffset = (addrBits % groupBits) +
+        (((addrBits / groupBits / pipes) % bankInterleave) * groupBits) +
+        (((addrBits / groupBits / pipes) / bankInterleave) / banks) * groupBits * bankInterleave;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    UINT_32 microTileBits = bpp * microTileThickness * MicroTilePixels * numSamples;
+
+    UINT_32 microTileBytes = BITS_TO_BYTES(microTileBits);
+    //
+    // Determine if tiles need to be split across slices.
+    //
+    // If the size of the micro tile is larger than the tile split size, then the tile will be
+    // split across multiple slices.
+    //
+    UINT_32 slicesPerTile = 1; //_State->TileSlices
+
+    if ((microTileBytes > pTileInfo->tileSplitBytes) && (microTileThickness == 1))
+    {   //don't support for thick mode
+
+        //
+        // Compute the number of slices per tile.
+        //
+        slicesPerTile = microTileBytes / pTileInfo->tileSplitBytes;
+    }
+
+    tileBits = microTileBits / slicesPerTile; // micro tile bits
+
+    // in micro tiles because not MicroTileWidth timed.
+    UINT_32 macroWidth  = pTileInfo->bankWidth * pipes * pTileInfo->macroAspectRatio;
+    // in micro tiles as well
+    UINT_32 macroHeight = pTileInfo->bankHeight * banks / pTileInfo->macroAspectRatio;
+
+    UINT_32 pitchInMacroTiles = pitch / MicroTileWidth / macroWidth;
+
+    macroTileBits = (macroWidth * macroHeight) * tileBits / (banks * pipes);
+
+    macroTileIndex = totalOffset / macroTileBits;
+
+    // pitchMacros * height / heightMacros;  macroTilesPerSlice == _State->SliceMacros
+    UINT_32 macroTilesPerSlice = (pitch / (macroWidth * MicroTileWidth)) * height /
+        (macroHeight * MicroTileWidth);
+
+    slices = static_cast<UINT_32>(macroTileIndex / macroTilesPerSlice);
+
+    *pSlice = static_cast<UINT_32>(slices / slicesPerTile * microTileThickness);
+
+    //
+    // calculate element offset and x[2:0], y[2:0], z[1:0] for thick
+    //
+    tileSlices = slices % slicesPerTile;
+
+    elementOffset  = tileSlices * tileBits;
+    elementOffset += totalOffset % tileBits;
+
+    UINT_32 coordZ = 0;
+
+    HwlComputePixelCoordFromOffset(static_cast<UINT_32>(elementOffset),
+                                   bpp,
+                                   numSamples,
+                                   tileMode,
+                                   tileBase,
+                                   compBits,
+                                   pX,
+                                   pY,
+                                   &coordZ,
+                                   pSample,
+                                   microTileType,
+                                   isDepthSampleOrder);
+
+    macroTileIndex = macroTileIndex % macroTilesPerSlice;
+    *pY += static_cast<UINT_32>(macroTileIndex / pitchInMacroTiles * macroHeight * MicroTileHeight);
+    *pX += static_cast<UINT_32>(macroTileIndex % pitchInMacroTiles * macroWidth * MicroTileWidth);
+
+    *pSlice += coordZ;
+
+    tileIndex = static_cast<UINT_32>((totalOffset % macroTileBits) / tileBits);
+
+    my = (tileIndex / pTileInfo->bankWidth) % pTileInfo->bankHeight * MicroTileHeight;
+    mx = (tileIndex % pTileInfo->bankWidth) * pipes * MicroTileWidth;
+
+    *pY += my;
+    *pX += mx;
+
+    bank = ComputeBankFromAddr(addr, banks, pipes);
+    pipe = ComputePipeFromAddr(addr, pipes);
+
+    HwlComputeSurfaceCoord2DFromBankPipe(tileMode,
+                                         pX,
+                                         pY,
+                                         *pSlice,
+                                         bank,
+                                         pipe,
+                                         bankSwizzle,
+                                         pipeSwizzle,
+                                         tileSlices,
+                                         ignoreSE,
+                                         pTileInfo);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceCoord2DFromBankPipe
+*
+*   @brief
+*       Compute surface x,y coordinates from bank/pipe info
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeSurfaceCoord2DFromBankPipe(
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32             x,          ///< [in] x coordinate
+    UINT_32             y,          ///< [in] y coordinate
+    UINT_32             slice,      ///< [in] slice index
+    UINT_32             bank,       ///< [in] bank number
+    UINT_32             pipe,       ///< [in] pipe number
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             tileSlices, ///< [in] slices in a micro tile
+    ADDR_TILEINFO*      pTileInfo,  ///< [in] bank structure. **All fields to be valid on entry**
+    CoordFromBankPipe*  pOutput     ///< [out] pointer to extracted x/y bits
+    ) const
+{
+    UINT_32 yBit3 = 0;
+    UINT_32 yBit4 = 0;
+    UINT_32 yBit5 = 0;
+    UINT_32 yBit6 = 0;
+
+    UINT_32 xBit3 = 0;
+    UINT_32 xBit4 = 0;
+    UINT_32 xBit5 = 0;
+
+    UINT_32 tileSplitRotation;
+
+    UINT_32 numPipes = HwlGetPipes(pTileInfo);
+
+    UINT_32 bankRotation = ComputeBankRotation(tileMode,
+                                               pTileInfo->banks, numPipes);
+
+    UINT_32 pipeRotation = ComputePipeRotation(tileMode, numPipes);
+
+    UINT_32 xBit = x / (MicroTileWidth * pTileInfo->bankWidth * numPipes);
+    UINT_32 yBit = y / (MicroTileHeight * pTileInfo->bankHeight);
+
+    //calculate the bank and pipe before rotation and swizzle
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1:  //fall through
+        case ADDR_TM_2D_TILED_THICK:  //fall through
+        case ADDR_TM_2D_TILED_XTHICK: //fall through
+        case ADDR_TM_3D_TILED_THIN1:  //fall through
+        case ADDR_TM_3D_TILED_THICK:  //fall through
+        case ADDR_TM_3D_TILED_XTHICK:
+            tileSplitRotation = ((pTileInfo->banks / 2) + 1);
+            break;
+        default:
+            tileSplitRotation =  0;
+            break;
+    }
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    bank ^= tileSplitRotation * tileSlices;
+    if (pipeRotation == 0)
+    {
+        bank ^= bankRotation * (slice / microTileThickness) + bankSwizzle;
+        bank %= pTileInfo->banks;
+        pipe ^= pipeSwizzle;
+    }
+    else
+    {
+        bank ^= bankRotation * (slice / microTileThickness) / numPipes + bankSwizzle;
+        bank %= pTileInfo->banks;
+        pipe ^= pipeRotation * (slice / microTileThickness) + pipeSwizzle;
+    }
+
+    if (pTileInfo->macroAspectRatio == 1)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 2:
+                yBit3 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                break;
+            case 4:
+                yBit4 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                yBit3 = _BIT(bank, 1) ^ _BIT(xBit,1);
+                break;
+            case 8:
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                yBit5 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                yBit4 = _BIT(bank, 1) ^ _BIT(xBit,1) ^ yBit5;
+                break;
+            case 16:
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3);
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2);
+                yBit6 = _BIT(bank, 0) ^ _BIT(xBit, 0);
+                yBit5 = _BIT(bank, 1) ^ _BIT(xBit, 1) ^ yBit6;
+                break;
+            default:
+                break;
+        }
+
+    }
+    else if (pTileInfo->macroAspectRatio == 2)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 2: //xBit3 = yBit3^b0
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,0);
+                break;
+            case 4: //xBit3=yBit4^b0; yBit3=xBit4^b1
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,1);
+                yBit3 = _BIT(bank, 1) ^ _BIT(xBit,1);
+                break;
+            case 8: //xBit4, xBit5, yBit5 are known
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2);
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                yBit4 = _BIT(bank, 1) ^ _BIT(xBit,1) ^ _BIT(yBit, 2);
+                break;
+            case 16://x4,x5,x6,y6 are known
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3); //x3 = y6 ^ b0
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = x6 ^ b3
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2); //y4 = x5 ^ b2
+                yBit5 = _BIT(bank, 1) ^ _BIT(xBit, 1) ^ _BIT(yBit, 3); //y5=x4^y6^b1
+                break;
+            default:
+                break;
+        }
+    }
+    else if (pTileInfo->macroAspectRatio == 4)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 4: //yBit3, yBit4
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,1);
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,0);
+                break;
+            case 8: //xBit5, yBit4, yBit5
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2);
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,1) ^  _BIT(yBit,2);
+                break;
+            case 16: //xBit5, xBit6, yBit5, yBit6
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3);//x3 = b0 ^ y6
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit, 2) ^ _BIT(yBit, 3);//x4 = b1 ^ y5 ^ y6;
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = b3 ^ x6;
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2); //y4 = b2 ^ x5;
+                break;
+            default:
+                break;
+        }
+    }
+    else if (pTileInfo->macroAspectRatio == 8)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 8: //yBit3, yBit4, yBit5
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2); //x3 = b0 ^ y5;
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,1) ^ _BIT(yBit, 2);//x4 = b1 ^ y4 ^ y5;
+                xBit5 = _BIT(bank, 2) ^ _BIT(yBit,0);
+                break;
+            case 16: //xBit6, yBit4, yBit5, yBit6
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3);//x3 = y6 ^ b0
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit, 2) ^ _BIT(yBit, 3);//x4 = y5 ^ y6 ^ b1
+                xBit5 = _BIT(bank, 2) ^ _BIT(yBit, 1);//x5 = y4 ^ b2
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = x6 ^ b3
+                break;
+            default:
+                break;
+        }
+    }
+
+    pOutput->xBits = xBit;
+    pOutput->yBits = yBit;
+
+    pOutput->xBit3 = xBit3;
+    pOutput->xBit4 = xBit4;
+    pOutput->xBit5 = xBit5;
+    pOutput->yBit3 = yBit3;
+    pOutput->yBit4 = yBit4;
+    pOutput->yBit5 = yBit5;
+    pOutput->yBit6 = yBit6;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlExtractBankPipeSwizzle
+*   @brief
+*       Entry of EgBasedAddrLib ExtractBankPipeSwizzle
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlExtractBankPipeSwizzle(
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,   ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut   ///< [out] output structure
+    ) const
+{
+    ExtractBankPipeSwizzle(pIn->base256b,
+                           pIn->pTileInfo,
+                           &pOut->bankSwizzle,
+                           &pOut->pipeSwizzle);
+
+    return ADDR_OK;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlCombineBankPipeSwizzle
+*   @brief
+*       Combine bank/pipe swizzle
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlCombineBankPipeSwizzle(
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] tile info
+    UINT_64         baseAddr,       ///< [in] base address
+    UINT_32*        pTileSwizzle    ///< [out] combined swizzle
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if (pTileSwizzle)
+    {
+        *pTileSwizzle = GetBankPipeSwizzle(bankSwizzle, pipeSwizzle, baseAddr, pTileInfo);
+    }
+    else
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeBaseSwizzle
+*   @brief
+*       Compute base swizzle
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeBaseSwizzle(
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut
+    ) const
+{
+    UINT_32 bankSwizzle = 0;
+    UINT_32 pipeSwizzle = 0;
+    ADDR_TILEINFO* pTileInfo = pIn->pTileInfo;
+
+    ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+    ADDR_ASSERT(pIn->pTileInfo);
+
+    /// This is a legacy misreading of h/w doc, use it as it doesn't hurt.
+    static const UINT_8 bankRotationArray[4][16] = {
+        { 0, 0,  0, 0,  0, 0,  0, 0, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_2_BANK
+        { 0, 1,  2, 3,  0, 0,  0, 0, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_4_BANK
+        { 0, 3,  6, 1,  4, 7,  2, 5, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_8_BANK
+        { 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 }, // ADDR_SURF_16_BANK
+    };
+
+    UINT_32 banks = pTileInfo ? pTileInfo->banks : 2;
+    UINT_32 hwNumBanks;
+
+    // Uses less bank swizzle bits
+    if (pIn->option.reduceBankBit && banks > 2)
+    {
+        banks >>= 1;
+    }
+
+    switch (banks)
+    {
+        case 2:
+            hwNumBanks = 0;
+            break;
+        case 4:
+            hwNumBanks = 1;
+            break;
+        case 8:
+            hwNumBanks = 2;
+            break;
+        case 16:
+            hwNumBanks = 3;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            hwNumBanks = 0;
+            break;
+    }
+
+    if (pIn->option.genOption == ADDR_SWIZZLE_GEN_LINEAR)
+    {
+        bankSwizzle = pIn->surfIndex & (banks - 1);
+    }
+    else // (pIn->option.genOption == ADDR_SWIZZLE_GEN_DEFAULT)
+    {
+        bankSwizzle = bankRotationArray[hwNumBanks][pIn->surfIndex & (banks - 1)];
+    }
+
+    if (IsMacro3dTiled(pIn->tileMode))
+    {
+        pipeSwizzle = pIn->surfIndex & (HwlGetPipes(pTileInfo) - 1);
+    }
+
+    return HwlCombineBankPipeSwizzle(bankSwizzle, pipeSwizzle, pTileInfo, 0, &pOut->tileSwizzle);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ExtractBankPipeSwizzle
+*   @brief
+*       Extract bank/pipe swizzle from base256b
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ExtractBankPipeSwizzle(
+    UINT_32         base256b,       ///< [in] input base256b register value
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] 2D tile parameters. Client must provide all data
+    UINT_32*        pBankSwizzle,   ///< [out] bank swizzle
+    UINT_32*        pPipeSwizzle    ///< [out] pipe swizzle
+    ) const
+{
+    UINT_32 bankSwizzle = 0;
+    UINT_32 pipeSwizzle = 0;
+
+    if (base256b != 0)
+    {
+        UINT_32 numPipes        = HwlGetPipes(pTileInfo);
+        UINT_32 bankBits        = QLog2(pTileInfo->banks);
+        UINT_32 pipeBits        = QLog2(numPipes);
+        UINT_32 groupBytes      = m_pipeInterleaveBytes;
+        UINT_32 bankInterleave  = m_bankInterleave;
+
+        pipeSwizzle =
+            (base256b / (groupBytes >> 8)) & ((1<<pipeBits)-1);
+
+        bankSwizzle =
+            (base256b / (groupBytes >> 8) / numPipes / bankInterleave) & ((1 << bankBits) - 1);
+    }
+
+    *pPipeSwizzle = pipeSwizzle;
+    *pBankSwizzle = bankSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::GetBankPipeSwizzle
+*   @brief
+*       Combine bank/pipe swizzle
+*   @return
+*       Base256b bits (only filled bank/pipe bits)
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::GetBankPipeSwizzle(
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    UINT_64         baseAddr,       ///< [in] base address
+    ADDR_TILEINFO*  pTileInfo       ///< [in] tile info
+    ) const
+{
+    UINT_32 pipeBits = QLog2(HwlGetPipes(pTileInfo));
+    UINT_32 bankInterleaveBits = QLog2(m_bankInterleave);
+    UINT_32 tileSwizzle = pipeSwizzle + ((bankSwizzle << bankInterleaveBits) << pipeBits);
+
+    baseAddr ^= tileSwizzle * m_pipeInterleaveBytes;
+    baseAddr >>= 8;
+
+    return static_cast<UINT_32>(baseAddr);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSliceTileSwizzle
+*   @brief
+*       Compute cubemap/3d texture faces/slices tile swizzle
+*   @return
+*       Tile swizzle
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeSliceTileSwizzle(
+    AddrTileMode        tileMode,       ///< [in] Tile mode
+    UINT_32             baseSwizzle,    ///< [in] Base swizzle
+    UINT_32             slice,          ///< [in] Slice index, Cubemap face index, 0 means +X
+    UINT_64             baseAddr,       ///< [in] Base address
+    ADDR_TILEINFO* pTileInfo       ///< [in] Bank structure
+    ) const
+{
+    UINT_32 tileSwizzle = 0;
+
+    if (IsMacroTiled(tileMode)) // Swizzle only for macro tile mode
+    {
+        UINT_32 firstSlice = slice / ComputeSurfaceThickness(tileMode);
+
+        UINT_32 numPipes = HwlGetPipes(pTileInfo);
+        UINT_32 numBanks = pTileInfo->banks;
+
+        UINT_32 pipeRotation;
+        UINT_32 bankRotation;
+
+        UINT_32 bankSwizzle = 0;
+        UINT_32 pipeSwizzle = 0;
+
+        pipeRotation = ComputePipeRotation(tileMode, numPipes);
+        bankRotation = ComputeBankRotation(tileMode, numBanks, numPipes);
+
+        if (baseSwizzle != 0)
+        {
+            ExtractBankPipeSwizzle(baseSwizzle,
+                                   pTileInfo,
+                                   &bankSwizzle,
+                                   &pipeSwizzle);
+        }
+
+        if (pipeRotation == 0) //2D mode
+        {
+            bankSwizzle += firstSlice * bankRotation;
+            bankSwizzle %= numBanks;
+        }
+        else //3D mode
+        {
+            pipeSwizzle += firstSlice * pipeRotation;
+            pipeSwizzle %= numPipes;
+            bankSwizzle += firstSlice * bankRotation / numPipes;
+            bankSwizzle %= numBanks;
+        }
+
+        tileSwizzle = GetBankPipeSwizzle(bankSwizzle,
+                                         pipeSwizzle,
+                                         baseAddr,
+                                         pTileInfo);
+    }
+
+    return tileSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeQbStereoRightSwizzle
+*
+*   @brief
+*       Compute right eye swizzle
+*   @return
+*       swizzle
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeQbStereoRightSwizzle(
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pInfo  ///< [in] Surface info, must be valid
+    ) const
+{
+    UINT_32 bankBits    = 0;
+    UINT_32 swizzle     = 0;
+
+    // The assumption is default swizzle for left eye is 0
+    if (IsMacroTiled(pInfo->tileMode) && pInfo->pStereoInfo && pInfo->pTileInfo)
+    {
+        bankBits = ComputeBankFromCoord(0, pInfo->height, 0,
+                                        pInfo->tileMode, 0, 0, pInfo->pTileInfo);
+
+        if (bankBits)
+        {
+            HwlCombineBankPipeSwizzle(bankBits, 0, pInfo->pTileInfo, 0, &swizzle);
+        }
+    }
+
+    return swizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankFromCoord
+*
+*   @brief
+*       Compute bank number from coordinates
+*   @return
+*       Bank number
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankFromCoord(
+    UINT_32         x,              ///< [in] x coordinate
+    UINT_32         y,              ///< [in] y coordinate
+    UINT_32         slice,          ///< [in] slice index
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         tileSplitSlice, ///< [in] If the size of the pixel offset is larger than the
+                                    ///  tile split size, then the pixel will be moved to a separate
+                                    ///  slice. This value equals pixelOffset / tileSplitBytes
+                                    ///  in this case. Otherwise this is 0.
+    ADDR_TILEINFO*  pTileInfo       ///< [in] tile info
+    ) const
+{
+    UINT_32 pipes = HwlGetPipes(pTileInfo);
+    UINT_32 bankBit0 = 0;
+    UINT_32 bankBit1 = 0;
+    UINT_32 bankBit2 = 0;
+    UINT_32 bankBit3 = 0;
+    UINT_32 sliceRotation;
+    UINT_32 tileSplitRotation;
+    UINT_32 bank;
+    UINT_32 numBanks    = pTileInfo->banks;
+    UINT_32 bankWidth   = pTileInfo->bankWidth;
+    UINT_32 bankHeight  = pTileInfo->bankHeight;
+
+    UINT_32 tx = x / MicroTileWidth / (bankWidth * pipes);
+    UINT_32 ty = y / MicroTileHeight / bankHeight;
+
+    UINT_32 x3 = _BIT(tx,0);
+    UINT_32 x4 = _BIT(tx,1);
+    UINT_32 x5 = _BIT(tx,2);
+    UINT_32 x6 = _BIT(tx,3);
+    UINT_32 y3 = _BIT(ty,0);
+    UINT_32 y4 = _BIT(ty,1);
+    UINT_32 y5 = _BIT(ty,2);
+    UINT_32 y6 = _BIT(ty,3);
+
+    switch (numBanks)
+    {
+        case 16:
+            bankBit0 = x3 ^ y6;
+            bankBit1 = x4 ^ y5 ^ y6;
+            bankBit2 = x5 ^ y4;
+            bankBit3 = x6 ^ y3;
+            break;
+        case 8:
+            bankBit0 = x3 ^ y5;
+            bankBit1 = x4 ^ y4 ^ y5;
+            bankBit2 = x5 ^ y3;
+            break;
+        case 4:
+            bankBit0 = x3 ^ y4;
+            bankBit1 = x4 ^ y3;
+            break;
+        case 2:
+            bankBit0 = x3 ^ y3;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    bank = bankBit0 | (bankBit1 << 1) | (bankBit2 << 2) | (bankBit3 << 3);
+
+    //Bits2Number(4, bankBit3, bankBit2, bankBit1, bankBit0);
+
+    bank = HwlPreAdjustBank((x / MicroTileWidth), bank, pTileInfo);
+    //
+    // Compute bank rotation for the slice.
+    //
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1:  // fall through
+        case ADDR_TM_2D_TILED_THICK:  // fall through
+        case ADDR_TM_2D_TILED_XTHICK:
+            sliceRotation = ((numBanks / 2) - 1) * (slice / microTileThickness);
+            break;
+        case ADDR_TM_3D_TILED_THIN1:  // fall through
+        case ADDR_TM_3D_TILED_THICK:  // fall through
+        case ADDR_TM_3D_TILED_XTHICK:
+            sliceRotation =
+                Max(1u, (pipes / 2) - 1) * (slice / microTileThickness) / pipes;
+            break;
+        default:
+            sliceRotation =  0;
+            break;
+    }
+
+
+    //
+    // Compute bank rotation for the tile split slice.
+    //
+    // The sample slice will be non-zero if samples must be split across multiple slices.
+    // This situation arises when the micro tile size multiplied yBit the number of samples exceeds
+    // the split size (set in GB_ADDR_CONFIG).
+    //
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1: //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1: //fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1: //fall through
+            tileSplitRotation = ((numBanks / 2) + 1) * tileSplitSlice;
+            break;
+        default:
+            tileSplitRotation =  0;
+            break;
+    }
+
+    //
+    // Apply bank rotation for the slice and tile split slice.
+    //
+    bank ^= bankSwizzle + sliceRotation;
+    bank ^= tileSplitRotation;
+
+    bank &= (numBanks - 1);
+
+    return bank;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankFromAddr
+*
+*   @brief
+*       Compute the bank number from an address
+*   @return
+*       Bank number
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankFromAddr(
+    UINT_64 addr,       ///< [in] address
+    UINT_32 numBanks,   ///< [in] number of banks
+    UINT_32 numPipes    ///< [in] number of pipes
+    ) const
+{
+    UINT_32 bank;
+
+    //
+    // The LSBs of the address are arranged as follows:
+    //   bank | bankInterleave | pipe | pipeInterleave
+    //
+    // To get the bank number, shift off the pipe interleave, pipe, and bank interlave bits and
+    // mask the bank bits.
+    //
+    bank = static_cast<UINT_32>(
+        (addr >> Log2(m_pipeInterleaveBytes * numPipes * m_bankInterleave)) &
+        (numBanks - 1)
+        );
+
+    return bank;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputePipeRotation
+*
+*   @brief
+*       Compute pipe rotation value
+*   @return
+*       Pipe rotation
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputePipeRotation(
+    AddrTileMode tileMode,  ///< [in] tile mode
+    UINT_32      numPipes   ///< [in] number of pipes
+    ) const
+{
+   UINT_32 rotation;
+
+    switch (tileMode)
+    {
+        case ADDR_TM_3D_TILED_THIN1:        //fall through
+        case ADDR_TM_3D_TILED_THICK:        //fall through
+        case ADDR_TM_3D_TILED_XTHICK:       //fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            rotation = (numPipes < 4) ? 1 : (numPipes / 2 - 1);
+            break;
+        default:
+            rotation = 0;
+    }
+
+    return rotation;
+}
+
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankRotation
+*
+*   @brief
+*       Compute bank rotation value
+*   @return
+*       Bank rotation
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankRotation(
+    AddrTileMode tileMode,  ///< [in] tile mode
+    UINT_32      numBanks,  ///< [in] number of banks
+    UINT_32      numPipes   ///< [in] number of pipes
+    ) const
+{
+    UINT_32 rotation;
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1: // fall through
+        case ADDR_TM_2D_TILED_THICK: // fall through
+        case ADDR_TM_2D_TILED_XTHICK:
+        case ADDR_TM_PRT_2D_TILED_THIN1:
+        case ADDR_TM_PRT_2D_TILED_THICK:
+            // Rotate banks per Z-slice yBit 1 for 4-bank or 3 for 8-bank
+            rotation =  numBanks / 2 - 1;
+            break;
+        case ADDR_TM_3D_TILED_THIN1: // fall through
+        case ADDR_TM_3D_TILED_THICK: // fall through
+        case ADDR_TM_3D_TILED_XTHICK:
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            rotation = (numPipes < 4) ? 1 : (numPipes / 2 - 1);    // rotate pipes & banks
+            break;
+        default:
+            rotation = 0;
+    }
+
+    return rotation;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeHtileBytes
+*
+*   @brief
+*       Compute htile size in bytes
+*
+*   @return
+*       Htile size in bytes
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeHtileBytes(
+    UINT_32 pitch,        ///< [in] pitch
+    UINT_32 height,       ///< [in] height
+    UINT_32 bpp,          ///< [in] bits per pixel
+    BOOL_32 isLinear,     ///< [in] if it is linear mode
+    UINT_32 numSlices,    ///< [in] number of slices
+    UINT_64* sliceBytes,  ///< [out] bytes per slice
+    UINT_32 baseAlign     ///< [in] base alignments
+    ) const
+{
+    UINT_64 surfBytes;
+
+    const UINT_64 HtileCacheLineSize = BITS_TO_BYTES(HtileCacheBits);
+
+    *sliceBytes = BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * bpp / 64);
+
+    if (m_configFlags.useHtileSliceAlign)
+    {
+        // Align the sliceSize to htilecachelinesize * pipes at first
+        *sliceBytes = PowTwoAlign(*sliceBytes, HtileCacheLineSize * m_pipes);
+        surfBytes  = *sliceBytes * numSlices;
+    }
+    else
+    {
+        // Align the surfSize to htilecachelinesize * pipes at last
+        surfBytes  = *sliceBytes * numSlices;
+        surfBytes  = PowTwoAlign(surfBytes, HtileCacheLineSize * m_pipes);
+    }
+
+    return surfBytes;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskInfo
+*
+*   @brief
+*       Compute fmask sizes include padded pitch, height, slices, total size in bytes,
+*       meanwhile output suitable tile mode and alignments as well. Results are returned
+*       through output parameters.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::DispatchComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut)  ///< [out] output structure
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    ADDR_COMPUTE_SURFACE_INFO_INPUT  surfIn     = {0};
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT surfOut    = {0};
+
+    // Setup input structure
+    surfIn.tileMode          = pIn->tileMode;
+    surfIn.width             = pIn->pitch;
+    surfIn.height            = pIn->height;
+    surfIn.numSlices         = pIn->numSlices;
+    surfIn.pTileInfo         = pIn->pTileInfo;
+    surfIn.tileType          = ADDR_NON_DISPLAYABLE;
+    surfIn.flags.fmask       = 1;
+
+    // Setup output structure
+    surfOut.pTileInfo       = pOut->pTileInfo;
+
+    // Setup hwl specific fields
+    HwlFmaskPreThunkSurfInfo(pIn, pOut, &surfIn, &surfOut);
+
+    surfIn.bpp = HwlComputeFmaskBits(pIn, &surfIn.numSamples);
+
+    // ComputeSurfaceInfo needs numSamples in surfOut as surface routines need adjusted numSamples
+    surfOut.numSamples = surfIn.numSamples;
+
+    retCode = HwlComputeSurfaceInfo(&surfIn, &surfOut);
+
+    // Save bpp field for surface dump support
+    surfOut.bpp = surfIn.bpp;
+
+    if (retCode == ADDR_OK)
+    {
+        pOut->bpp               = surfOut.bpp;
+        pOut->pitch             = surfOut.pitch;
+        pOut->height            = surfOut.height;
+        pOut->numSlices         = surfOut.depth;
+        pOut->fmaskBytes        = surfOut.surfSize;
+        pOut->baseAlign         = surfOut.baseAlign;
+        pOut->pitchAlign        = surfOut.pitchAlign;
+        pOut->heightAlign       = surfOut.heightAlign;
+
+        if (surfOut.depth > 1)
+        {
+            // For fmask, expNumSlices is stored in depth.
+            pOut->sliceSize = surfOut.surfSize / surfOut.depth;
+        }
+        else
+        {
+            pOut->sliceSize = surfOut.surfSize;
+        }
+
+        // Save numSamples field for surface dump support
+        pOut->numSamples        = surfOut.numSamples;
+
+        HwlFmaskPostThunkSurfInfo(&surfOut, pOut);
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlFmaskSurfaceInfo
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut   ///< [out] output structure
+    )
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    ADDR_TILEINFO tileInfo = {0};
+
+    // Use internal tile info if pOut does not have a valid pTileInfo
+    if (pOut->pTileInfo == NULL)
+    {
+        pOut->pTileInfo = &tileInfo;
+    }
+
+    retCode = DispatchComputeFmaskInfo(pIn, pOut);
+
+    if (retCode == ADDR_OK)
+    {
+        pOut->tileIndex =
+            HwlPostCheckTileIndex(pOut->pTileInfo, pIn->tileMode, ADDR_NON_DISPLAYABLE,
+                                  pOut->tileIndex);
+    }
+
+    // Resets pTileInfo to NULL if the internal tile info is used
+    if (pOut->pTileInfo == &tileInfo)
+    {
+        pOut->pTileInfo = NULL;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeFmaskAddrFromCoord
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskAddrFromCoord
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+#if ADDR_AM_BUILD
+    if ((pIn->x > pIn->pitch)               ||
+        (pIn->y > pIn->height)              ||
+        (pIn->numSamples > m_maxSamples)    ||
+        (pIn->sample >= m_maxSamples))
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+    else
+    {
+        pOut->addr = DispatchComputeFmaskAddrFromCoord(pIn, pOut);
+    }
+#endif
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeFmaskCoordFromAddr
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskCoordFromAddr
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+#if ADDR_AM_BUILD
+    if ((pIn->bitPosition >= 8) ||
+        (pIn->numSamples > m_maxSamples))
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+    else
+    {
+        DispatchComputeFmaskCoordFromAddr(pIn, pOut);
+    }
+#endif
+
+    return retCode;
+}
+
+#if ADDR_AM_BUILD
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate.
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::DispatchComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    UINT_32             x                 = pIn->x;
+    UINT_32             y                 = pIn->y;
+    UINT_32             slice             = pIn->slice;
+    UINT_32             sample            = pIn->sample;
+    UINT_32             plane             = pIn->plane;
+    UINT_32             pitch             = pIn->pitch;
+    UINT_32             height            = pIn->height;
+    UINT_32             numSamples        = pIn->numSamples;
+    AddrTileMode        tileMode          = pIn->tileMode;
+    BOOL_32             ignoreSE          = pIn->ignoreSE;
+    ADDR_TILEINFO*      pTileInfo         = pIn->pTileInfo;
+    BOOL_32             resolved          = pIn->resolved;
+
+    UINT_32* pBitPosition = &pOut->bitPosition;
+    UINT_64 addr          = 0;
+
+    ADDR_ASSERT(numSamples > 1);
+    ADDR_ASSERT(ComputeSurfaceThickness(tileMode) == 1);
+
+    switch (tileMode)
+    {
+        case ADDR_TM_1D_TILED_THIN1:
+            addr = ComputeFmaskAddrFromCoordMicroTiled(x,
+                                                       y,
+                                                       slice,
+                                                       sample,
+                                                       plane,
+                                                       pitch,
+                                                       height,
+                                                       numSamples,
+                                                       tileMode,
+                                                       resolved,
+                                                       pBitPosition);
+            break;
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            addr = ComputeFmaskAddrFromCoordMacroTiled(x,
+                                                       y,
+                                                       slice,
+                                                       sample,
+                                                       plane,
+                                                       pitch,
+                                                       height,
+                                                       numSamples,
+                                                       tileMode,
+                                                       pipeSwizzle,
+                                                       bankSwizzle,
+                                                       ignoreSE,
+                                                       pTileInfo,
+                                                       resolved,
+                                                       pBitPosition);
+            break;
+        default:
+            *pBitPosition = 0;
+            break;
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskAddrFromCoordMicroTiled
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate for 1D tilied (micro
+*       tiled)
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeFmaskAddrFromCoordMicroTiled(
+    UINT_32             x,              ///< [in] x coordinate
+    UINT_32             y,              ///< [in] y coordinate
+    UINT_32             slice,          ///< [in] slice index
+    UINT_32             sample,         ///< [in] sample number
+    UINT_32             plane,          ///< [in] plane number
+    UINT_32             pitch,          ///< [in] surface pitch in pixels
+    UINT_32             height,         ///< [in] surface height in pixels
+    UINT_32             numSamples,     ///< [in] number of samples
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    BOOL_32             resolved,       ///< [in] TRUE if this is for resolved fmask
+    UINT_32*            pBitPosition    ///< [out] pointer to returned bit position
+    ) const
+{
+    UINT_64 addr = 0;
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    //
+    // 2xAA use the same layout as 4xAA
+    //
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    //
+    // Compute the number of planes.
+    //
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);;
+        effectiveBpp = numSamples;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples.
+        //
+        addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     plane, // sample
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     FALSE,
+                                                     pBitPosition);
+
+        //
+        // Compute the real bit position. Each (sample, plane) is stored with one bit per sample.
+        //
+
+        //
+        // Compute the pixel index with in the micro tile
+        //
+        UINT_32 pixelIndex = ComputePixelIndexWithinMicroTile(x % 8,
+                                                              y % 8,
+                                                              slice,
+                                                              1,
+                                                              tileMode,
+                                                              ADDR_NON_DISPLAYABLE);
+
+        *pBitPosition = ((pixelIndex * numSamples) + sample) & (BITS_PER_BYTE-1);
+
+        UINT_64 bitAddr = BYTES_TO_BITS(addr) + *pBitPosition;
+
+        addr = bitAddr / 8;
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples.
+        //
+        addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     TRUE,
+                                                     pBitPosition);
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskAddrFromCoordMacroTiled
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate for 2D tilied (macro
+*       tiled)
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeFmaskAddrFromCoordMacroTiled(
+    UINT_32             x,              ///< [in] x coordinate
+    UINT_32             y,              ///< [in] y coordinate
+    UINT_32             slice,          ///< [in] slice index
+    UINT_32             sample,         ///< [in] sample number
+    UINT_32             plane,          ///< [in] plane number
+    UINT_32             pitch,          ///< [in] surface pitch in pixels
+    UINT_32             height,         ///< [in] surface height in pixels
+    UINT_32             numSamples,     ///< [in] number of samples
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    UINT_32             pipeSwizzle,    ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,    ///< [in] bank swizzle
+    BOOL_32             ignoreSE,       ///< [in] TRUE if ignore shader engine
+    ADDR_TILEINFO*      pTileInfo,      ///< [in] bank structure.**All fields to be valid on entry**
+    BOOL_32             resolved,       ///< [in] TRUE if this is for resolved fmask
+    UINT_32*            pBitPosition    ///< [out] pointer to returned bit position
+    ) const
+{
+    UINT_64 addr = 0;
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    //
+    // 2xAA use the same layout as 4xAA
+    //
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    //
+    // Compute the number of planes.
+    //
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+        effectiveBpp = numSamples;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples.
+        //
+        addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     plane, // sample
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,// isdisp
+                                                     ignoreSE,// ignore_shader
+                                                     FALSE,// depth_sample_order
+                                                     pipeSwizzle,
+                                                     bankSwizzle,
+                                                     pTileInfo,
+                                                     pBitPosition);
+
+        //
+        // Compute the real bit position. Each (sample, plane) is stored with one bit per sample.
+        //
+
+
+        //
+        // Compute the pixel index with in the micro tile
+        //
+        UINT_32 pixelIndex = ComputePixelIndexWithinMicroTile(x ,
+                                                              y ,
+                                                              slice,
+                                                              effectiveBpp,
+                                                              tileMode,
+                                                              ADDR_NON_DISPLAYABLE);
+
+        *pBitPosition = ((pixelIndex * numSamples) + sample) & (BITS_PER_BYTE-1);
+
+        UINT_64 bitAddr = BYTES_TO_BITS(addr) + *pBitPosition;
+
+        addr = bitAddr / 8;
+
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples.
+        //
+        addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     ignoreSE,
+                                                     TRUE,
+                                                     pipeSwizzle,
+                                                     bankSwizzle,
+                                                     pTileInfo,
+                                                     pBitPosition);
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskCoordFromAddrMicroTiled
+*
+*   @brief
+*       Compute (x,y,slice,sample,plane) coordinates from fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeFmaskCoordFromAddrMicroTiled(
+    UINT_64             addr,       ///< [in] byte address
+    UINT_32             bitPosition,///< [in] bit position
+    UINT_32             pitch,      ///< [in] pitch in pixels
+    UINT_32             height,     ///< [in] height in pixels
+    UINT_32             numSamples, ///< [in] number of samples (of color buffer)
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    BOOL_32             resolved,   ///< [in] TRUE if it is resolved fmask
+    UINT_32*            pX,         ///< [out] X coord
+    UINT_32*            pY,         ///< [out] Y coord
+    UINT_32*            pSlice,     ///< [out] slice index
+    UINT_32*            pSample,    ///< [out] sample index
+    UINT_32*            pPlane      ///< [out] plane index
+    ) const
+{
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    // 2xAA use the same layout as 4xAA
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+        effectiveBpp  = numSamples;
+
+        ComputeSurfaceCoordFromAddrMicroTiled(addr,
+                                              bitPosition,
+                                              effectiveBpp,
+                                              pitch,
+                                              height,
+                                              effectiveSamples,
+                                              tileMode,
+                                              0, // tileBase
+                                              0, // compBits
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pPlane,
+                                              ADDR_NON_DISPLAYABLE, // microTileType
+                                              FALSE  // isDepthSampleOrder
+                                              );
+
+
+        if ( pSample )
+        {
+            *pSample = bitPosition % numSamples;
+        }
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        ComputeSurfaceCoordFromAddrMicroTiled(addr,
+                                              bitPosition,
+                                              effectiveBpp,
+                                              pitch,
+                                              height,
+                                              effectiveSamples,
+                                              tileMode,
+                                              0,     // tileBase
+                                              0,     // compBits
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pSample,
+                                              ADDR_NON_DISPLAYABLE, // microTileType
+                                              TRUE   // isDepthSampleOrder
+                                              );
+    }
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskCoordFromAddrMacroTiled
+*
+*   @brief
+*       Compute (x,y,slice,sample,plane) coordinates from
+*       fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeFmaskCoordFromAddrMacroTiled(
+    UINT_64             addr,       ///< [in] byte address
+    UINT_32             bitPosition,///< [in] bit position
+    UINT_32             pitch,      ///< [in] pitch in pixels
+    UINT_32             height,     ///< [in] height in pixels
+    UINT_32             numSamples, ///< [in] number of samples (of color buffer)
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    BOOL_32             ignoreSE,   ///< [in] TRUE if ignore shader engine
+    ADDR_TILEINFO*      pTileInfo,  ///< [in] bank structure. **All fields to be valid on entry**
+    BOOL_32             resolved,   ///< [in] TRUE if it is resolved fmask
+    UINT_32*            pX,         ///< [out] X coord
+    UINT_32*            pY,         ///< [out] Y coord
+    UINT_32*            pSlice,     ///< [out] slice index
+    UINT_32*            pSample,    ///< [out] sample index
+    UINT_32*            pPlane      ///< [out] plane index
+    ) const
+{
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    // 2xAA use the same layout as 4xAA
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    //
+    // Compute the number of planes.
+    //
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+        effectiveBpp  = numSamples;
+
+        ComputeSurfaceCoordFromAddrMacroTiled(addr,
+                                              bitPosition,
+                                              effectiveBpp,
+                                              pitch,
+                                              height,
+                                              effectiveSamples,
+                                              tileMode,
+                                              0, // No tileBase
+                                              0, // No compBits
+                                              ADDR_NON_DISPLAYABLE,
+                                              ignoreSE,
+                                              FALSE,
+                                              pipeSwizzle,
+                                              bankSwizzle,
+                                              pTileInfo,
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pPlane);
+
+        if (pSample)
+        {
+            *pSample = bitPosition % numSamples;
+        }
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        ComputeSurfaceCoordFromAddrMacroTiled(addr,
+                                              bitPosition,
+                                              effectiveBpp,
+                                              pitch,
+                                              height,
+                                              effectiveSamples,
+                                              tileMode,
+                                              0, // No tileBase
+                                              0, // No compBits
+                                              ADDR_NON_DISPLAYABLE,
+                                              ignoreSE,
+                                              TRUE,
+                                              pipeSwizzle,
+                                              bankSwizzle,
+                                              pTileInfo,
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pSample);
+    }
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Compute (x,y,slice,sample,plane) coordinates from
+*       fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::DispatchComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    UINT_64             addr              = pIn->addr;
+    UINT_32             bitPosition       = pIn->bitPosition;
+    UINT_32             pitch             = pIn->pitch;
+    UINT_32             height            = pIn->height;
+    UINT_32             numSamples        = pIn->numSamples;
+    AddrTileMode        tileMode          = pIn->tileMode;
+    BOOL_32             ignoreSE          = pIn->ignoreSE;
+    ADDR_TILEINFO*      pTileInfo         = pIn->pTileInfo;
+    BOOL_32             resolved          = pIn->resolved;
+
+    UINT_32*            pX      = &pOut->x;
+    UINT_32*            pY      = &pOut->y;
+    UINT_32*            pSlice  = &pOut->slice;
+    UINT_32*            pSample = &pOut->sample;
+    UINT_32*            pPlane  = &pOut->plane;
+
+    switch (tileMode)
+    {
+        case ADDR_TM_1D_TILED_THIN1:
+            ComputeFmaskCoordFromAddrMicroTiled(addr,
+                                                bitPosition,
+                                                pitch,
+                                                height,
+                                                numSamples,
+                                                tileMode,
+                                                resolved,
+                                                pX,
+                                                pY,
+                                                pSlice,
+                                                pSample,
+                                                pPlane);
+            break;
+        case ADDR_TM_2D_TILED_THIN1://fall through
+        case ADDR_TM_3D_TILED_THIN1:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            ComputeFmaskCoordFromAddrMacroTiled(addr,
+                                                bitPosition,
+                                                pitch,
+                                                height,
+                                                numSamples,
+                                                tileMode,
+                                                pipeSwizzle,
+                                                bankSwizzle,
+                                                ignoreSE,
+                                                pTileInfo,
+                                                resolved,
+                                                pX,
+                                                pY,
+                                                pSlice,
+                                                pSample,
+                                                pPlane);
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+
+    }
+}
+#endif
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskNumPlanesFromNumSamples
+*
+*   @brief
+*       Compute fmask number of planes from number of samples
+*
+*   @return
+*       Number of planes
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeFmaskNumPlanesFromNumSamples(
+    UINT_32 numSamples)     ///< [in] number of samples
+{
+    UINT_32 numPlanes;
+
+    //
+    // FMASK is stored such that each micro tile is composed of elements containing N bits, where
+    // N is the number of samples.  There is a micro tile for each bit in the FMASK address, and
+    // micro tiles for each address bit, sometimes referred to as a plane, are stored sequentially.
+    // The FMASK for a 2-sample surface looks like a general surface with 2 bits per element.
+    // The FMASK for a 4-sample surface looks like a general surface with 4 bits per element and
+    // 2 samples.  The FMASK for an 8-sample surface looks like a general surface with 8 bits per
+    // element and 4 samples.  R6xx and R7xx only stored 3 planes for 8-sample FMASK surfaces.
+    // This was changed for R8xx to simplify the logic in the CB.
+    //
+    switch (numSamples)
+    {
+        case 2:
+            numPlanes = 1;
+            break;
+        case 4:
+            numPlanes = 2;
+            break;
+        case 8:
+            numPlanes = 4;
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+            numPlanes = 0;
+            break;
+    }
+    return numPlanes;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskResolvedBppFromNumSamples
+*
+*   @brief
+*       Compute resolved fmask effective bpp based on number of samples
+*
+*   @return
+*       bpp
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeFmaskResolvedBppFromNumSamples(
+    UINT_32 numSamples)     ///< number of samples
+{
+    UINT_32 bpp;
+
+    //
+    // Resolved FMASK surfaces are generated yBit the CB and read yBit the texture unit
+    // so that the texture unit can read compressed multi-sample color data.
+    // These surfaces store each index value packed per element.
+    // Each element contains at least num_samples * log2(num_samples) bits.
+    // Resolved FMASK surfaces are addressed as follows:
+    // 2-sample Addressed similarly to a color surface with 8 bits per element and 1 sample.
+    // 4-sample Addressed similarly to a color surface with 8 bits per element and 1 sample.
+    // 8-sample Addressed similarly to a color surface with 32 bits per element and 1 sample.
+
+    switch (numSamples)
+    {
+        case 2:
+            bpp = 8;
+            break;
+        case 4:
+            bpp = 8;
+            break;
+        case 8:
+            bpp = 32;
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+            bpp = 0;
+            break;
+    }
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::IsTileInfoAllZero
+*
+*   @brief
+*       Return TRUE if all field are zero
+*   @note
+*       Since NULL input is consider to be all zero
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::IsTileInfoAllZero(
+    ADDR_TILEINFO* pTileInfo)
+{
+    BOOL_32 allZero = TRUE;
+
+    if (pTileInfo)
+    {
+        if ((pTileInfo->banks            != 0)  ||
+            (pTileInfo->bankWidth        != 0)  ||
+            (pTileInfo->bankHeight       != 0)  ||
+            (pTileInfo->macroAspectRatio != 0)  ||
+            (pTileInfo->tileSplitBytes   != 0)  ||
+            (pTileInfo->pipeConfig       != 0)
+            )
+        {
+            allZero = FALSE;
+        }
+    }
+
+    return allZero;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlTileInfoEqual
+*
+*   @brief
+*       Return TRUE if all field are equal
+*   @note
+*       Only takes care of current HWL's data
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlTileInfoEqual(
+    const ADDR_TILEINFO* pLeft, ///<[in] Left compare operand
+    const ADDR_TILEINFO* pRight ///<[in] Right compare operand
+    ) const
+{
+    BOOL_32 equal = FALSE;
+
+    if (pLeft->banks == pRight->banks           &&
+        pLeft->bankWidth == pRight->bankWidth   &&
+        pLeft->bankHeight == pRight->bankHeight &&
+        pLeft->macroAspectRatio == pRight->macroAspectRatio &&
+        pLeft->tileSplitBytes == pRight->tileSplitBytes)
+    {
+        equal = TRUE;
+    }
+
+    return equal;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlConvertTileInfoToHW
+*   @brief
+*       Entry of EgBasedAddrLib ConvertTileInfoToHW
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode   = ADDR_OK;
+
+    ADDR_TILEINFO *pTileInfoIn  = pIn->pTileInfo;
+    ADDR_TILEINFO *pTileInfoOut = pOut->pTileInfo;
+
+    if ((pTileInfoIn != NULL) && (pTileInfoOut != NULL))
+    {
+        if (pIn->reverse == FALSE)
+        {
+            switch (pTileInfoIn->banks)
+            {
+                case 2:
+                    pTileInfoOut->banks = 0;
+                    break;
+                case 4:
+                    pTileInfoOut->banks = 1;
+                    break;
+                case 8:
+                    pTileInfoOut->banks = 2;
+                    break;
+                case 16:
+                    pTileInfoOut->banks = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->banks = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankWidth)
+            {
+                case 1:
+                    pTileInfoOut->bankWidth = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->bankWidth = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->bankWidth = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankWidth = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankHeight)
+            {
+                case 1:
+                    pTileInfoOut->bankHeight = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->bankHeight = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->bankHeight = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankHeight = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->macroAspectRatio)
+            {
+                case 1:
+                    pTileInfoOut->macroAspectRatio = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->macroAspectRatio = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->macroAspectRatio = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->macroAspectRatio = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->tileSplitBytes)
+            {
+                case 64:
+                    pTileInfoOut->tileSplitBytes = 0;
+                    break;
+                case 128:
+                    pTileInfoOut->tileSplitBytes = 1;
+                    break;
+                case 256:
+                    pTileInfoOut->tileSplitBytes = 2;
+                    break;
+                case 512:
+                    pTileInfoOut->tileSplitBytes = 3;
+                    break;
+                case 1024:
+                    pTileInfoOut->tileSplitBytes = 4;
+                    break;
+                case 2048:
+                    pTileInfoOut->tileSplitBytes = 5;
+                    break;
+                case 4096:
+                    pTileInfoOut->tileSplitBytes = 6;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->tileSplitBytes = 0;
+                    break;
+            }
+        }
+        else
+        {
+            switch (pTileInfoIn->banks)
+            {
+                case 0:
+                    pTileInfoOut->banks = 2;
+                    break;
+                case 1:
+                    pTileInfoOut->banks = 4;
+                    break;
+                case 2:
+                    pTileInfoOut->banks = 8;
+                    break;
+                case 3:
+                    pTileInfoOut->banks = 16;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->banks = 2;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankWidth)
+            {
+                case 0:
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->bankWidth = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->bankWidth = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->bankWidth = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankHeight)
+            {
+                case 0:
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->bankHeight = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->bankHeight = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->bankHeight = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->macroAspectRatio)
+            {
+                case 0:
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->macroAspectRatio = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->macroAspectRatio = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->macroAspectRatio = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->tileSplitBytes)
+            {
+                case 0:
+                    pTileInfoOut->tileSplitBytes = 64;
+                    break;
+                case 1:
+                    pTileInfoOut->tileSplitBytes = 128;
+                    break;
+                case 2:
+                    pTileInfoOut->tileSplitBytes = 256;
+                    break;
+                case 3:
+                    pTileInfoOut->tileSplitBytes = 512;
+                    break;
+                case 4:
+                    pTileInfoOut->tileSplitBytes = 1024;
+                    break;
+                case 5:
+                    pTileInfoOut->tileSplitBytes = 2048;
+                    break;
+                case 6:
+                    pTileInfoOut->tileSplitBytes = 4096;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->tileSplitBytes = 64;
+                    break;
+            }
+        }
+
+        if (pTileInfoIn != pTileInfoOut)
+        {
+            pTileInfoOut->pipeConfig = pTileInfoIn->pipeConfig;
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceInfo
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if (pIn->numSamples < pIn->numFrags)
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    ADDR_TILEINFO tileInfo = {0};
+
+    if (retCode == ADDR_OK)
+    {
+        // Uses internal tile info if pOut does not have a valid pTileInfo
+        if (pOut->pTileInfo == NULL)
+        {
+            pOut->pTileInfo = &tileInfo;
+        }
+
+        if (!DispatchComputeSurfaceInfo(pIn, pOut))
+        {
+            retCode = ADDR_INVALIDPARAMS;
+        }
+
+        // Returns an index
+        pOut->tileIndex = HwlPostCheckTileIndex(pOut->pTileInfo,
+                                                pOut->tileMode,
+                                                pOut->tileType,
+                                                pOut->tileIndex);
+
+        if (IsMacroTiled(pOut->tileMode) && (pOut->macroModeIndex == TileIndexInvalid))
+        {
+            pOut->macroModeIndex = HwlComputeMacroModeIndex(pOut->tileIndex,
+                                                            pIn->flags,
+                                                            pIn->bpp,
+                                                            pIn->numSamples,
+                                                            pOut->pTileInfo);
+        }
+
+        // Resets pTileInfo to NULL if the internal tile info is used
+        if (pOut->pTileInfo == &tileInfo)
+        {
+#if DEBUG
+            // Client does not pass in a valid pTileInfo
+            if (IsMacroTiled(pOut->tileMode))
+            {
+                // If a valid index is returned, then no pTileInfo is okay
+                ADDR_ASSERT(!m_configFlags.useTileIndex || pOut->tileIndex != TileIndexInvalid);
+
+                if (!IsTileInfoAllZero(pIn->pTileInfo))
+                {
+                    // The initial value of pIn->pTileInfo is copied to tileInfo
+                    // We do not expect any of these value to be changed nor any 0 of inputs
+                    ADDR_ASSERT(tileInfo.banks == pIn->pTileInfo->banks);
+                    ADDR_ASSERT(tileInfo.bankWidth == pIn->pTileInfo->bankWidth);
+                    ADDR_ASSERT(tileInfo.bankHeight == pIn->pTileInfo->bankHeight);
+                    ADDR_ASSERT(tileInfo.macroAspectRatio == pIn->pTileInfo->macroAspectRatio);
+                    ADDR_ASSERT(tileInfo.tileSplitBytes == pIn->pTileInfo->tileSplitBytes);
+                }
+            }
+#endif
+            pOut->pTileInfo = NULL;
+        }
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceAddrFromCoord
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSurfaceAddrFromCoord
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if (
+#if !ALT_TEST // Overflow test needs this out-of-boundary coord
+        (pIn->x > pIn->pitch)   ||
+        (pIn->y > pIn->height)  ||
+#endif
+        (pIn->numSamples > m_maxSamples))
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+    else
+    {
+        pOut->addr = DispatchComputeSurfaceAddrFromCoord(pIn, pOut);
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceCoordFromAddr
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSurfaceCoordFromAddr
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if ((pIn->bitPosition >= 8) ||
+        (pIn->numSamples > m_maxSamples))
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+    else
+    {
+        DispatchComputeSurfaceCoordFromAddr(pIn, pOut);
+    }
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSliceTileSwizzle
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSurfaceCoordFromAddr
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSliceTileSwizzle(
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if (pIn->pTileInfo && (pIn->pTileInfo->banks > 0))
+    {
+
+        pOut->tileSwizzle = ComputeSliceTileSwizzle(pIn->tileMode,
+                                                    pIn->baseSwizzle,
+                                                    pIn->slice,
+                                                    pIn->baseAddr,
+                                                    pIn->pTileInfo);
+    }
+    else
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeHtileBpp
+*
+*   @brief
+*       Compute htile bpp
+*
+*   @return
+*       Htile bpp
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeHtileBpp(
+    BOOL_32 isWidth8,   ///< [in] TRUE if block width is 8
+    BOOL_32 isHeight8   ///< [in] TRUE if block height is 8
+    ) const
+{
+    // only support 8x8 mode
+    ADDR_ASSERT(isWidth8 && isHeight8);
+    return 32;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeHtileBaseAlign
+*
+*   @brief
+*       Compute htile base alignment
+*
+*   @return
+*       Htile base alignment
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeHtileBaseAlign(
+    BOOL_32         isTcCompatible, ///< [in] if TC compatible
+    BOOL_32         isLinear,       ///< [in] if it is linear mode
+    ADDR_TILEINFO*  pTileInfo       ///< [in] Tile info
+    ) const
+{
+    UINT_32 baseAlign = m_pipeInterleaveBytes * HwlGetPipes(pTileInfo);
+
+    if (isTcCompatible)
+    {
+        ADDR_ASSERT(pTileInfo != NULL);
+        if (pTileInfo)
+        {
+            baseAlign *= pTileInfo->banks;
+        }
+    }
+
+    return baseAlign;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface pitch alignment, calculation results are returned through
+*       output parameters.
+*
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples         ///< [in] number of samples
+    ) const
+{
+    UINT_32 pitchAlign;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    UINT_32 pixelsPerMicroTile;
+    UINT_32 pixelsPerPipeInterleave;
+    UINT_32 microTilesPerPipeInterleave;
+
+    //
+    // Special workaround for depth/stencil buffer, use 8 bpp to meet larger requirement for
+    // stencil buffer since pitch alignment is related to bpp.
+    // For a depth only buffer do not set this.
+    //
+    // Note: this actually does not work for mipmap but mipmap depth texture is not really
+    // sampled with mipmap.
+    //
+    if (flags.depth && !flags.noStencil)
+    {
+        bpp = 8;
+    }
+
+    pixelsPerMicroTile = MicroTilePixels * microTileThickness;
+    pixelsPerPipeInterleave = BYTES_TO_BITS(m_pipeInterleaveBytes) / (bpp * numSamples);
+    microTilesPerPipeInterleave = pixelsPerPipeInterleave / pixelsPerMicroTile;
+
+    pitchAlign = Max(MicroTileWidth, microTilesPerPipeInterleave * MicroTileWidth);
+
+    return pitchAlign;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlGetSizeAdjustmentMicroTiled
+*
+*   @brief
+*       Adjust 1D tiled surface pitch and slice size
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::HwlGetSizeAdjustmentMicroTiled(
+    UINT_32             thickness,      ///< [in] thickness
+    UINT_32             bpp,            ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] surface flags
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch
+    UINT_32*            pHeight         ///< [in/out] pointer to height
+    ) const
+{
+    UINT_64 logicalSliceSize;
+    UINT_64 physicalSliceSize;
+
+    UINT_32 pitch   = *pPitch;
+    UINT_32 height  = *pHeight;
+
+    // Logical slice: pitch * height * bpp * numSamples (no 1D MSAA so actually numSamples == 1)
+    logicalSliceSize = BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * bpp * numSamples);
+
+    // Physical slice: multiplied by thickness
+    physicalSliceSize =  logicalSliceSize * thickness;
+
+    //
+    // R800 will always pad physical slice size to baseAlign which is pipe_interleave_bytes
+    //
+    ADDR_ASSERT((physicalSliceSize % baseAlign) == 0)
+
+    return logicalSliceSize;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h
new file mode 100644
index 00000000000..84adb66eedc
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  egbaddrlib.h
+* @brief Contains the EgBasedAddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __EG_BASED_ADDR_LIB_H__
+#define __EG_BASED_ADDR_LIB_H__
+
+#include "addrlib.h"
+
+
+/// Structures for functions
+struct CoordFromBankPipe
+{
+    UINT_32 xBits : 3;
+    UINT_32 yBits : 4;
+
+    UINT_32 xBit3 : 1;
+    UINT_32 xBit4 : 1;
+    UINT_32 xBit5 : 1;
+    UINT_32 yBit3 : 1;
+    UINT_32 yBit4 : 1;
+    UINT_32 yBit5 : 1;
+    UINT_32 yBit6 : 1;
+};
+
+/**
+***************************************************************************************************
+* @brief This class is the Evergreen based address library
+* @note  Abstract class
+***************************************************************************************************
+*/
+class EgBasedAddrLib : public AddrLib
+{
+protected:
+    EgBasedAddrLib(const AddrClient* pClient);
+    virtual ~EgBasedAddrLib();
+
+public:
+
+    /// Surface info functions
+
+    // NOTE: DispatchComputeSurfaceInfo using TileInfo takes both an input and an output.
+    //       On input:
+    //       One or more fields may be 0 to be calculated/defaulted - pre-SI h/w.
+    //       H/W using tile mode index only accepts none or all 0's - SI and newer h/w.
+    //       It then returns the actual tiling configuration used.
+    //       Other methods' TileInfo must be valid on entry
+    BOOL_32 DispatchComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE DispatchComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+protected:
+    // Hwl interface
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlCombineBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, ADDR_TILEINFO*  pTileInfo,
+        UINT_64 baseAddr, UINT_32* pTileSwizzle) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    virtual UINT_32 HwlComputeHtileBpp(
+        BOOL_32 isWidth8, BOOL_32 isHeight8) const;
+
+    virtual UINT_32 HwlComputeHtileBaseAlign(
+        BOOL_32 isTcCompatible, BOOL_32 isLinear, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    virtual BOOL_32 HwlDegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    virtual UINT_32 HwlComputeQbStereoRightSwizzle(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pInfo) const;
+
+    virtual VOID HwlComputePixelCoordFromOffset(
+        UINT_32 offset, UINT_32 bpp, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const;
+
+    /// Return Cmask block max
+    virtual BOOL_32 HwlGetMaxCmaskBlockMax() const
+    {
+        return 16383; // 14 bits
+    }
+
+    // Sub-hwl interface
+    /// Pure virtual function to setup tile info (indices) if client requests to do so
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Pure virtual function to get pitch alignment for linear modes
+    virtual UINT_32 HwlGetPitchAlignmentLinear(UINT_32 bpp, ADDR_SURFACE_FLAGS flags) const = 0;
+
+    /// Pure virtual function to get size adjustment for linear modes
+    virtual UINT_64 HwlGetSizeAdjustmentLinear(
+        AddrTileMode tileMode,
+        UINT_32 bpp, UINT_32 numSamples, UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight, UINT_32 *pHeightAlign) const = 0;
+
+    virtual UINT_32 HwlGetPitchAlignmentMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentMicroTiled(
+        UINT_32 thickness, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight) const;
+
+        /// Pure virtual function to do extra sanity check
+    virtual BOOL_32 HwlSanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure virtual function to check current level to be the last macro tiled one
+    virtual VOID HwlCheckLastMacroTiledLvl(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Adjusts bank before bank is modified by rotation
+    virtual UINT_32 HwlPreAdjustBank(
+        UINT_32 tileX, UINT_32 bank, ADDR_TILEINFO*  pTileInfo) const = 0;
+
+    virtual VOID HwlComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32* pX, UINT_32* pY, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const = 0;
+
+    virtual BOOL_32 HwlTileInfoEqual(
+        const ADDR_TILEINFO* pLeft, const ADDR_TILEINFO* pRight) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const
+    {
+        return TileIndexInvalid;
+    }
+
+    virtual VOID HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const
+    {
+    }
+
+    virtual VOID HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const
+    {
+    }
+
+    /// Virtual function to check if the height needs extra padding
+    /// for stereo right eye offset, to avoid bank pipe swizzle
+    virtual BOOL_32 HwlStereoCheckRightOffsetPadding() const
+    {
+        return FALSE;
+    }
+
+    virtual BOOL_32 HwlReduceBankWidthHeight(
+        UINT_32 tileSize, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 bankHeightAlign, UINT_32 pipes,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    // Protected non-virtual functions
+
+    /// Mip level functions
+    AddrTileMode ComputeSurfaceMipLevelTileMode(
+        AddrTileMode baseTileMode, UINT_32 bpp,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices, UINT_32 numSamples,
+        UINT_32 pitchAlign, UINT_32 heightAlign,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    /// Swizzle functions
+    VOID ExtractBankPipeSwizzle(
+        UINT_32 base256b, ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBankSwizzle, UINT_32* pPipeSwizzle) const;
+
+    UINT_32 GetBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle,
+        UINT_64 baseAddr, ADDR_TILEINFO*  pTileInfo) const;
+
+    UINT_32 ComputeSliceTileSwizzle(
+        AddrTileMode tileMode, UINT_32 baseSwizzle, UINT_32 slice, UINT_64 baseAddr,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    /// Addressing functions
+    UINT_32 ComputeBankFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice,
+        AddrTileMode tileMode, UINT_32 bankSwizzle, UINT_32 tileSpitSlice,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    UINT_32 ComputeBankFromAddr(
+        UINT_64 addr, UINT_32 numBanks, UINT_32 numPipes) const;
+
+    UINT_32 ComputePipeRotation(
+        AddrTileMode tileMode, UINT_32 numPipes) const;
+
+    UINT_32 ComputeBankRotation(
+        AddrTileMode tileMode, UINT_32 numBanks,
+        UINT_32 numPipes) const;
+
+    VOID ComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32 x, UINT_32 y, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        ADDR_TILEINFO* pTileInfo,
+        CoordFromBankPipe *pOutput) const;
+
+    /// Htile/Cmask functions
+    UINT_64 ComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* sliceBytes, UINT_32 baseAlign) const;
+
+    // Static functions
+    static BOOL_32 IsTileInfoAllZero(ADDR_TILEINFO* pTileInfo);
+    static UINT_32 ComputeFmaskNumPlanesFromNumSamples(UINT_32 numSamples);
+    static UINT_32 ComputeFmaskResolvedBppFromNumSamples(UINT_32 numSamples);
+
+private:
+
+    BOOL_32 ComputeSurfaceInfoLinear(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims) const;
+
+    BOOL_32 ComputeSurfaceInfoMicroTiled(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims,
+        AddrTileMode expTileMode) const;
+
+    BOOL_32 ComputeSurfaceInfoMacroTiled(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims,
+        AddrTileMode expTileMode) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsLinear(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsMacroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 mipLevel, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    /// Surface addressing functions
+    UINT_64 DispatchComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    VOID    DispatchComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    UINT_64 ComputeSurfaceAddrFromCoordMicroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder,
+        UINT_32* pBitPosition) const;
+
+    UINT_64 ComputeSurfaceAddrFromCoordMacroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode,
+        AddrTileType microTileType, BOOL_32 ignoreSE, BOOL_32 isDepthSampleOrder,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBitPosition) const;
+
+    VOID    ComputeSurfaceCoordFromAddrMacroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        AddrTileType microTileType, BOOL_32 ignoreSE, BOOL_32 isDepthSampleOrder,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample) const;
+
+    /// Fmask functions
+    UINT_64 DispatchComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    VOID    DispatchComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    // FMASK related methods - private
+    UINT_64 ComputeFmaskAddrFromCoordMicroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample, UINT_32 plane,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples, AddrTileMode tileMode,
+        BOOL_32 resolved, UINT_32* pBitPosition) const;
+
+    VOID    ComputeFmaskCoordFromAddrMicroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, BOOL_32 resolved,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample, UINT_32* pPlane) const;
+
+    VOID    ComputeFmaskCoordFromAddrMacroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples, AddrTileMode tileMode,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo,
+        BOOL_32 resolved,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample, UINT_32* pPlane) const;
+
+    UINT_64 ComputeFmaskAddrFromCoordMacroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample, UINT_32 plane,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo,
+        BOOL_32 resolved,
+        UINT_32* pBitPosition) const;
+
+    /// Sanity check functions
+    BOOL_32 SanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const;
+
+protected:
+    UINT_32 m_ranks;                ///< Number of ranks - MC_ARB_RAMCFG.NOOFRANK
+    UINT_32 m_logicalBanks;         ///< Logical banks = m_banks * m_ranks if m_banks != 16
+    UINT_32 m_bankInterleave;       ///< Bank interleave, as a multiple of pipe interleave size
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp
new file mode 100644
index 00000000000..a858b55b7cf
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp
@@ -0,0 +1,2818 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  siaddrlib.cpp
+* @brief Contains the implementation for the SIAddrLib class.
+***************************************************************************************************
+*/
+
+#include "siaddrlib.h"
+
+#include "si_gb_reg.h"
+
+#include "si_ci_vi_merged_enum.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "si_id.h"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrSIHwlInit
+*
+*   @brief
+*       Creates an SIAddrLib object.
+*
+*   @return
+*       Returns an SIAddrLib object pointer.
+***************************************************************************************************
+*/
+AddrLib* AddrSIHwlInit(const AddrClient* pClient)
+{
+    return SIAddrLib::CreateObj(pClient);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::SIAddrLib
+*
+*   @brief
+*       Constructor
+*
+***************************************************************************************************
+*/
+SIAddrLib::SIAddrLib(const AddrClient* pClient) :
+    EgBasedAddrLib(pClient),
+    m_noOfEntries(0)
+{
+    m_class = SI_ADDRLIB;
+    memset(&m_settings, 0, sizeof(m_settings));
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::~SIAddrLib
+*
+*   @brief
+*       Destructor
+***************************************************************************************************
+*/
+SIAddrLib::~SIAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPipes
+*
+*   @brief
+*       Get number pipes
+*   @return
+*       num pipes
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPipes(
+    const ADDR_TILEINFO* pTileInfo    ///< [in] Tile info
+    ) const
+{
+    UINT_32 numPipes;
+
+    if (pTileInfo)
+    {
+        numPipes = GetPipePerSurf(pTileInfo->pipeConfig);
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        numPipes = m_pipes; // Suppose we should still have a global pipes
+    }
+
+    return numPipes;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::GetPipePerSurf
+*   @brief
+*       get pipe num base on inputing tileinfo->pipeconfig
+*   @return
+*       pipe number
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::GetPipePerSurf(
+    AddrPipeCfg pipeConfig   ///< [in] pipe config
+    ) const
+{
+    UINT_32 numPipes = 0;
+
+    switch (pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            numPipes = 2;
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+        case ADDR_PIPECFG_P4_16x16:
+        case ADDR_PIPECFG_P4_16x32:
+        case ADDR_PIPECFG_P4_32x32:
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+        case ADDR_PIPECFG_P8_16x32_8x16:
+        case ADDR_PIPECFG_P8_32x32_8x16:
+        case ADDR_PIPECFG_P8_16x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_16x32:
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            numPipes = 16;
+            break;
+        default:
+            ADDR_ASSERT(!"Invalid pipe config");
+            numPipes = m_pipes;
+    }
+    return numPipes;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ComputePipeFromCoord
+*
+*   @brief
+*       Compute pipe number from coordinates
+*   @return
+*       Pipe number
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::ComputePipeFromCoord(
+    UINT_32         x,              ///< [in] x coordinate
+    UINT_32         y,              ///< [in] y coordinate
+    UINT_32         slice,          ///< [in] slice index
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    BOOL_32         ignoreSE,       ///< [in] TRUE if shader engines are ignored
+    ADDR_TILEINFO*  pTileInfo       ///< [in] Tile info
+    ) const
+{
+    UINT_32 pipe;
+    UINT_32 pipeBit0 = 0;
+    UINT_32 pipeBit1 = 0;
+    UINT_32 pipeBit2 = 0;
+    UINT_32 pipeBit3 = 0;
+    UINT_32 sliceRotation;
+    UINT_32 numPipes = 0;
+
+    UINT_32 tx = x / MicroTileWidth;
+    UINT_32 ty = y / MicroTileHeight;
+    UINT_32 x3 = _BIT(tx,0);
+    UINT_32 x4 = _BIT(tx,1);
+    UINT_32 x5 = _BIT(tx,2);
+    UINT_32 x6 = _BIT(tx,3);
+    UINT_32 y3 = _BIT(ty,0);
+    UINT_32 y4 = _BIT(ty,1);
+    UINT_32 y5 = _BIT(ty,2);
+    UINT_32 y6 = _BIT(ty,3);
+
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            pipeBit0 = x3 ^ y3;
+            numPipes = 2;
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            pipeBit0 = x4 ^ y3;
+            pipeBit1 = x3 ^ y4;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y5;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            pipeBit0 = x3 ^ y3 ^ x5;
+            pipeBit1 = x5 ^ y5;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x4 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x5 ^ y4;
+            pipeBit2 = x4 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y6;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            pipeBit0 = x3 ^ y3 ^ x5;
+            pipeBit1 = x6 ^ y5;
+            pipeBit2 = x5 ^ y6;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            pipeBit0 = x4 ^ y3;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x5 ^ y6;
+            pipeBit3 = x6 ^ y5;
+            numPipes = 16;
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            pipeBit2 = x5 ^ y6;
+            pipeBit3 = x6 ^ y5;
+            numPipes = 16;
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+    pipe = pipeBit0 | (pipeBit1 << 1) | (pipeBit2 << 2) | (pipeBit3 << 3);
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    //
+    // Apply pipe rotation for the slice.
+    //
+    switch (tileMode)
+    {
+        case ADDR_TM_3D_TILED_THIN1:    //fall through thin
+        case ADDR_TM_3D_TILED_THICK:    //fall through thick
+        case ADDR_TM_3D_TILED_XTHICK:
+            sliceRotation =
+                Max(1, static_cast<INT_32>(numPipes / 2) - 1) * (slice / microTileThickness);
+            break;
+        default:
+            sliceRotation = 0;
+            break;
+    }
+    pipeSwizzle += sliceRotation;
+    pipeSwizzle &= (numPipes - 1);
+
+    pipe = pipe ^ pipeSwizzle;
+
+    return pipe;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ComputeTileCoordFromPipeAndElemIdx
+*
+*   @brief
+*       Compute (x,y) of a tile within a macro tile from address
+*   @return
+*       Pipe number
+***************************************************************************************************
+*/
+VOID SIAddrLib::ComputeTileCoordFromPipeAndElemIdx(
+    UINT_32         elemIdx,          ///< [in] per pipe element index within a macro tile
+    UINT_32         pipe,             ///< [in] pipe index
+    AddrPipeCfg     pipeCfg,          ///< [in] pipe config
+    UINT_32         pitchInMacroTile, ///< [in] surface pitch in macro tile
+    UINT_32         x,                ///< [in] x coordinate of the (0,0) tile in a macro tile
+    UINT_32         y,                ///< [in] y coordinate of the (0,0) tile in a macro tile
+    UINT_32*        pX,               ///< [out] x coordinate
+    UINT_32*        pY                ///< [out] y coordinate
+    ) const
+{
+    UINT_32 pipebit0 = _BIT(pipe,0);
+    UINT_32 pipebit1 = _BIT(pipe,1);
+    UINT_32 pipebit2 = _BIT(pipe,2);
+    UINT_32 pipebit3 = _BIT(pipe,3);
+    UINT_32 elemIdx0 = _BIT(elemIdx,0);
+    UINT_32 elemIdx1 = _BIT(elemIdx,1);
+    UINT_32 elemIdx2 = _BIT(elemIdx,2);
+    UINT_32 x3 = 0;
+    UINT_32 x4 = 0;
+    UINT_32 x5 = 0;
+    UINT_32 x6 = 0;
+    UINT_32 y3 = 0;
+    UINT_32 y4 = 0;
+    UINT_32 y5 = 0;
+    UINT_32 y6 = 0;
+
+    switch(pipeCfg)
+    {
+        case ADDR_PIPECFG_P2:
+            x4 = elemIdx2;
+            y4 = elemIdx1 ^ x4;
+            y3 = elemIdx0 ^ x4;
+            x3 = pipebit0 ^ y3;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            y3 = pipebit0 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            y4 = pipebit1 ^ x4;
+            x3 = pipebit0 ^ y3 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            x3 = elemIdx0 ^ pipebit0;
+            y5 = _BIT(y,5);
+            x4 = pipebit1 ^ y5;
+            y3 = pipebit0 ^ x3 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            x4 = elemIdx2;
+            y3 = elemIdx0 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                x5 = pipebit1 ^ y5;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                x5 = _BIT(x,5);
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            x4 = elemIdx0;
+            y5 = _BIT(y,5);
+            x5 = _BIT(x,5);
+            x3 = pipebit1 ^ y5;
+            y4 = pipebit2 ^ x4;
+            y3 = pipebit0 ^ x5 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            x3 = elemIdx0;
+            y4 = pipebit1 ^ x3;
+            y5 = _BIT(y,5);
+            x5 = _BIT(x,5);
+            x4 = pipebit2 ^ y5;
+            y3 = pipebit0 ^ x4 ^ x5;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            if((pitchInMacroTile % 2) == 0)
+            {  //even
+                y5 = _BIT(y,5);
+                x5 = _BIT(x,5);
+                x5 = pipebit2 ^ y5;
+                y3 = pipebit0 ^ x4 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {  //odd
+                x5 = _BIT(x,5);
+                y3 = pipebit0 ^ x4 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            x3 = elemIdx0;
+            x5 = _BIT(x,5);
+            y5 = _BIT(y,5);
+            x4 = pipebit2 ^ y5;
+            y4 = pipebit1 ^ x5;
+            y3 = pipebit0 ^ x3 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            x3 = y3^x4^pipebit0;
+            y4 = pipebit1 ^ x4;
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                x5 = pipebit2 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x4 = pipebit1 ^ y6;
+                y3 = elemIdx0 ^ x4;
+                y4 = elemIdx1 ^ x4;
+                x3 = pipebit0 ^ y3 ^ x4;
+                x5 = pipebit2 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                y6 = _BIT(y,6);
+                x4 = pipebit1 ^ y6;
+                y3 = elemIdx0 ^ x4;
+                y4 = elemIdx1 ^ x4;
+                x3 = pipebit0 ^ y3 ^ x4;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            x4 = elemIdx2;
+            y3 = elemIdx0 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit1 ^ y5;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5, x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            y3 = pipebit0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit3 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5,x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            y4 = pipebit1 ^ x4;
+            x3 = pipebit0 ^ y3 ^ x4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit3 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5, x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+    }
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::TileCoordToMaskElementIndex
+*
+*   @brief
+*       Compute element index from coordinates in tiles
+*   @return
+*       Element index
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::TileCoordToMaskElementIndex(
+    UINT_32         tx,                 ///< [in] x coord, in Tiles
+    UINT_32         ty,                 ///< [in] y coord, in Tiles
+    AddrPipeCfg     pipeConfig,         ///< [in] pipe config
+    UINT_32*        macroShift,         ///< [out] macro shift
+    UINT_32*        elemIdxBits         ///< [out] tile offset bits
+    ) const
+{
+    UINT_32 elemIdx = 0;
+    UINT_32 elemIdx0, elemIdx1, elemIdx2;
+    UINT_32 tx0, tx1;
+    UINT_32 ty0, ty1;
+
+    tx0 = _BIT(tx,0);
+    tx1 = _BIT(tx,1);
+    ty0 = _BIT(ty,0);
+    ty1 = _BIT(ty,1);
+
+    switch(pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            *macroShift = 3;
+            *elemIdxBits =3;
+            elemIdx2 = tx1;
+            elemIdx1 = tx1 ^ ty1;
+            elemIdx0 = tx1 ^ ty0;
+            elemIdx = Bits2Number(3,elemIdx2,elemIdx1,elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx1 = tx1;
+            elemIdx0 = tx1 ^ ty1;
+            elemIdx = Bits2Number(2,elemIdx1,elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            *macroShift = 2;
+            *elemIdxBits =3;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx2 = tx1;
+            elemIdx = Bits2Number(3, elemIdx2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx1;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx0;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx1 = tx1;
+            elemIdx0 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx0;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx0 =  tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            *macroShift = 1;
+            *elemIdxBits =3;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx2 = tx1;
+            elemIdx = Bits2Number(3, elemIdx2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            *macroShift = 0;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty1;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            *macroShift = 0;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    return elemIdx;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Compute the squared cache shape for per-tile data (CMASK and HTILE) for linear layout
+*
+*   @return
+*       N/A
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info
+    ) const
+{
+    ADDR_ASSERT(pTileInfo != NULL);
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+
+    /// In linear mode, the htile or cmask buffer must be padded out to 4 tiles
+    /// but for P8_32x64_32x32, it must be padded out to 8 tiles
+    /// Actually there are more pipe configs which need 8-tile padding but SI family
+    /// has a bug which is fixed in CI family
+    if ((pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P16_32x32_8x16) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x32_16x16))
+    {
+        macroWidth  = 8*MicroTileWidth;
+        macroHeight = 8*MicroTileHeight;
+    }
+    else
+    {
+        macroWidth  = 4*MicroTileWidth;
+        macroHeight = 4*MicroTileHeight;
+    }
+
+    *pMacroWidth    = macroWidth;
+    *pMacroHeight   = macroHeight;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeHtileBytes
+*
+*   @brief
+*       Compute htile size in bytes
+*
+*   @return
+*       Htile size in bytes
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlComputeHtileBytes(
+    UINT_32     pitch,          ///< [in] pitch
+    UINT_32     height,         ///< [in] height
+    UINT_32     bpp,            ///< [in] bits per pixel
+    BOOL_32     isLinear,       ///< [in] if it is linear mode
+    UINT_32     numSlices,      ///< [in] number of slices
+    UINT_64*    pSliceBytes,    ///< [out] bytes per slice
+    UINT_32     baseAlign       ///< [in] base alignments
+    ) const
+{
+    return ComputeHtileBytes(pitch, height, bpp, isLinear, numSlices, pSliceBytes, baseAlign);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskAddrFromCoord
+*
+*   @brief
+*       Compute address from coordinates for htile/cmask
+*   @return
+*       Byte address
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlComputeXmaskAddrFromCoord(
+    UINT_32        pitch,          ///< [in] pitch
+    UINT_32        height,         ///< [in] height
+    UINT_32        x,              ///< [in] x coord
+    UINT_32        y,              ///< [in] y coord
+    UINT_32        slice,          ///< [in] slice/depth index
+    UINT_32        numSlices,      ///< [in] number of slices
+    UINT_32        factor,         ///< [in] factor that indicates cmask(2) or htile(1)
+    BOOL_32        isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32        isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    BOOL_32        isHeight8,      ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    ADDR_TILEINFO* pTileInfo,      ///< [in] Tile info
+    UINT_32*       pBitPosition    ///< [out] bit position inside a byte
+    ) const
+{
+    UINT_32 tx = x / MicroTileWidth;
+    UINT_32 ty = y / MicroTileHeight;
+    UINT_32 newPitch;
+    UINT_32 newHeight;
+    UINT_64 totalBytes;
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+    UINT_64 pSliceBytes;
+    UINT_32 pBaseAlign;
+    UINT_32 tileNumPerPipe;
+    UINT_32 elemBits;
+
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 256;
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroWidth,
+                         &macroHeight);
+        elemBits = CmaskElemBits;
+    }
+    else //HTile
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 512;
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         TRUE,
+                         TRUE,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroWidth,
+                         &macroHeight,
+                         &pSliceBytes,
+                         &pBaseAlign);
+        elemBits = 32;
+    }
+
+    const UINT_32 pitchInTile = newPitch / MicroTileWidth;
+    const UINT_32 heightInTile = newHeight / MicroTileWidth;
+    UINT_64 macroOffset; // Per pipe starting offset of the macro tile in which this tile lies.
+    UINT_64 microNumber; // Per pipe starting offset of the macro tile in which this tile lies.
+    UINT_32 microX;
+    UINT_32 microY;
+    UINT_64 microOffset;
+    UINT_32 microShift;
+    UINT_64 totalOffset;
+    UINT_32 elemIdxBits;
+    UINT_32 elemIdx =
+        TileCoordToMaskElementIndex(tx, ty, pTileInfo->pipeConfig, &microShift, &elemIdxBits);
+
+    UINT_32 numPipes = HwlGetPipes(pTileInfo);
+
+    if (isLinear)
+    {   //linear addressing
+        // Linear addressing is extremelly wasting memory if slice > 1, since each pipe has the full
+        // slice memory foot print instead of divided by numPipes.
+        microX = tx / 4; // Macro Tile is 4x4
+        microY = ty / 4 ;
+        microNumber = static_cast<UINT_64>(microX + microY * (pitchInTile / 4)) << microShift;
+
+        UINT_32 sliceBits = pitchInTile * heightInTile;
+
+        // do htile single slice alignment if the flag is true
+        if (m_configFlags.useHtileSliceAlign && (factor == 1))  //Htile
+        {
+            sliceBits = PowTwoAlign(sliceBits, BITS_TO_BYTES(HtileCacheBits) * numPipes / elemBits);
+        }
+        macroOffset = slice * (sliceBits / numPipes) * elemBits ;
+    }
+    else
+    {   //tiled addressing
+        const UINT_32 macroWidthInTile = macroWidth / MicroTileWidth; // Now in unit of Tiles
+        const UINT_32 macroHeightInTile = macroHeight / MicroTileHeight;
+        const UINT_32 pitchInCL = pitchInTile / macroWidthInTile;
+        const UINT_32 heightInCL = heightInTile / macroHeightInTile;
+
+        const UINT_32 macroX = x / macroWidth;
+        const UINT_32 macroY = y / macroHeight;
+        const UINT_32 macroNumber = macroX + macroY * pitchInCL + slice * pitchInCL * heightInCL;
+
+        // Per pipe starting offset of the cache line in which this tile lies.
+        microX = (x % macroWidth) / MicroTileWidth / 4; // Macro Tile is 4x4
+        microY = (y % macroHeight) / MicroTileHeight / 4 ;
+        microNumber = static_cast<UINT_64>(microX + microY * (macroWidth / MicroTileWidth / 4)) << microShift;
+
+        macroOffset = macroNumber * tileNumPerPipe * elemBits;
+    }
+
+    if(elemIdxBits == microShift)
+    {
+        microNumber += elemIdx;
+    }
+    else
+    {
+        microNumber >>= elemIdxBits;
+        microNumber <<= elemIdxBits;
+        microNumber += elemIdx;
+    }
+
+    microOffset = elemBits * microNumber;
+    totalOffset = microOffset + macroOffset;
+
+    UINT_32 pipe = ComputePipeFromCoord(x, y, 0, ADDR_TM_2D_TILED_THIN1, 0, FALSE, pTileInfo);
+    UINT_64 addrInBits = totalOffset % (m_pipeInterleaveBytes * 8) +
+                   pipe * (m_pipeInterleaveBytes * 8) +
+                   totalOffset / (m_pipeInterleaveBytes * 8) * (m_pipeInterleaveBytes * 8) * numPipes;
+    *pBitPosition = static_cast<UINT_32>(addrInBits) % 8;
+    UINT_64 addr = addrInBits / 8;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskCoordFromAddr
+*
+*   @brief
+*       Compute the coord from an address of a cmask/htile
+*
+*   @return
+*       N/A
+*
+*   @note
+*       This method is reused by htile, so rename to Xmask
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeXmaskCoordFromAddr(
+    UINT_64         addr,           ///< [in] address
+    UINT_32         bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32         pitch,          ///< [in] pitch
+    UINT_32         height,         ///< [in] height
+    UINT_32         numSlices,      ///< [in] number of slices
+    UINT_32         factor,         ///< [in] factor that indicates cmask or htile
+    BOOL_32         isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32         isWidth8,       ///< [in] Not used by SI
+    BOOL_32         isHeight8,      ///< [in] Not used by SI
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] Tile info
+    UINT_32*        pX,             ///< [out] x coord
+    UINT_32*        pY,             ///< [out] y coord
+    UINT_32*        pSlice          ///< [out] slice index
+    ) const
+{
+    UINT_32 newPitch;
+    UINT_32 newHeight;
+    UINT_64 totalBytes;
+    UINT_32 clWidth;
+    UINT_32 clHeight;
+    UINT_32 tileNumPerPipe;
+    UINT_64 sliceBytes;
+
+    *pX = 0;
+    *pY = 0;
+    *pSlice = 0;
+
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 256;
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &clWidth,
+                         &clHeight);
+    }
+    else //HTile
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 512;
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         TRUE,
+                         TRUE,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &clWidth,
+                         &clHeight,
+                         &sliceBytes);
+    }
+
+    const UINT_32 pitchInTile = newPitch / MicroTileWidth;
+    const UINT_32 heightInTile = newHeight / MicroTileWidth;
+    const UINT_32 pitchInMacroTile = pitchInTile / 4;
+    UINT_32 macroShift;
+    UINT_32 elemIdxBits;
+    // get macroShift and elemIdxBits
+    TileCoordToMaskElementIndex(0, 0, pTileInfo->pipeConfig, &macroShift, &elemIdxBits);
+
+    const UINT_32 numPipes = HwlGetPipes(pTileInfo);
+    const UINT_32 pipe = (UINT_32)((addr / m_pipeInterleaveBytes) % numPipes);
+    // per pipe
+    UINT_64 localOffset = (addr % m_pipeInterleaveBytes) +
+        (addr / m_pipeInterleaveBytes / numPipes)* m_pipeInterleaveBytes;
+
+    UINT_32 tileIndex;
+    if (factor == 2) //CMASK
+    {
+        tileIndex = (UINT_32)(localOffset * 2 + (bitPosition != 0));
+    }
+    else
+    {
+        tileIndex = (UINT_32)(localOffset / 4);
+    }
+
+    UINT_32 macroOffset;
+    if (isLinear)
+    {
+        UINT_32 sliceSizeInTile = pitchInTile * heightInTile;
+
+        // do htile single slice alignment if the flag is true
+        if (m_configFlags.useHtileSliceAlign && (factor == 1))  //Htile
+        {
+            sliceSizeInTile = PowTwoAlign(sliceSizeInTile, static_cast<UINT_32>(sliceBytes) / 64);
+        }
+        *pSlice = tileIndex / (sliceSizeInTile / numPipes);
+        macroOffset = tileIndex % (sliceSizeInTile / numPipes);
+    }
+    else
+    {
+        const UINT_32 clWidthInTile = clWidth / MicroTileWidth; // Now in unit of Tiles
+        const UINT_32 clHeightInTile = clHeight / MicroTileHeight;
+        const UINT_32 pitchInCL = pitchInTile / clWidthInTile;
+        const UINT_32 heightInCL = heightInTile / clHeightInTile;
+        const UINT_32 clIndex = tileIndex / tileNumPerPipe;
+
+        UINT_32 clX = clIndex % pitchInCL;
+        UINT_32 clY = (clIndex % (heightInCL * pitchInCL)) / pitchInCL;
+
+        *pX = clX * clWidthInTile * MicroTileWidth;
+        *pY = clY * clHeightInTile * MicroTileHeight;
+        *pSlice = clIndex / (heightInCL * pitchInCL);
+
+        macroOffset = tileIndex % tileNumPerPipe;
+    }
+
+    UINT_32 elemIdx = macroOffset & 7;
+    macroOffset >>= elemIdxBits;
+
+    if (elemIdxBits != macroShift)
+    {
+        macroOffset <<= (elemIdxBits - macroShift);
+
+        UINT_32 pipebit1 = _BIT(pipe,1);
+        UINT_32 pipebit2 = _BIT(pipe,2);
+        UINT_32 pipebit3 = _BIT(pipe,3);
+        if (pitchInMacroTile % 2)
+        {   //odd
+            switch (pTileInfo->pipeConfig)
+            {
+                case ADDR_PIPECFG_P4_32x32:
+                    macroOffset |= pipebit1;
+                    break;
+                case ADDR_PIPECFG_P8_32x32_8x16:
+                case ADDR_PIPECFG_P8_32x32_16x16:
+                case ADDR_PIPECFG_P8_32x32_16x32:
+                    macroOffset |= pipebit2;
+                    break;
+                default:
+                    break;
+            }
+
+        }
+
+        if (pitchInMacroTile % 4)
+        {
+            if (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32)
+            {
+                macroOffset |= (pipebit1<<1);
+            }
+            if((pTileInfo->pipeConfig == ADDR_PIPECFG_P16_32x32_8x16) ||
+               (pTileInfo->pipeConfig == ADDR_PIPECFG_P16_32x32_16x16))
+            {
+                macroOffset |= (pipebit3<<1);
+            }
+        }
+    }
+
+    UINT_32 macroX;
+    UINT_32 macroY;
+
+    if (isLinear)
+    {
+        macroX = macroOffset % pitchInMacroTile;
+        macroY = macroOffset / pitchInMacroTile;
+    }
+    else
+    {
+        const UINT_32 clWidthInMacroTile = clWidth / (MicroTileWidth * 4);
+        macroX = macroOffset % clWidthInMacroTile;
+        macroY = macroOffset / clWidthInMacroTile;
+    }
+
+    *pX += macroX * 4 * MicroTileWidth;
+    *pY += macroY * 4 * MicroTileHeight;
+
+    UINT_32 microX;
+    UINT_32 microY;
+    ComputeTileCoordFromPipeAndElemIdx(elemIdx, pipe, pTileInfo->pipeConfig, pitchInMacroTile,
+                                       *pX, *pY, &microX, &microY);
+
+    *pX += microX * MicroTileWidth;
+    *pY += microY * MicroTileWidth;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPitchAlignmentLinear
+*   @brief
+*       Get pitch alignment
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPitchAlignmentLinear(
+    UINT_32             bpp,    ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags   ///< [in] surface flags
+    ) const
+{
+    UINT_32 pitchAlign;
+
+    // Interleaved access requires a 256B aligned pitch, so fall back to pre-SI alignment
+    if (flags.interleaved)
+    {
+        pitchAlign = Max(64u, m_pipeInterleaveBytes / BITS_TO_BYTES(bpp));
+
+    }
+    else
+    {
+        pitchAlign = Max(8u, 64 / BITS_TO_BYTES(bpp));
+    }
+
+    return pitchAlign;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetSizeAdjustmentLinear
+*
+*   @brief
+*       Adjust linear surface pitch and slice size
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlGetSizeAdjustmentLinear(
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    UINT_32             bpp,            ///< [in] bits per pixel
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch
+    UINT_32*            pHeight,        ///< [in/out] pointer to height
+    UINT_32*            pHeightAlign    ///< [in/out] pointer to height align
+    ) const
+{
+    UINT_64 sliceSize;
+    if (tileMode == ADDR_TM_LINEAR_GENERAL)
+    {
+        sliceSize = BITS_TO_BYTES(static_cast<UINT_64>(*pPitch) * (*pHeight) * bpp * numSamples);
+    }
+    else
+    {
+        UINT_32 pitch   = *pPitch;
+        UINT_32 height  = *pHeight;
+
+        UINT_32 pixelsPerPipeInterleave = m_pipeInterleaveBytes / BITS_TO_BYTES(bpp);
+        UINT_32 sliceAlignInPixel = pixelsPerPipeInterleave < 64 ? 64 : pixelsPerPipeInterleave;
+
+        // numSamples should be 1 in real cases (no MSAA for linear but TGL may pass non 1 value)
+        UINT_64 pixelPerSlice = static_cast<UINT_64>(pitch) * height * numSamples;
+
+        while (pixelPerSlice % sliceAlignInPixel)
+        {
+            pitch += pitchAlign;
+            pixelPerSlice = static_cast<UINT_64>(pitch) * height * numSamples;
+        }
+
+        *pPitch = pitch;
+
+        UINT_32 heightAlign = 1;
+
+        while ((pitch * heightAlign) % sliceAlignInPixel)
+        {
+            heightAlign++;
+        }
+
+        *pHeightAlign = heightAlign;
+
+        sliceSize = BITS_TO_BYTES(pixelPerSlice * bpp);
+    }
+
+    return sliceSize;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPreHandleBaseLvl3xPitch
+*
+*   @brief
+*       Pre-handler of 3x pitch (96 bit) adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPreHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    ADDR_ASSERT(pIn->width == expPitch);
+
+    // From SI, if pow2Pad is 1 the pitch is expanded 3x first, then padded to pow2, so nothing to
+    // do here
+    if (!pIn->flags.pow2Pad)
+    {
+        AddrLib::HwlPreHandleBaseLvl3xPitch(pIn, expPitch);
+    }
+    else
+    {
+        ADDR_ASSERT(IsPow2(expPitch));
+    }
+
+    return expPitch;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPostHandleBaseLvl3xPitch
+*
+*   @brief
+*       Post-handler of 3x pitch adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPostHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    /**
+     * @note The pitch will be divided by 3 in the end so the value will look odd but h/w should
+     *  be able to compute a correct pitch from it as h/w address library is doing the job.
+     */
+    // From SI, the pitch is expanded 3x first, then padded to pow2, so no special handler here
+    if (!pIn->flags.pow2Pad)
+    {
+        AddrLib::HwlPostHandleBaseLvl3xPitch(pIn, expPitch);
+    }
+
+    return expPitch;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPitchAlignmentMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface pitch alignment
+*
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPitchAlignmentMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples         ///< [in] number of samples
+    ) const
+{
+    UINT_32 pitchAlign;
+
+    if (flags.qbStereo)
+    {
+        pitchAlign = EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled(tileMode,bpp,flags,numSamples);
+    }
+    else
+    {
+        pitchAlign = 8;
+    }
+
+    return pitchAlign;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetSizeAdjustmentMicroTiled
+*
+*   @brief
+*       Adjust 1D tiled surface pitch and slice size
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlGetSizeAdjustmentMicroTiled(
+    UINT_32             thickness,      ///< [in] thickness
+    UINT_32             bpp,            ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] surface flags
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch
+    UINT_32*            pHeight         ///< [in/out] pointer to height
+    ) const
+{
+    UINT_64 logicalSliceSize;
+    UINT_64 physicalSliceSize;
+
+    UINT_32 pitch   = *pPitch;
+    UINT_32 height  = *pHeight;
+
+    // Logical slice: pitch * height * bpp * numSamples (no 1D MSAA so actually numSamples == 1)
+    logicalSliceSize = BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * bpp * numSamples);
+
+    // Physical slice: multiplied by thickness
+    physicalSliceSize =  logicalSliceSize * thickness;
+
+    // Pitch alignment is always 8, so if slice size is not padded to base alignment
+    // (pipe_interleave_size), we need to increase pitch
+    while ((physicalSliceSize % baseAlign) != 0)
+    {
+        pitch += pitchAlign;
+
+        logicalSliceSize = BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * bpp * numSamples);
+
+        physicalSliceSize =  logicalSliceSize * thickness;
+    }
+
+#if !ALT_TEST
+    //
+    // Special workaround for depth/stencil buffer, use 8 bpp to align depth buffer again since
+    // the stencil plane may have larger pitch if the slice size is smaller than base alignment.
+    //
+    // Note: this actually does not work for mipmap but mipmap depth texture is not really
+    // sampled with mipmap.
+    //
+    if (flags.depth && !flags.noStencil)
+    {
+        ADDR_ASSERT(numSamples == 1);
+
+        UINT_64 logicalSiceSizeStencil = static_cast<UINT_64>(pitch) * height; // 1 byte stencil
+
+        while ((logicalSiceSizeStencil % baseAlign) != 0)
+        {
+            pitch += pitchAlign; // Stencil plane's pitch alignment is the same as depth plane's
+
+            logicalSiceSizeStencil = static_cast<UINT_64>(pitch) * height;
+        }
+
+        if (pitch != *pPitch)
+        {
+            // If this is a mipmap, this padded one cannot be sampled as a whole mipmap!
+            logicalSliceSize = logicalSiceSizeStencil * BITS_TO_BYTES(bpp);
+        }
+    }
+#endif
+    *pPitch = pitch;
+
+    // No adjust for pHeight
+
+    return logicalSliceSize;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlConvertChipFamily
+*
+*   @brief
+*       Convert familyID defined in atiid.h to AddrChipFamily and set m_chipFamily/m_chipRevision
+*   @return
+*       AddrChipFamily
+***************************************************************************************************
+*/
+AddrChipFamily SIAddrLib::HwlConvertChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family defined in atiih.h
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    AddrChipFamily family = ADDR_CHIP_FAMILY_SI;
+
+    switch (uChipFamily)
+    {
+        case FAMILY_SI:
+            m_settings.isSouthernIsland = 1;
+            m_settings.isTahiti     = ASICREV_IS_TAHITI_P(uChipRevision);
+            m_settings.isPitCairn   = ASICREV_IS_PITCAIRN_PM(uChipRevision);
+            m_settings.isCapeVerde  = ASICREV_IS_CAPEVERDE_M(uChipRevision);
+            m_settings.isOland      = ASICREV_IS_OLAND_M(uChipRevision);
+            m_settings.isHainan     = ASICREV_IS_HAINAN_V(uChipRevision);
+            break;
+        default:
+            ADDR_ASSERT(!"This should be a Fusion");
+            break;
+    }
+
+    return family;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlSetupTileInfo
+*
+*   @brief
+*       Setup default value of tile info for SI
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlSetupTileInfo(
+    AddrTileMode                        tileMode,       ///< [in] Tile mode
+    ADDR_SURFACE_FLAGS                  flags,          ///< [in] Surface type flags
+    UINT_32                             bpp,            ///< [in] Bits per pixel
+    UINT_32                             pitch,          ///< [in] Pitch in pixels
+    UINT_32                             height,         ///< [in] Height in pixels
+    UINT_32                             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*                      pTileInfoIn,    ///< [in] Tile info input: NULL for default
+    ADDR_TILEINFO*                      pTileInfoOut,   ///< [out] Tile info output
+    AddrTileType                        inTileType,     ///< [in] Tile type
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut            ///< [out] Output
+    ) const
+{
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+    ADDR_TILEINFO* pTileInfo = pTileInfoOut;
+    INT index = TileIndexInvalid;
+
+    // Fail-safe code
+    if (!IsLinear(tileMode))
+    {
+        // 128 bpp/thick tiling must be non-displayable.
+        // Fmask reuse color buffer's entry but bank-height field can be from another entry
+        // To simplify the logic, fmask entry should be picked from non-displayable ones
+        if (bpp == 128 || thickness > 1 || flags.fmask || flags.prt)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+
+        if (flags.depth || flags.stencil)
+        {
+            inTileType = ADDR_DEPTH_SAMPLE_ORDER;
+        }
+    }
+
+    // Partial valid fields are not allowed for SI.
+    if (IsTileInfoAllZero(pTileInfo))
+    {
+        if (IsMacroTiled(tileMode))
+        {
+            if (flags.prt)
+            {
+                if (numSamples == 1)
+                {
+                    if (flags.depth)
+                    {
+                        switch (bpp)
+                        {
+                            case 16:
+                                index = 3;
+                                break;
+                            case 32:
+                                index = 6;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                    else
+                    {
+                        switch (bpp)
+                        {
+                            case 8:
+                                index = 21;
+                                break;
+                            case 16:
+                                index = 22;
+                                break;
+                            case 32:
+                                index = 23;
+                                break;
+                            case 64:
+                                index = 24;
+                                break;
+                            case 128:
+                                index = 25;
+                                break;
+                            default:
+                                break;
+                        }
+
+                        if (thickness > 1)
+                        {
+                            ADDR_ASSERT(bpp != 128);
+                            index += 5;
+                        }
+                    }
+                }
+                else
+                {
+                    ADDR_ASSERT(numSamples == 4);
+
+                    if (flags.depth)
+                    {
+                        switch (bpp)
+                        {
+                            case 16:
+                                index = 5;
+                                break;
+                            case 32:
+                                index = 7;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                    else
+                    {
+                        switch (bpp)
+                        {
+                            case 8:
+                                index = 23;
+                                break;
+                            case 16:
+                                index = 24;
+                                break;
+                            case 32:
+                                index = 25;
+                                break;
+                            case 64:
+                                index = 30;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                }
+            }//end of PRT part
+            // See table entries 0-7
+            else if (flags.depth || flags.stencil)
+            {
+                if (flags.compressZ)
+                {
+                    if (flags.stencil)
+                    {
+                        index = 0;
+                    }
+                    else
+                    {
+                        // optimal tile index for compressed depth/stencil.
+                        switch (numSamples)
+                        {
+                            case 1:
+                                index = 0;
+                                break;
+                            case 2:
+                            case 4:
+                                index = 1;
+                                break;
+                            case 8:
+                                index = 2;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                }
+                else // unCompressZ
+                {
+                    index = 3;
+                }
+            }
+            else //non PRT & non Depth & non Stencil
+            {
+                // See table entries 9-12
+                if (inTileType == ADDR_DISPLAYABLE)
+                {
+                    switch (bpp)
+                    {
+                        case 8:
+                            index = 10;
+                            break;
+                        case 16:
+                            index = 11;
+                            break;
+                        case 32:
+                            index = 12;
+                            break;
+                        case 64:
+                            index = 12;
+                            break;
+                        default:
+                            break;
+                    }
+                }
+                else
+                {
+                    // See table entries 13-17
+                    if (thickness == 1)
+                    {
+                        if (flags.fmask)
+                        {
+                            UINT_32 fmaskPixelSize = bpp * numSamples;
+
+                            switch (fmaskPixelSize)
+                            {
+                                case 8:
+                                    index = 14;
+                                    break;
+                                case 16:
+                                    index = 15;
+                                    break;
+                                case 32:
+                                    index = 16;
+                                    break;
+                                case 64:
+                                    index = 17;
+                                    break;
+                                default:
+                                    ADDR_ASSERT_ALWAYS();
+                            }
+                        }
+                        else
+                        {
+                            switch (bpp)
+                            {
+                                case 8:
+                                    index = 14;
+                                    break;
+                                case 16:
+                                    index = 15;
+                                    break;
+                                case 32:
+                                    index = 16;
+                                    break;
+                                case 64:
+                                    index = 17;
+                                    break;
+                                case 128:
+                                    index = 17;
+                                    break;
+                                default:
+                                    break;
+                            }
+                        }
+                    }
+                    else // thick tiling - entries 18-20
+                    {
+                        switch (thickness)
+                        {
+                            case 4:
+                                index = 20;
+                                break;
+                            case 8:
+                                index = 19;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if (tileMode == ADDR_TM_LINEAR_ALIGNED)
+            {
+                index = 8;
+            }
+            else if (tileMode == ADDR_TM_LINEAR_GENERAL)
+            {
+                index = TileIndexLinearGeneral;
+            }
+            else
+            {
+                if (flags.depth || flags.stencil)
+                {
+                    index = 4;
+                }
+                else if (inTileType == ADDR_DISPLAYABLE)
+                {
+                    index = 9;
+                }
+                else if (thickness == 1)
+                {
+                    index = 13;
+                }
+                else
+                {
+                    index = 18;
+                }
+            }
+        }
+
+        if (index >= 0 && index <= 31)
+        {
+            *pTileInfo      = m_tileTable[index].info;
+            pOut->tileType  = m_tileTable[index].type;
+        }
+
+        if (index == TileIndexLinearGeneral)
+        {
+            *pTileInfo      = m_tileTable[8].info;
+            pOut->tileType  = m_tileTable[8].type;
+        }
+    }
+    else
+    {
+        if (pTileInfoIn)
+        {
+            if (flags.stencil && pTileInfoIn->tileSplitBytes == 0)
+            {
+                // Stencil always uses index 0
+                *pTileInfo = m_tileTable[0].info;
+            }
+        }
+        // Pass through tile type
+        pOut->tileType = inTileType;
+    }
+
+    pOut->tileIndex = index;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::DecodeGbRegs
+*
+*   @brief
+*       Decodes GB_ADDR_CONFIG and noOfBanks/noOfRanks
+*
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::DecodeGbRegs(
+    const ADDR_REGISTER_VALUE* pRegValue) ///< [in] create input
+{
+    GB_ADDR_CONFIG  reg;
+    BOOL_32         valid = TRUE;
+
+    reg.val = pRegValue->gbAddrConfig;
+
+    switch (reg.f.pipe_interleave_size)
+    {
+        case ADDR_CONFIG_PIPE_INTERLEAVE_256B:
+            m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_256B;
+            break;
+        case ADDR_CONFIG_PIPE_INTERLEAVE_512B:
+            m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_512B;
+            break;
+        default:
+            valid = FALSE;
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    switch (reg.f.row_size)
+    {
+        case ADDR_CONFIG_1KB_ROW:
+            m_rowSize = ADDR_ROWSIZE_1KB;
+            break;
+        case ADDR_CONFIG_2KB_ROW:
+            m_rowSize = ADDR_ROWSIZE_2KB;
+            break;
+        case ADDR_CONFIG_4KB_ROW:
+            m_rowSize = ADDR_ROWSIZE_4KB;
+            break;
+        default:
+            valid = FALSE;
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    switch (pRegValue->noOfBanks)
+    {
+        case 0:
+            m_banks = 4;
+            break;
+        case 1:
+            m_banks = 8;
+            break;
+        case 2:
+            m_banks = 16;
+            break;
+        default:
+            valid = FALSE;
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    switch (pRegValue->noOfRanks)
+    {
+        case 0:
+            m_ranks = 1;
+            break;
+        case 1:
+            m_ranks = 2;
+            break;
+        default:
+            valid = FALSE;
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    m_logicalBanks = m_banks * m_ranks;
+
+    ADDR_ASSERT(m_logicalBanks <= 16);
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlInitGlobalParams
+*
+*   @brief
+*       Initializes global parameters
+*
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlInitGlobalParams(
+    const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
+{
+    BOOL_32 valid = TRUE;
+    const ADDR_REGISTER_VALUE* pRegValue = &pCreateIn->regValue;
+
+    valid = DecodeGbRegs(pRegValue);
+
+    if (valid)
+    {
+        if (m_settings.isTahiti || m_settings.isPitCairn)
+        {
+            m_pipes = 8;
+        }
+        else if (m_settings.isCapeVerde || m_settings.isOland)
+        {
+            m_pipes = 4;
+        }
+        else
+        {
+            // Hainan is 2-pipe (m_settings.isHainan == 1)
+            m_pipes = 2;
+        }
+
+        valid = InitTileSettingTable(pRegValue->pTileConfig, pRegValue->noOfEntries);
+
+        m_maxSamples = 16;
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlConvertTileInfoToHW
+*   @brief
+*       Entry of si's ConvertTileInfoToHW
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode   = ADDR_OK;
+
+    retCode = EgBasedAddrLib::HwlConvertTileInfoToHW(pIn, pOut);
+
+    if (retCode == ADDR_OK)
+    {
+        if (pIn->reverse == FALSE)
+        {
+            if (pIn->pTileInfo->pipeConfig == ADDR_PIPECFG_INVALID)
+            {
+                retCode = ADDR_INVALIDPARAMS;
+            }
+            else
+            {
+                pOut->pTileInfo->pipeConfig =
+                    static_cast<AddrPipeCfg>(pIn->pTileInfo->pipeConfig - 1);
+            }
+        }
+        else
+        {
+            pOut->pTileInfo->pipeConfig =
+                static_cast<AddrPipeCfg>(pIn->pTileInfo->pipeConfig + 1);
+        }
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskCoordYFrom8Pipe
+*
+*   @brief
+*       Compute the Y coord which will be added to Xmask Y
+*       coord.
+*   @return
+*       Y coord
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlComputeXmaskCoordYFrom8Pipe(
+    UINT_32         pipe,       ///< [in] pipe id
+    UINT_32         x           ///< [in] tile coord x, which is original x coord / 8
+    ) const
+{
+    // This function should never be called since it is 6xx/8xx specfic.
+    // Keep this empty implementation to avoid any mis-use.
+    ADDR_ASSERT_ALWAYS();
+
+    return 0;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeSurfaceCoord2DFromBankPipe
+*
+*   @brief
+*       Compute surface x,y coordinates from bank/pipe info
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeSurfaceCoord2DFromBankPipe(
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32*            pX,         ///< [in/out] x coordinate
+    UINT_32*            pY,         ///< [in/out] y coordinate
+    UINT_32             slice,      ///< [in] slice index
+    UINT_32             bank,       ///< [in] bank number
+    UINT_32             pipe,       ///< [in] pipe number
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             tileSlices, ///< [in] slices in a micro tile
+    BOOL_32             ignoreSE,   ///< [in] TRUE if shader engines are ignored
+    ADDR_TILEINFO*      pTileInfo   ///< [in] bank structure. **All fields to be valid on entry**
+    ) const
+{
+    UINT_32 xBit;
+    UINT_32 yBit;
+    UINT_32 yBit3 = 0;
+    UINT_32 yBit4 = 0;
+    UINT_32 yBit5 = 0;
+    UINT_32 yBit6 = 0;
+
+    UINT_32 xBit3 = 0;
+    UINT_32 xBit4 = 0;
+    UINT_32 xBit5 = 0;
+
+    UINT_32 numPipes = GetPipePerSurf(pTileInfo->pipeConfig);
+
+    CoordFromBankPipe xyBits = {0};
+    ComputeSurfaceCoord2DFromBankPipe(tileMode, *pX, *pY, slice, bank, pipe,
+                                      bankSwizzle, pipeSwizzle, tileSlices, pTileInfo,
+                                      &xyBits);
+    yBit3 = xyBits.yBit3;
+    yBit4 = xyBits.yBit4;
+    yBit5 = xyBits.yBit5;
+    yBit6 = xyBits.yBit6;
+
+    xBit3 = xyBits.xBit3;
+    xBit4 = xyBits.xBit4;
+    xBit5 = xyBits.xBit5;
+
+    yBit = xyBits.yBits;
+
+    UINT_32 yBitTemp = 0;
+
+    if ((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32))
+    {
+        ADDR_ASSERT(pTileInfo->bankWidth == 1 && pTileInfo->macroAspectRatio > 1);
+        UINT_32 yBitToCheck = QLog2(pTileInfo->banks) - 1;
+
+        ADDR_ASSERT(yBitToCheck <= 3);
+
+        yBitTemp = _BIT(yBit, yBitToCheck);
+
+        xBit3 = 0;
+    }
+
+    yBit = Bits2Number(4, yBit6, yBit5, yBit4, yBit3);
+    xBit = Bits2Number(3, xBit5, xBit4, xBit3);
+
+    *pY += yBit * pTileInfo->bankHeight * MicroTileHeight;
+    *pX += xBit * numPipes * pTileInfo->bankWidth * MicroTileWidth;
+
+    //calculate the bank and pipe bits in x, y
+    UINT_32 xTile; //x in micro tile
+    UINT_32 x3 = 0;
+    UINT_32 x4 = 0;
+    UINT_32 x5 = 0;
+    UINT_32 x6 = 0;
+    UINT_32 y = *pY;
+
+    UINT_32 pipeBit0 = _BIT(pipe,0);
+    UINT_32 pipeBit1 = _BIT(pipe,1);
+    UINT_32 pipeBit2 = _BIT(pipe,2);
+
+    UINT_32 y3 = _BIT(y, 3);
+    UINT_32 y4 = _BIT(y, 4);
+    UINT_32 y5 = _BIT(y, 5);
+    UINT_32 y6 = _BIT(y, 6);
+
+    // bankbit0 after ^x4^x5
+    UINT_32 bankBit00 = _BIT(bank,0);
+    UINT_32 bankBit0 = 0;
+
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            x3 = pipeBit0 ^ y3;
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            x4 = pipeBit0 ^ y3;
+            x3 = pipeBit0 ^ y4;
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            x5 = pipeBit1 ^ y5;
+            x3 = pipeBit0 ^ y3 ^ x5;
+            bankBit0 = yBitTemp ^ x5;
+            x4 = bankBit00 ^ x5 ^ bankBit0;
+            *pX += x5 * 4 * 1 * 8; // x5 * num_pipes * bank_width * 8;
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            x3 = pipeBit1 ^ y5;
+            x4 = pipeBit2 ^ y4;
+            x5 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            x3 = pipeBit1 ^ y4;
+            x4 = pipeBit2 ^ y5;
+            x5 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            x3 = pipeBit1 ^ y4;
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit0 ^ y3 ^ x5;
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            x4 = pipeBit2 ^ y5;
+            x5 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit1 ^ y6;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            x6 = pipeBit1 ^ y5;
+            x5 = pipeBit2 ^ y6;
+            x3 = pipeBit0 ^ y3 ^ x5;
+            bankBit0 = yBitTemp ^ x6;
+            x4 = bankBit00 ^ x5 ^ bankBit0;
+            *pX += x6 * 8 * 1 * 8; // x6 * num_pipes * bank_width * 8;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+    }
+
+    xTile = Bits2Number(3, x5, x4, x3);
+
+    *pX += xTile << 3;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPreAdjustBank
+*
+*   @brief
+*       Adjust bank before calculating address acoording to bank/pipe
+*   @return
+*       Adjusted bank
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPreAdjustBank(
+    UINT_32         tileX,      ///< [in] x coordinate in unit of tile
+    UINT_32         bank,       ///< [in] bank
+    ADDR_TILEINFO*  pTileInfo   ///< [in] tile info
+    ) const
+{
+    if (((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32)) && (pTileInfo->bankWidth == 1))
+    {
+        UINT_32 bankBit0 = _BIT(bank, 0);
+        UINT_32 x4 = _BIT(tileX, 1);
+        UINT_32 x5 = _BIT(tileX, 2);
+
+        bankBit0 = bankBit0 ^ x4 ^ x5;
+        bank |= bankBit0;
+
+        ADDR_ASSERT(pTileInfo->macroAspectRatio > 1)
+    }
+
+    return bank;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeSurfaceInfo
+*
+*   @brief
+*       Entry of si's ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    pOut->tileIndex = pIn->tileIndex;
+
+    return EgBasedAddrLib::HwlComputeSurfaceInfo(pIn,pOut);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeMipLevel
+*   @brief
+*       Compute MipLevel info (including level 0)
+*   @return
+*       TRUE if HWL's handled
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn ///< [in/out] Input structure
+    ) const
+{
+    // basePitch is calculated from level 0 so we only check this for mipLevel > 0
+    if (pIn->mipLevel > 0)
+    {
+        // Note: Don't check expand 3x formats(96 bit) as the basePitch is not pow2 even if
+        // we explicity set pow2Pad flag. The 3x base pitch is padded to pow2 but after being
+        // divided by expandX factor (3) - to program texture pitch, the basePitch is never pow2.
+        if (!AddrElemLib::IsExpand3x(pIn->format))
+        {
+            // Sublevel pitches are generated from base level pitch instead of width on SI
+            // If pow2Pad is 0, we don't assert - as this is not really used for a mip chain
+            ADDR_ASSERT(!pIn->flags.pow2Pad || ((pIn->basePitch != 0) && IsPow2(pIn->basePitch)));
+        }
+
+        if (pIn->basePitch != 0)
+        {
+            pIn->width = Max(1u, pIn->basePitch >> pIn->mipLevel);
+        }
+    }
+
+    // pow2Pad is done in PostComputeMipLevel
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlCheckLastMacroTiledLvl
+*
+*   @brief
+*       Sets pOut->last2DLevel to TRUE if it is
+*   @note
+*
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlCheckLastMacroTiledLvl(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut      ///< [in/out] Output structure (used as input, too)
+    ) const
+{
+    // pow2Pad covers all mipmap cases
+    if (pIn->flags.pow2Pad)
+    {
+        ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+
+        UINT_32 nextPitch;
+        UINT_32 nextHeight;
+        UINT_32 nextSlices;
+
+        AddrTileMode nextTileMode;
+
+        if (pIn->mipLevel == 0 || pIn->basePitch == 0)
+        {
+            // Base level or fail-safe case (basePitch == 0)
+            nextPitch = pOut->pitch >> 1;
+        }
+        else
+        {
+            // Sub levels
+            nextPitch = pIn->basePitch >> (pIn->mipLevel + 1);
+        }
+
+        // nextHeight must be shifted from this level's original height rather than a pow2 padded
+        // one but this requires original height stored somewhere (pOut->height)
+        ADDR_ASSERT(pOut->height != 0);
+
+        // next level's height is just current level's >> 1 in pixels
+        nextHeight = pOut->height >> 1;
+        // Special format such as FMT_1 and FMT_32_32_32 can be linear only so we consider block
+        // compressed foramts
+        if (AddrElemLib::IsBlockCompressed(pIn->format))
+        {
+            nextHeight = (nextHeight + 3) / 4;
+        }
+        nextHeight = NextPow2(nextHeight);
+
+        // nextSlices may be 0 if this level's is 1
+        if (pIn->flags.volume)
+        {
+            nextSlices = Max(1u, pIn->numSlices >> 1);
+        }
+        else
+        {
+            nextSlices = pIn->numSlices;
+        }
+
+        nextTileMode = ComputeSurfaceMipLevelTileMode(pIn->tileMode,
+                                                      pIn->bpp,
+                                                      nextPitch,
+                                                      nextHeight,
+                                                      nextSlices,
+                                                      pIn->numSamples,
+                                                      pOut->pitchAlign,
+                                                      pOut->heightAlign,
+                                                      pOut->pTileInfo);
+
+        pOut->last2DLevel = IsMicroTiled(nextTileMode);
+    }
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed
+*
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode SIAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32*            pBytesPerTile   ///< [in/out] pointer to bytes per slice
+    ) const
+{
+    return EgBasedAddrLib::HwlDegradeThickTileMode(baseTileMode, numSlices, pBytesPerTile);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlTileInfoEqual
+*
+*   @brief
+*       Return TRUE if all field are equal
+*   @note
+*       Only takes care of current HWL's data
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlTileInfoEqual(
+    const ADDR_TILEINFO* pLeft, ///<[in] Left compare operand
+    const ADDR_TILEINFO* pRight ///<[in] Right compare operand
+    ) const
+{
+    BOOL_32 equal = FALSE;
+
+    if (pLeft->pipeConfig == pRight->pipeConfig)
+    {
+        equal =  EgBasedAddrLib::HwlTileInfoEqual(pLeft, pRight);
+    }
+
+    return equal;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::GetTileSettings
+*
+*   @brief
+*       Get tile setting infos by index.
+*   @return
+*       Tile setting info.
+***************************************************************************************************
+*/
+const ADDR_TILECONFIG* SIAddrLib::GetTileSetting(
+    UINT_32 index          ///< [in] Tile index
+    ) const
+{
+    ADDR_ASSERT(index < m_noOfEntries);
+    return &m_tileTable[index];
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPostCheckTileIndex
+*
+*   @brief
+*       Map a tile setting to index if curIndex is invalid, otherwise check if curIndex matches
+*       tile mode/type/info and change the index if needed
+*   @return
+*       Tile index.
+***************************************************************************************************
+*/
+INT_32 SIAddrLib::HwlPostCheckTileIndex(
+    const ADDR_TILEINFO* pInfo,     ///< [in] Tile Info
+    AddrTileMode         mode,      ///< [in] Tile mode
+    AddrTileType         type,      ///< [in] Tile type
+    INT                  curIndex   ///< [in] Current index assigned in HwlSetupTileInfo
+    ) const
+{
+    INT_32 index = curIndex;
+
+    if (mode == ADDR_TM_LINEAR_GENERAL)
+    {
+        index = TileIndexLinearGeneral;
+    }
+    else
+    {
+        BOOL_32 macroTiled = IsMacroTiled(mode);
+
+        // We need to find a new index if either of them is true
+        // 1. curIndex is invalid
+        // 2. tile mode is changed
+        // 3. tile info does not match for macro tiled
+        if ((index == TileIndexInvalid         ||
+            (mode != m_tileTable[index].mode)  ||
+            (macroTiled && !HwlTileInfoEqual(pInfo, &m_tileTable[index].info))))
+        {
+            for (index = 0; index < static_cast<INT_32>(m_noOfEntries); index++)
+            {
+                if (macroTiled)
+                {
+                    // macro tile modes need all to match
+                    if (HwlTileInfoEqual(pInfo, &m_tileTable[index].info) &&
+                        (mode == m_tileTable[index].mode)                 &&
+                        (type == m_tileTable[index].type))
+                    {
+                        break;
+                    }
+                }
+                else if (mode == ADDR_TM_LINEAR_ALIGNED)
+                {
+                    // linear mode only needs tile mode to match
+                    if (mode == m_tileTable[index].mode)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    // micro tile modes only need tile mode and tile type to match
+                    if (mode == m_tileTable[index].mode &&
+                        type == m_tileTable[index].type)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    ADDR_ASSERT(index < static_cast<INT_32>(m_noOfEntries));
+
+    if (index >= static_cast<INT_32>(m_noOfEntries))
+    {
+        index = TileIndexInvalid;
+    }
+
+    return index;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting.
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlSetupTileCfg(
+    INT_32          index,          ///< [in] Tile index
+    INT_32          macroModeIndex, ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,          ///< [out] Tile Info
+    AddrTileMode*   pMode,          ///< [out] Tile mode
+    AddrTileType*   pType          ///< [out] Tile type
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    // Global flag to control usage of tileIndex
+    if (UseTileIndex(index))
+    {
+        if (index == TileIndexLinearGeneral)
+        {
+            if (pMode)
+            {
+                *pMode = ADDR_TM_LINEAR_GENERAL;
+            }
+
+            if (pType)
+            {
+                *pType = ADDR_DISPLAYABLE;
+            }
+
+            if (pInfo)
+            {
+                pInfo->banks = 2;
+                pInfo->bankWidth = 1;
+                pInfo->bankHeight = 1;
+                pInfo->macroAspectRatio = 1;
+                pInfo->tileSplitBytes = 64;
+                pInfo->pipeConfig = ADDR_PIPECFG_P2;
+            }
+        }
+        else if (static_cast<UINT_32>(index) >= m_noOfEntries)
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+        else
+        {
+            const ADDR_TILECONFIG* pCfgTable = GetTileSetting(index);
+
+            if (pInfo)
+            {
+                *pInfo = pCfgTable->info;
+            }
+            else
+            {
+                if (IsMacroTiled(pCfgTable->mode))
+                {
+                    returnCode = ADDR_INVALIDPARAMS;
+                }
+            }
+
+            if (pMode)
+            {
+                *pMode = pCfgTable->mode;
+            }
+
+            if (pType)
+            {
+                *pType = pCfgTable->type;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ReadGbTileMode
+*
+*   @brief
+*       Convert GB_TILE_MODE HW value to ADDR_TILE_CONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID SIAddrLib::ReadGbTileMode(
+    UINT_32             regValue,   ///< [in] GB_TILE_MODE register
+    ADDR_TILECONFIG*    pCfg        ///< [out] output structure
+    ) const
+{
+    GB_TILE_MODE gbTileMode;
+    gbTileMode.val = regValue;
+
+    pCfg->type = static_cast<AddrTileType>(gbTileMode.f.micro_tile_mode);
+    pCfg->info.bankHeight = 1 << gbTileMode.f.bank_height;
+    pCfg->info.bankWidth = 1 << gbTileMode.f.bank_width;
+    pCfg->info.banks = 1 << (gbTileMode.f.num_banks + 1);
+    pCfg->info.macroAspectRatio = 1 << gbTileMode.f.macro_tile_aspect;
+    pCfg->info.tileSplitBytes = 64 << gbTileMode.f.tile_split;
+    pCfg->info.pipeConfig = static_cast<AddrPipeCfg>(gbTileMode.f.pipe_config + 1);
+
+    UINT_32 regArrayMode = gbTileMode.f.array_mode;
+
+    pCfg->mode = static_cast<AddrTileMode>(regArrayMode);
+
+    if (regArrayMode == 8) //ARRAY_2D_TILED_XTHICK
+    {
+        pCfg->mode = ADDR_TM_2D_TILED_XTHICK;
+    }
+    else if (regArrayMode >= 14) //ARRAY_3D_TILED_XTHICK
+    {
+        pCfg->mode = static_cast<AddrTileMode>(pCfg->mode + 3);
+    }
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::InitTileSettingTable
+*
+*   @brief
+*       Initialize the ADDR_TILE_CONFIG table.
+*   @return
+*       TRUE if tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::InitTileSettingTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfEntries     ///< [in] Numbe of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfEntries <= TileTableSize);
+
+    memset(m_tileTable, 0, sizeof(m_tileTable));
+
+    if (noOfEntries != 0)
+    {
+        m_noOfEntries = noOfEntries;
+    }
+    else
+    {
+        m_noOfEntries = TileTableSize;
+    }
+
+    if (pCfg) // From Client
+    {
+        for (UINT_32 i = 0; i < m_noOfEntries; i++)
+        {
+            ReadGbTileMode(*(pCfg + i), &m_tileTable[i]);
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        initOk = FALSE;
+    }
+
+    if (initOk)
+    {
+        ADDR_ASSERT(m_tileTable[TILEINDEX_LINEAR_ALIGNED].mode == ADDR_TM_LINEAR_ALIGNED);
+    }
+
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetTileIndex
+*
+*   @brief
+*       Return the virtual/real index for given mode/type/info
+*   @return
+*       ADDR_OK if successful.
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlGetTileIndex(
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    pOut->index = HwlPostCheckTileIndex(pIn->pTileInfo, pIn->tileMode, pIn->tileType);
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlFmaskPreThunkSurfInfo
+*
+*   @brief
+*       Some preparation before thunking a ComputeSurfaceInfo call for Fmask
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlFmaskPreThunkSurfInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pFmaskIn,   ///< [in] Input of fmask info
+    const ADDR_COMPUTE_FMASK_INFO_OUTPUT*   pFmaskOut,  ///< [in] Output of fmask info
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*        pSurfIn,    ///< [out] Input of thunked surface info
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pSurfOut    ///< [out] Output of thunked surface info
+    ) const
+{
+    pSurfIn->tileIndex = pFmaskIn->tileIndex;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlFmaskPostThunkSurfInfo
+*
+*   @brief
+*       Copy hwl extra field after calling thunked ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlFmaskPostThunkSurfInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,   ///< [in] Output of surface info
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut           ///< [out] Output of fmask info
+    ) const
+{
+    pFmaskOut->macroModeIndex = TileIndexInvalid;
+    pFmaskOut->tileIndex = pSurfOut->tileIndex;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeFmaskBits
+*   @brief
+*       Computes fmask bits
+*   @return
+*       Fmask bits
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlComputeFmaskBits(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+    UINT_32* pNumSamples
+    ) const
+{
+    UINT_32 numSamples = pIn->numSamples;
+    UINT_32 numFrags = GetNumFragments(numSamples, pIn->numFrags);
+    UINT_32 bpp;
+
+    if (numFrags != numSamples) // EQAA
+    {
+        ADDR_ASSERT(numFrags <= 8);
+
+        if (!pIn->resolved)
+        {
+            if (numFrags == 1)
+            {
+                bpp          = 1;
+                numSamples   = numSamples == 16 ? 16 : 8;
+            }
+            else if (numFrags == 2)
+            {
+                ADDR_ASSERT(numSamples >= 4);
+
+                bpp          = 2;
+                numSamples   = numSamples;
+            }
+            else if (numFrags == 4)
+            {
+                ADDR_ASSERT(numSamples >= 4);
+
+                bpp          = 4;
+                numSamples   = numSamples;
+            }
+            else // numFrags == 8
+            {
+                ADDR_ASSERT(numSamples == 16);
+
+                bpp          = 4;
+                numSamples   = numSamples;
+            }
+        }
+        else
+        {
+            if (numFrags == 1)
+            {
+                bpp          = (numSamples == 16) ? 16 : 8;
+                numSamples   = 1;
+            }
+            else if (numFrags == 2)
+            {
+                ADDR_ASSERT(numSamples >= 4);
+
+                bpp          = numSamples*2;
+                numSamples   = 1;
+            }
+            else if (numFrags == 4)
+            {
+                ADDR_ASSERT(numSamples >= 4);
+
+                bpp          = numSamples*4;
+                numSamples   = 1;
+            }
+            else // numFrags == 8
+            {
+                ADDR_ASSERT(numSamples >= 16);
+
+                bpp          = 16*4;
+                numSamples   = 1;
+            }
+        }
+    }
+    else // Normal AA
+    {
+        if (!pIn->resolved)
+        {
+            bpp          = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+            numSamples   = numSamples == 2 ? 8 : numSamples;
+        }
+        else
+        {
+            // The same as 8XX
+            bpp          = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+            numSamples   = 1; // 1x sample
+        }
+    }
+
+    SafeAssign(pNumSamples, numSamples);
+
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlOverrideTileMode
+*
+*   @brief
+*       Override tile modes (for PRT only, avoid client passes in an invalid PRT mode for SI.
+*
+*   @return
+*       Suitable tile mode
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlOverrideTileMode(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,       ///< [in] input structure
+    AddrTileMode*                           pTileMode, ///< [in/out] pointer to the tile mode
+    AddrTileType*                           pTileType  ///< [in/out] pointer to the tile type
+    ) const
+{
+    BOOL_32 bOverrided = FALSE;
+    AddrTileMode tileMode = *pTileMode;
+
+    switch (tileMode)
+    {
+        case ADDR_TM_PRT_TILED_THIN1:
+            tileMode    = ADDR_TM_2D_TILED_THIN1;
+            break;
+
+        case ADDR_TM_PRT_TILED_THICK:
+            tileMode    = ADDR_TM_2D_TILED_THICK;
+            break;
+
+        case ADDR_TM_PRT_2D_TILED_THICK:
+            tileMode    = ADDR_TM_2D_TILED_THICK;
+            break;
+
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            tileMode    = ADDR_TM_3D_TILED_THICK;
+            break;
+
+        default:
+            break;
+    }
+
+    if (tileMode != *pTileMode)
+    {
+        *pTileMode = tileMode;
+        bOverrided = TRUE;
+        ADDR_ASSERT(pIn->flags.prt == TRUE);
+    }
+
+    return bOverrided;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h
new file mode 100644
index 00000000000..897beb1bb92
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  siaddrlib.h
+* @brief Contains the R800AddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __SI_ADDR_LIB_H__
+#define __SI_ADDR_LIB_H__
+
+#include "addrlib.h"
+#include "egbaddrlib.h"
+
+/**
+***************************************************************************************************
+* @brief Describes the information in tile mode table
+***************************************************************************************************
+*/
+struct ADDR_TILECONFIG
+{
+    AddrTileMode  mode;
+    AddrTileType  type;
+    ADDR_TILEINFO info;
+};
+
+/**
+***************************************************************************************************
+* @brief SI specific settings structure.
+***************************************************************************************************
+*/
+struct SIChipSettings
+{
+    struct
+    {
+        UINT_32 isSouthernIsland    : 1;
+        UINT_32 isTahiti            : 1;
+        UINT_32 isPitCairn          : 1;
+        UINT_32 isCapeVerde         : 1;
+        /// Oland/Hainan are of GFXIP 6.0, similar with SI
+        UINT_32 isOland             : 1;
+        UINT_32 isHainan            : 1;
+    };
+};
+
+/**
+***************************************************************************************************
+* @brief This class is the SI specific address library
+*        function set.
+***************************************************************************************************
+*/
+class SIAddrLib : public EgBasedAddrLib
+{
+public:
+    /// Creates SIAddrLib object
+    static AddrLib* CreateObj(const AddrClient* pClient)
+    {
+        return new(pClient) SIAddrLib(pClient);
+    }
+
+protected:
+    SIAddrLib(const AddrClient* pClient);
+    virtual ~SIAddrLib();
+
+    // Hwl interface - defined in AddrLib
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    virtual UINT_64 HwlComputeXmaskAddrFromCoord(
+        UINT_32 pitch, UINT_32 height, UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pBitPosition) const;
+
+    virtual VOID HwlComputeXmaskCoordFromAddr(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pX, UINT_32* pY, UINT_32* pSlice) const;
+
+    virtual ADDR_E_RETURNCODE HwlGetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT*      pOut) const;
+
+    virtual BOOL_32 HwlComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    virtual AddrChipFamily HwlConvertChipFamily(
+        UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn);
+
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex,
+        ADDR_TILEINFO* pInfo, AddrTileMode* pMode = 0, AddrTileType* pType = 0) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_64 HwlComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* pSliceBytes, UINT_32 baseAlign) const;
+
+    virtual UINT_32 ComputePipeFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice,
+        AddrTileMode tileMode, UINT_32 pipeSwizzle, BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_32 HwlGetPipes(const ADDR_TILEINFO* pTileInfo) const;
+
+    /// Pre-handler of 3x pitch (96 bit) adjustment
+    virtual UINT_32 HwlPreHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Post-handler of 3x pitch adjustment
+    virtual UINT_32 HwlPostHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+
+    /// Dummy function to finalize the inheritance
+    virtual UINT_32 HwlComputeXmaskCoordYFrom8Pipe(
+        UINT_32 pipe, UINT_32 x) const;
+
+    // Sub-hwl interface - defined in EgBasedAddrLib
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual UINT_32 HwlGetPitchAlignmentMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentMicroTiled(
+        UINT_32 thickness, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight) const;
+
+    virtual VOID HwlCheckLastMacroTiledLvl(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual BOOL_32 HwlTileInfoEqual(
+        const ADDR_TILEINFO* pLeft, const ADDR_TILEINFO* pRight) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const;
+
+    virtual BOOL_32 HwlSanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const
+    {
+        return TRUE;
+    }
+
+    virtual UINT_32 HwlGetPitchAlignmentLinear(UINT_32 bpp, ADDR_SURFACE_FLAGS flags) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentLinear(
+        AddrTileMode tileMode,
+        UINT_32 bpp, UINT_32 numSamples, UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight, UINT_32 *pHeightAlign) const;
+
+    virtual VOID HwlComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32* pX, UINT_32* pY, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_32 HwlPreAdjustBank(
+        UINT_32 tileX, UINT_32 bank, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const;
+
+    virtual VOID   HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const;
+
+    virtual VOID   HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const;
+
+    virtual UINT_32 HwlComputeFmaskBits(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        UINT_32* pNumSamples) const;
+
+    virtual BOOL_32 HwlReduceBankWidthHeight(
+        UINT_32 tileSize, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 bankHeightAlign, UINT_32 pipes,
+        ADDR_TILEINFO* pTileInfo) const
+    {
+        return TRUE;
+    }
+
+    // Protected non-virtual functions
+    VOID ComputeTileCoordFromPipeAndElemIdx(
+        UINT_32 elemIdx, UINT_32 pipe, AddrPipeCfg pipeCfg, UINT_32 pitchInMacroTile,
+        UINT_32 x, UINT_32 y, UINT_32* pX, UINT_32* pY) const;
+
+    UINT_32 TileCoordToMaskElementIndex(
+        UINT_32 tx, UINT_32 ty, AddrPipeCfg  pipeConfig,
+        UINT_32 *macroShift, UINT_32 *elemIdxBits) const;
+
+    BOOL_32 DecodeGbRegs(
+        const ADDR_REGISTER_VALUE* pRegValue);
+
+    const ADDR_TILECONFIG* GetTileSetting(
+        UINT_32 index) const;
+
+    static const UINT_32    TileTableSize = 32;
+    ADDR_TILECONFIG         m_tileTable[TileTableSize];
+    UINT_32                 m_noOfEntries;
+
+private:
+
+    UINT_32 GetPipePerSurf(AddrPipeCfg pipeConfig) const;
+
+    VOID ReadGbTileMode(
+        UINT_32 regValue, ADDR_TILECONFIG* pCfg) const;
+    BOOL_32 InitTileSettingTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    SIChipSettings          m_settings;
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 1b9158302ce..50c42e3599a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -486,10 +486,11 @@ static void amdgpu_bo_set_tiling(struct pb_buffer *_buf,
                                  struct radeon_winsys_cs *rcs,
                                  enum radeon_bo_layout microtiled,
                                  enum radeon_bo_layout macrotiled,
+                                 unsigned pipe_config,
                                  unsigned bankw, unsigned bankh,
                                  unsigned tile_split,
                                  unsigned stencil_tile_split,
-                                 unsigned mtilea,
+                                 unsigned mtilea, unsigned num_banks,
                                  uint32_t pitch,
                                  bool scanout)
 {
@@ -504,11 +505,13 @@ static void amdgpu_bo_set_tiling(struct pb_buffer *_buf,
    else
       tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
 
+   tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config);
    tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw));
    tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh));
    if (tile_split)
       tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split));
    tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea));
+   tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1);
 
    if (scanout)
       tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
new file mode 100644
index 00000000000..08a1591496e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+ * This file is included by addrlib. It adds GPU family definitions and
+ * macros compatible with addrlib.
+ */
+
+#ifndef AMDGPU_ID_H
+#define AMDGPU_ID_H
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_LITTLE_ENDIAN)
+#define LITTLEENDIAN_CPU
+#elif defined(PIPE_ARCH_BIG_ENDIAN)
+#define BIGENDIAN_CPU
+#endif
+
+enum {
+	FAMILY_UNKNOWN,
+	FAMILY_SI,
+	FAMILY_CI,
+	FAMILY_KV,
+	FAMILY_VI,
+	FAMILY_CZ,
+	FAMILY_PI,
+	FAMILY_LAST,
+};
+
+/* SI specific rev IDs */
+enum {
+	SI_TAHITI_P_A11      = 1,
+	SI_TAHITI_P_A0       = SI_TAHITI_P_A11,      /*A0 is alias of A11*/
+	SI_TAHITI_P_A21      = 5,
+	SI_TAHITI_P_B0       = SI_TAHITI_P_A21,      /*B0 is alias of A21*/
+	SI_TAHITI_P_A22      = 6,
+	SI_TAHITI_P_B1       = SI_TAHITI_P_A22,      /*B1 is alias of A22*/
+
+	SI_PITCAIRN_PM_A11   = 20,
+	SI_PITCAIRN_PM_A0    = SI_PITCAIRN_PM_A11,   /*A0 is alias of A11*/
+	SI_PITCAIRN_PM_A12   = 21,
+	SI_PITCAIRN_PM_A1    = SI_PITCAIRN_PM_A12,   /*A1 is alias of A12*/
+
+	SI_CAPEVERDE_M_A11   = 40,
+	SI_CAPEVERDE_M_A0    = SI_CAPEVERDE_M_A11,   /*A0 is alias of A11*/
+	SI_CAPEVERDE_M_A12   = 41,
+	SI_CAPEVERDE_M_A1    = SI_CAPEVERDE_M_A12,   /*A1 is alias of A12*/
+
+	SI_OLAND_M_A0        = 60,
+
+	SI_HAINAN_V_A0       = 70,
+
+	SI_UNKNOWN           = 0xFF
+};
+
+
+#define ASICREV_IS_TAHITI_P(eChipRev)	\
+	(eChipRev < SI_PITCAIRN_PM_A11)
+#define ASICREV_IS_PITCAIRN_PM(eChipRev)	\
+	((eChipRev >= SI_PITCAIRN_PM_A11) && (eChipRev < SI_CAPEVERDE_M_A11))
+#define ASICREV_IS_CAPEVERDE_M(eChipRev)	\
+	((eChipRev >= SI_CAPEVERDE_M_A11) && (eChipRev < SI_OLAND_M_A0))
+#define ASICREV_IS_OLAND_M(eChipRev)	\
+	((eChipRev >= SI_OLAND_M_A0) && (eChipRev < SI_HAINAN_V_A0))
+#define ASICREV_IS_HAINAN_V(eChipRev)	\
+(eChipRev >= SI_HAINAN_V_A0)
+
+/* CI specific revIDs */
+enum {
+	CI_BONAIRE_M_A0 = 20,
+	CI_BONAIRE_M_A1 = 21,
+
+	CI_HAWAII_P_A0  = 40,
+
+	CI_UNKNOWN      = 0xFF
+};
+
+#define ASICREV_IS_BONAIRE_M(eChipRev)	\
+	((eChipRev >= CI_BONAIRE_M_A0) && (eChipRev < CI_HAWAII_P_A0))
+#define ASICREV_IS_HAWAII_P(eChipRev)	\
+	(eChipRev >= CI_HAWAII_P_A0)
+
+/* KV specific rev IDs */
+enum {
+	KV_SPECTRE_A0      = 0x01,       /* KV1 with Spectre GFX core, 8-8-1-2 (CU-Pix-Primitive-RB) */
+	KV_SPOOKY_A0       = 0x41,       /* KV2 with Spooky GFX core, including downgraded from Spectre core, 3-4-1-1 (CU-Pix-Primitive-RB) */
+	KB_KALINDI_A0      = 0x81,       /* KB with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	KB_KALINDI_A1      = 0x82,       /* KB with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	BV_KALINDI_A2      = 0x85,       /* BV with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	ML_GODAVARI_A0     = 0xa1,      /* ML with Godavari GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	ML_GODAVARI_A1     = 0xa2,      /* ML with Godavari GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	KV_UNKNOWN = 0xFF
+};
+
+#define ASICREV_IS_SPECTRE(eChipRev)	\
+	((eChipRev >= KV_SPECTRE_A0) && (eChipRev < KV_SPOOKY_A0))         /* identify all versions of SPRECTRE and supported features set */
+#define ASICREV_IS_SPOOKY(eChipRev)	\
+	((eChipRev >= KV_SPOOKY_A0) && (eChipRev < KB_KALINDI_A0))          /* identify all versions of SPOOKY and supported features set */
+#define ASICREV_IS_KALINDI(eChipRev)	\
+	((eChipRev >= KB_KALINDI_A0) && (eChipRev < KV_UNKNOWN))           /* identify all versions of KALINDI and supported features set */
+
+/* Following macros are subset of ASICREV_IS_KALINDI macro */
+#define ASICREV_IS_KALINDI_BHAVANI(eChipRev)	\
+	((eChipRev >= BV_KALINDI_A2) && (eChipRev < ML_GODAVARI_A0))   /* identify all versions of BHAVANI and supported features set */
+#define ASICREV_IS_KALINDI_GODAVARI(eChipRev)	\
+	((eChipRev >= ML_GODAVARI_A0) && (eChipRev < KV_UNKNOWN)) /* identify all versions of GODAVARI and supported features set */
+
+/* VI specific rev IDs */
+enum {
+	VI_ICELAND_M_A0   = 1,
+
+	VI_TONGA_P_A0     = 20,
+	VI_TONGA_P_A1     = 21,
+
+	VI_UNKNOWN        = 0xFF
+};
+
+
+#define ASICREV_IS_ICELAND_M(eChipRev)	\
+	(eChipRev < VI_TONGA_P_A0)
+#define ASICREV_IS_TONGA_P(eChipRev)	\
+	(eChipRev >= VI_TONGA_P_A0)
+
+/* CZ specific rev IDs */
+enum {
+	CZ_CARRIZO_A0      = 0x01,
+	CZ_UNKNOWN      = 0xFF
+};
+
+#define ASICREV_IS_CARRIZO(eChipRev) \
+	(eChipRev >= CARRIZO_A0)
+
+#endif /* AMDGPU_ID_H */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
new file mode 100644
index 00000000000..358df381011
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright © 2011 Red Hat All Rights Reserved.
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/* Contact:
+ *     Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_winsys.h"
+
+#ifndef NO_ENTRIES
+#define NO_ENTRIES 32
+#endif
+
+#ifndef NO_MACRO_ENTRIES
+#define NO_MACRO_ENTRIES 16
+#endif
+
+#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
+#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
+#endif
+
+
+static int amdgpu_surface_sanity(const struct radeon_surf *surf)
+{
+   unsigned type = RADEON_SURF_GET(surf->flags, TYPE);
+
+   if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX))
+      return -EINVAL;
+
+   /* all dimension must be at least 1 ! */
+   if (!surf->npix_x || !surf->npix_y || !surf->npix_z ||
+       !surf->array_size)
+      return -EINVAL;
+
+   if (!surf->blk_w || !surf->blk_h || !surf->blk_d)
+      return -EINVAL;
+
+   switch (surf->nsamples) {
+   case 1:
+   case 2:
+   case 4:
+   case 8:
+      break;
+   default:
+      return -EINVAL;
+   }
+
+   switch (type) {
+   case RADEON_SURF_TYPE_1D:
+      if (surf->npix_y > 1)
+         return -EINVAL;
+      /* fall through */
+   case RADEON_SURF_TYPE_2D:
+   case RADEON_SURF_TYPE_CUBEMAP:
+      if (surf->npix_z > 1 || surf->array_size > 1)
+         return -EINVAL;
+      break;
+   case RADEON_SURF_TYPE_3D:
+      if (surf->array_size > 1)
+         return -EINVAL;
+      break;
+   case RADEON_SURF_TYPE_1D_ARRAY:
+      if (surf->npix_y > 1)
+         return -EINVAL;
+      /* fall through */
+   case RADEON_SURF_TYPE_2D_ARRAY:
+      if (surf->npix_z > 1)
+         return -EINVAL;
+      break;
+   default:
+      return -EINVAL;
+   }
+   return 0;
+}
+
+static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
+{
+   return malloc(pInput->sizeInBytes);
+}
+
+static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
+{
+   free(pInput->pVirtAddr);
+   return ADDR_OK;
+}
+
+/**
+ * This returns the number of banks for the surface.
+ * Possible values: 2, 4, 8, 16.
+ */
+static uint32_t cik_num_banks(struct amdgpu_winsys *ws,
+                              struct radeon_surf *surf)
+{
+   unsigned index, tileb;
+
+   tileb = 8 * 8 * surf->bpe;
+   tileb = MIN2(surf->tile_split, tileb);
+
+   for (index = 0; tileb > 64; index++) {
+      tileb >>= 1;
+   }
+   assert(index < 16);
+
+   return 2 << ((ws->amdinfo.gb_macro_tile_mode[index] >> 6) & 0x3);
+}
+
+ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
+{
+   ADDR_CREATE_INPUT addrCreateInput = {0};
+   ADDR_CREATE_OUTPUT addrCreateOutput = {0};
+   ADDR_REGISTER_VALUE regValue = {0};
+   ADDR_CREATE_FLAGS createFlags = {{0}};
+   ADDR_E_RETURNCODE addrRet;
+
+   addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
+   addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
+
+   regValue.noOfBanks = ws->amdinfo.mc_arb_ramcfg & 0x3;
+   regValue.gbAddrConfig = ws->amdinfo.gb_addr_cfg;
+   regValue.noOfRanks = (ws->amdinfo.mc_arb_ramcfg & 0x4) >> 2;
+
+   regValue.backendDisables = ws->amdinfo.backend_disable[0];
+   regValue.pTileConfig = ws->amdinfo.gb_tile_mode;
+   regValue.noOfEntries = sizeof(ws->amdinfo.gb_tile_mode) /
+                          sizeof(ws->amdinfo.gb_tile_mode[0]);
+   regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
+   regValue.noOfMacroEntries = sizeof(ws->amdinfo.gb_macro_tile_mode) /
+                               sizeof(ws->amdinfo.gb_macro_tile_mode[0]);
+
+   createFlags.value = 0;
+   createFlags.useTileIndex = 1;
+   createFlags.degradeBaseLevel = 1;
+
+   addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
+   addrCreateInput.chipFamily = ws->family;
+   addrCreateInput.chipRevision = ws->rev_id;
+   addrCreateInput.createFlags = createFlags;
+   addrCreateInput.callbacks.allocSysMem = allocSysMem;
+   addrCreateInput.callbacks.freeSysMem = freeSysMem;
+   addrCreateInput.callbacks.debugPrint = 0;
+   addrCreateInput.regValue = regValue;
+
+   addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
+   if (addrRet != ADDR_OK)
+      return NULL;
+
+   return addrCreateOutput.hLib;
+}
+
+static int compute_level(struct amdgpu_winsys *ws,
+                         struct radeon_surf *surf, bool is_stencil,
+                         unsigned level, unsigned type, bool compressed,
+                         ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
+                         ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut)
+{
+   struct radeon_surf_level *surf_level;
+   ADDR_E_RETURNCODE ret;
+
+   AddrSurfInfoIn->mipLevel = level;
+   AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
+   AddrSurfInfoIn->height = u_minify(surf->npix_y, level);
+
+   if (type == RADEON_SURF_TYPE_3D)
+      AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
+   else if (type == RADEON_SURF_TYPE_CUBEMAP)
+      AddrSurfInfoIn->numSlices = 6;
+   else
+      AddrSurfInfoIn->numSlices = surf->array_size;
+
+   if (level > 0) {
+      /* Set the base level pitch. This is needed for calculation
+       * of non-zero levels. */
+      if (is_stencil)
+         AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x;
+      else
+         AddrSurfInfoIn->basePitch = surf->level[0].nblk_x;
+
+      /* Convert blocks to pixels for compressed formats. */
+      if (compressed)
+         AddrSurfInfoIn->basePitch *= surf->blk_w;
+   }
+
+   ret = AddrComputeSurfaceInfo(ws->addrlib,
+                                AddrSurfInfoIn,
+                                AddrSurfInfoOut);
+   if (ret != ADDR_OK) {
+      return ret;
+   }
+
+   surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level];
+   surf_level->offset = align(surf->bo_size, AddrSurfInfoOut->baseAlign);
+   surf_level->slice_size = AddrSurfInfoOut->sliceSize;
+   surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe);
+   surf_level->npix_x = u_minify(surf->npix_x, level);
+   surf_level->npix_y = u_minify(surf->npix_y, level);
+   surf_level->npix_z = u_minify(surf->npix_z, level);
+   surf_level->nblk_x = AddrSurfInfoOut->pitch;
+   surf_level->nblk_y = AddrSurfInfoOut->height;
+   if (type == RADEON_SURF_TYPE_3D)
+      surf_level->nblk_z = AddrSurfInfoOut->depth;
+   else
+      surf_level->nblk_z = 1;
+
+   switch (AddrSurfInfoOut->tileMode) {
+   case ADDR_TM_LINEAR_GENERAL:
+      surf_level->mode = RADEON_SURF_MODE_LINEAR;
+      break;
+   case ADDR_TM_LINEAR_ALIGNED:
+      surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+      break;
+   case ADDR_TM_1D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_1D;
+      break;
+   case ADDR_TM_2D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_2D;
+      break;
+   default:
+      assert(0);
+   }
+
+   if (is_stencil)
+      surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
+   else
+      surf->tiling_index[level] = AddrSurfInfoOut->tileIndex;
+
+   surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize;
+   return 0;
+}
+
+static int amdgpu_surface_init(struct radeon_winsys *rws,
+                               struct radeon_surf *surf)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+   unsigned level, mode, type;
+   bool compressed;
+   ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+   ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
+   ADDR_TILEINFO AddrTileInfoIn = {0};
+   ADDR_TILEINFO AddrTileInfoOut = {0};
+   int r;
+
+   r = amdgpu_surface_sanity(surf);
+   if (r)
+      return r;
+
+   AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
+   AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
+   AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
+
+   type = RADEON_SURF_GET(surf->flags, TYPE);
+   mode = RADEON_SURF_GET(surf->flags, MODE);
+   compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+   /* MSAA and FMASK require 2D tiling. */
+   if (surf->nsamples > 1 ||
+       (surf->flags & RADEON_SURF_FMASK))
+      mode = RADEON_SURF_MODE_2D;
+
+   /* DB doesn't support linear layouts. */
+   if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) &&
+       mode < RADEON_SURF_MODE_1D)
+      mode = RADEON_SURF_MODE_1D;
+
+   /* Set the requested tiling mode. */
+   switch (mode) {
+   case RADEON_SURF_MODE_LINEAR:
+      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_GENERAL;
+      break;
+   case RADEON_SURF_MODE_LINEAR_ALIGNED:
+      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
+      break;
+   case RADEON_SURF_MODE_1D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
+      break;
+   case RADEON_SURF_MODE_2D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
+      break;
+   default:
+      assert(0);
+   }
+
+   /* The format must be set correctly for the allocation of compressed
+    * textures to work. In other cases, setting the bpp is sufficient. */
+   if (compressed) {
+      switch (surf->bpe) {
+      case 8:
+         AddrSurfInfoIn.format = ADDR_FMT_BC1;
+         break;
+      case 16:
+         AddrSurfInfoIn.format = ADDR_FMT_BC3;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   else {
+      AddrSurfInfoIn.bpp = surf->bpe * 8;
+   }
+
+   AddrSurfInfoIn.numSamples = surf->nsamples;
+   AddrSurfInfoIn.tileIndex = -1;
+
+   /* Set the micro tile type. */
+   if (surf->flags & RADEON_SURF_SCANOUT)
+      AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
+   else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
+      AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
+   else
+      AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
+
+   AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+   AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+   AddrSurfInfoIn.flags.stencil = (surf->flags & RADEON_SURF_SBUFFER) != 0;
+   AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
+   AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
+   AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
+   AddrSurfInfoIn.flags.degrade4Space = 1;
+
+   /* This disables incorrect calculations (hacks) in addrlib. */
+   AddrSurfInfoIn.flags.noStencil = 1;
+
+   /* Set preferred macrotile parameters. This is usually required
+    * for shared resources. This is for 2D tiling only. */
+   if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
+       surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) {
+      /* If any of these parameters are incorrect, the calculation
+       * will fail. */
+      AddrTileInfoIn.banks = cik_num_banks(ws, surf);
+      AddrTileInfoIn.bankWidth = surf->bankw;
+      AddrTileInfoIn.bankHeight = surf->bankh;
+      AddrTileInfoIn.macroAspectRatio = surf->mtilea;
+      AddrTileInfoIn.tileSplitBytes = surf->tile_split;
+      AddrSurfInfoIn.flags.degrade4Space = 0;
+      AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
+
+      /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
+       * the tile index, because we are expected to know it if
+       * we know the other parameters.
+       *
+       * This is something that can easily be fixed in Addrlib.
+       * For now, just figure it out here.
+       * Note that only 2D_TILE_THIN1 is handled here.
+       */
+      assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+      assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
+
+      if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
+         AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
+      else
+         AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
+   }
+
+   surf->bo_size = 0;
+
+   /* Calculate texture layout information. */
+   for (level = 0; level <= surf->last_level; level++) {
+      r = compute_level(ws, surf, false, level, type, compressed,
+                        &AddrSurfInfoIn, &AddrSurfInfoOut);
+      if (r)
+         return r;
+
+      if (level == 0) {
+         surf->bo_alignment = AddrSurfInfoOut.baseAlign;
+         surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
+
+         /* For 2D modes only. */
+         if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+            surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth;
+            surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight;
+            surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio;
+            surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+            surf->num_banks = AddrSurfInfoOut.pTileInfo->banks;
+         }
+      }
+   }
+
+   /* Calculate texture layout information for stencil. */
+   if (surf->flags & RADEON_SURF_SBUFFER) {
+      AddrSurfInfoIn.bpp = 8;
+      /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
+      AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
+
+      for (level = 0; level <= surf->last_level; level++) {
+         r = compute_level(ws, surf, true, level, type, compressed,
+                           &AddrSurfInfoIn, &AddrSurfInfoOut);
+         if (r)
+            return r;
+
+         if (level == 0) {
+            surf->stencil_offset = surf->stencil_level[0].offset;
+
+            /* For 2D modes only. */
+            if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+               surf->stencil_tile_split =
+                     AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+            }
+         }
+      }
+   }
+
+   return 0;
+}
+
+static int amdgpu_surface_best(struct radeon_winsys *rws,
+                               struct radeon_surf *surf)
+{
+   return 0;
+}
+
+void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)
+{
+   ws->base.surface_init = amdgpu_surface_init;
+   ws->base.surface_best = amdgpu_surface_best;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index d517762f7a2..9020b1cefc7 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -39,6 +39,7 @@
 #include <xf86drm.h>
 #include <stdio.h>
 #include <sys/stat.h>
+#include "amdgpu_id.h"
 
 #define CIK_TILE_MODE_COLOR_2D			14
 
@@ -192,6 +193,51 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
       goto fail;
    }
 
+   /* family and rev_id are for addrlib */
+   switch (ws->info.family) {
+   case CHIP_BONAIRE:
+      ws->family = FAMILY_CI;
+      ws->rev_id = CI_BONAIRE_M_A0;
+      break;
+   case CHIP_KAVERI:
+      ws->family = FAMILY_KV;
+      ws->rev_id = KV_SPECTRE_A0;
+      break;
+   case CHIP_KABINI:
+      ws->family = FAMILY_KV;
+      ws->rev_id = KB_KALINDI_A0;
+      break;
+   case CHIP_HAWAII:
+      ws->family = FAMILY_CI;
+      ws->rev_id = CI_HAWAII_P_A0;
+      break;
+   case CHIP_MULLINS:
+      ws->family = FAMILY_KV;
+      ws->rev_id = ML_GODAVARI_A0;
+      break;
+   case CHIP_TONGA:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_TONGA_P_A0;
+      break;
+   case CHIP_ICELAND:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_ICELAND_M_A0;
+      break;
+   case CHIP_CARRIZO:
+      ws->family = FAMILY_CZ;
+      ws->rev_id = CZ_CARRIZO_A0;
+      break;
+   default:
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      goto fail;
+   }
+
+   ws->addrlib = amdgpu_addr_create(ws);
+   if (!ws->addrlib) {
+      fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
+      goto fail;
+   }
+
    /* Set hardware information. */
    ws->info.gart_size = gtt.heap_size;
    ws->info.vram_size = vram.heap_size;
@@ -226,6 +272,8 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
    return TRUE;
 
 fail:
+   if (ws->addrlib)
+      AddrDestroy(ws->addrlib);
    amdgpu_device_deinitialize(ws->dev);
    ws->dev = NULL;
    return FALSE;
@@ -239,6 +287,7 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
 
    ws->cman->destroy(ws->cman);
    ws->kman->destroy(ws->kman);
+   AddrDestroy(ws->addrlib);
 
    amdgpu_device_deinitialize(ws->dev);
    FREE(rws);
@@ -413,6 +462,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
 
    amdgpu_bomgr_init_functions(ws);
    amdgpu_cs_init_functions(ws);
+   amdgpu_surface_init_functions(ws);
 
    pipe_mutex_init(ws->bo_fence_lock);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 68c896814f2..4d07644c9ef 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -33,6 +33,7 @@
 #define AMDGPU_WINSYS_H
 
 #include "gallium/drivers/radeon/radeon_winsys.h"
+#include "addrlib/addrinterface.h"
 #include "os/os_thread.h"
 #include <amdgpu.h>
 
@@ -60,6 +61,9 @@ struct amdgpu_winsys {
    struct pb_manager *cman;
 
    struct amdgpu_gpu_info amdinfo;
+   ADDR_HANDLE addrlib;
+   uint32_t rev_id;
+   unsigned family;
 };
 
 static inline struct amdgpu_winsys *
@@ -68,4 +72,7 @@ amdgpu_winsys(struct radeon_winsys *base)
    return (struct amdgpu_winsys*)base;
 }
 
+void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
+ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
+
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index ecaeb3bc29c..3a9ac445b24 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -774,10 +774,11 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf,
                                  struct radeon_winsys_cs *rcs,
                                  enum radeon_bo_layout microtiled,
                                  enum radeon_bo_layout macrotiled,
+                                 unsigned pipe_config,
                                  unsigned bankw, unsigned bankh,
                                  unsigned tile_split,
                                  unsigned stencil_tile_split,
-                                 unsigned mtilea,
+                                 unsigned mtilea, unsigned num_banks,
                                  uint32_t pitch,
                                  bool scanout)
 {

From 8ba70e0a7405005c079eb72f94999245c992aa91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 20:35:27 +0200
Subject: [PATCH 1168/1208] radeonsi: fix DRM version checks for amdgpu DRM
 3.0.0

---
 src/gallium/drivers/radeon/r600_buffer_common.c | 6 ++++--
 src/gallium/drivers/radeon/r600_pipe_common.c   | 4 +++-
 src/gallium/drivers/radeon/r600_texture.c       | 8 +++++---
 src/gallium/drivers/radeonsi/si_pipe.c          | 4 +++-
 src/gallium/drivers/radeonsi/si_state.c         | 8 ++++----
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 0f788b7e23c..cb9809f2449 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -121,7 +121,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		/* Older kernels didn't always flush the HDP cache before
 		 * CS execution
 		 */
-		if (rscreen->info.drm_minor < 40) {
+		if (rscreen->info.drm_major == 2 &&
+		    rscreen->info.drm_minor < 40) {
 			res->domains = RADEON_DOMAIN_GTT;
 			flags |= RADEON_FLAG_GTT_WC;
 			break;
@@ -147,7 +148,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		 * Write-combined CPU mappings are fine, the kernel ensures all CPU
 		 * writes finish before the GPU executes a command stream.
 		 */
-		if (rscreen->info.drm_minor < 40)
+		if (rscreen->info.drm_major == 2 &&
+		    rscreen->info.drm_minor < 40)
 			res->domains = RADEON_DOMAIN_GTT;
 		else if (res->domains & RADEON_DOMAIN_VRAM)
 			flags |= RADEON_FLAG_CPU_ACCESS;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 9e68e5f80ac..51c72cc882f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -928,7 +928,9 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	pipe_mutex_init(rscreen->aux_context_lock);
 	pipe_mutex_init(rscreen->gpu_load_mutex);
 
-	if (rscreen->info.drm_minor >= 28 && (rscreen->debug_flags & DBG_TRACE_CS)) {
+	if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) ||
+	     rscreen->info.drm_major == 3) &&
+	    (rscreen->debug_flags & DBG_TRACE_CS)) {
 		rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b,
 										PIPE_BIND_CUSTOM,
 										PIPE_USAGE_STAGING,
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index a4c7034cb37..54696910e43 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -490,7 +490,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	unsigned num_pipes = rscreen->tiling_info.num_channels;
 
 	if (rscreen->chip_class <= EVERGREEN &&
-	    rscreen->info.drm_minor < 26)
+	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26)
 		return 0;
 
 	/* HW bug on R6xx. */
@@ -502,7 +502,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	/* HTILE is broken with 1D tiling on old kernels and CIK. */
 	if (rscreen->chip_class >= CIK &&
 	    rtex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
-	    rscreen->info.drm_minor < 38)
+	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
 		return 0;
 
 	switch (num_pipes) {
@@ -1261,7 +1261,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 
 		/* fast color clear with 1D tiling doesn't work on old kernels and CIK */
 		if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
-		    rctx->chip_class >= CIK && rctx->screen->info.drm_minor < 38) {
+		    rctx->chip_class >= CIK &&
+		    rctx->screen->info.drm_major == 2 &&
+		    rctx->screen->info.drm_minor < 38) {
 			continue;
 		}
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 22efc6837eb..9b5cdd8dc12 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -271,7 +271,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 		/* 2D tiling on CIK is supported since DRM 2.35.0 */
 		return sscreen->b.chip_class < CIK ||
-		       sscreen->b.info.drm_minor >= 35;
+		       (sscreen->b.info.drm_major == 2 &&
+			sscreen->b.info.drm_minor >= 35) ||
+		       sscreen->b.info.drm_major == 3;
 
         case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
                 return R600_MAP_BUFFER_ALIGNMENT;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b9f512d17b6..227d975e380 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1178,7 +1178,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 				       int first_non_void)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	bool enable_compressed_formats = sscreen->b.info.drm_minor >= 31;
+	bool enable_compressed_formats = (sscreen->b.info.drm_major == 2 &&
+					  sscreen->b.info.drm_minor >= 31) ||
+					 sscreen->b.info.drm_major == 3;
 	boolean uniform = TRUE;
 	int i;
 
@@ -1626,7 +1628,6 @@ boolean si_is_format_supported(struct pipe_screen *screen,
                                unsigned sample_count,
                                unsigned usage)
 {
-	struct si_screen *sscreen = (struct si_screen *)screen;
 	unsigned retval = 0;
 
 	if (target >= PIPE_MAX_TEXTURE_TYPES) {
@@ -1638,8 +1639,7 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 		return FALSE;
 
 	if (sample_count > 1) {
-		/* 2D tiling on CIK is supported since DRM 2.35.0 */
-		if (sscreen->b.chip_class >= CIK && sscreen->b.info.drm_minor < 35)
+		if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
 			return FALSE;
 
 		switch (sample_count) {

From 8f49f6ed19ba4ee6a26c77786dcbc151c6615d48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 20:12:24 +0200
Subject: [PATCH 1169/1208] radeonsi: add VI register definitions

---
 src/gallium/drivers/radeon/r600d_common.h |    2 +-
 src/gallium/drivers/radeonsi/si_state.c   |   12 +-
 src/gallium/drivers/radeonsi/sid.h        | 1090 +++++++++++++++++++--
 3 files changed, 1042 insertions(+), 62 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index 5a56a5454be..115042d153e 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -180,7 +180,7 @@
 #define   S_028804_INTERPOLATE_SRC_Z(x)			(((x) & 0x1) << 19)
 #define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) & 0x1) << 20)
 #define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) & 0x1) << 21)
-#define   S_028804_OVERRASTERIZATION_AMOUNT(x)		(((x) & 0x7) << 24)
+#define   S_028804_OVERRASTERIZATION_AMOUNT(x)		(((x) & 0x07) << 24)
 #define   S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)	(((x) & 0x1) << 27)
 #define CM_R_028BDC_PA_SC_LINE_CNTL                  0x28bdc
 #define   S_028BDC_EXPAND_LINE_WIDTH(x)                (((x) & 0x1) << 9)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 227d975e380..51ade5248a4 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -554,12 +554,12 @@ static void si_set_viewport_states(struct pipe_context *ctx,
 		pm4 = &viewport->pm4;
 
 		viewport->viewport = state[idx];
-		si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0 + offset, fui(state[idx].scale[0]));
-		si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0 + offset, fui(state[idx].translate[0]));
-		si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0 + offset, fui(state[idx].scale[1]));
-		si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0 + offset, fui(state[idx].translate[1]));
-		si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0 + offset, fui(state[idx].scale[2]));
-		si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0 + offset, fui(state[idx].translate[2]));
+		si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE + offset, fui(state[idx].scale[0]));
+		si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET + offset, fui(state[idx].translate[0]));
+		si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE + offset, fui(state[idx].scale[1]));
+		si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET + offset, fui(state[idx].translate[1]));
+		si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE + offset, fui(state[idx].scale[2]));
+		si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET + offset, fui(state[idx].translate[2]));
 
 		si_pm4_set_state(sctx, viewport[i], viewport);
 	}
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 35d5ee232a0..921d5335554 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -206,6 +206,80 @@
  * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
  */
 
+
+#define R_008010_GRBM_STATUS                                            0x008010
+#define   S_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) & 0x0F) << 0)
+#define   G_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) >> 0) & 0x0F)
+#define   C_008010_ME0PIPE0_CMDFIFO_AVAIL                             0xFFFFFFF0
+#define   S_008010_SRBM_RQ_PENDING(x)                                 (((x) & 0x1) << 5)
+#define   G_008010_SRBM_RQ_PENDING(x)                                 (((x) >> 5) & 0x1)
+#define   C_008010_SRBM_RQ_PENDING                                    0xFFFFFFDF
+#define   S_008010_ME0PIPE0_CF_RQ_PENDING(x)                          (((x) & 0x1) << 7)
+#define   G_008010_ME0PIPE0_CF_RQ_PENDING(x)                          (((x) >> 7) & 0x1)
+#define   C_008010_ME0PIPE0_CF_RQ_PENDING                             0xFFFFFF7F
+#define   S_008010_ME0PIPE0_PF_RQ_PENDING(x)                          (((x) & 0x1) << 8)
+#define   G_008010_ME0PIPE0_PF_RQ_PENDING(x)                          (((x) >> 8) & 0x1)
+#define   C_008010_ME0PIPE0_PF_RQ_PENDING                             0xFFFFFEFF
+#define   S_008010_GDS_DMA_RQ_PENDING(x)                              (((x) & 0x1) << 9)
+#define   G_008010_GDS_DMA_RQ_PENDING(x)                              (((x) >> 9) & 0x1)
+#define   C_008010_GDS_DMA_RQ_PENDING                                 0xFFFFFDFF
+#define   S_008010_DB_CLEAN(x)                                        (((x) & 0x1) << 12)
+#define   G_008010_DB_CLEAN(x)                                        (((x) >> 12) & 0x1)
+#define   C_008010_DB_CLEAN                                           0xFFFFEFFF
+#define   S_008010_CB_CLEAN(x)                                        (((x) & 0x1) << 13)
+#define   G_008010_CB_CLEAN(x)                                        (((x) >> 13) & 0x1)
+#define   C_008010_CB_CLEAN                                           0xFFFFDFFF
+#define   S_008010_TA_BUSY(x)                                         (((x) & 0x1) << 14)
+#define   G_008010_TA_BUSY(x)                                         (((x) >> 14) & 0x1)
+#define   C_008010_TA_BUSY                                            0xFFFFBFFF
+#define   S_008010_GDS_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_008010_GDS_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_008010_GDS_BUSY                                           0xFFFF7FFF
+#define   S_008010_WD_BUSY_NO_DMA(x)                                  (((x) & 0x1) << 16)
+#define   G_008010_WD_BUSY_NO_DMA(x)                                  (((x) >> 16) & 0x1)
+#define   C_008010_WD_BUSY_NO_DMA                                     0xFFFEFFFF
+#define   S_008010_VGT_BUSY(x)                                        (((x) & 0x1) << 17)
+#define   G_008010_VGT_BUSY(x)                                        (((x) >> 17) & 0x1)
+#define   C_008010_VGT_BUSY                                           0xFFFDFFFF
+#define   S_008010_IA_BUSY_NO_DMA(x)                                  (((x) & 0x1) << 18)
+#define   G_008010_IA_BUSY_NO_DMA(x)                                  (((x) >> 18) & 0x1)
+#define   C_008010_IA_BUSY_NO_DMA                                     0xFFFBFFFF
+#define   S_008010_IA_BUSY(x)                                         (((x) & 0x1) << 19)
+#define   G_008010_IA_BUSY(x)                                         (((x) >> 19) & 0x1)
+#define   C_008010_IA_BUSY                                            0xFFF7FFFF
+#define   S_008010_SX_BUSY(x)                                         (((x) & 0x1) << 20)
+#define   G_008010_SX_BUSY(x)                                         (((x) >> 20) & 0x1)
+#define   C_008010_SX_BUSY                                            0xFFEFFFFF
+#define   S_008010_WD_BUSY(x)                                         (((x) & 0x1) << 21)
+#define   G_008010_WD_BUSY(x)                                         (((x) >> 21) & 0x1)
+#define   C_008010_WD_BUSY                                            0xFFDFFFFF
+#define   S_008010_SPI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008010_SPI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008010_SPI_BUSY                                           0xFFBFFFFF
+#define   S_008010_BCI_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008010_BCI_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008010_BCI_BUSY                                           0xFF7FFFFF
+#define   S_008010_SC_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008010_SC_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008010_SC_BUSY                                            0xFEFFFFFF
+#define   S_008010_PA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008010_PA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008010_PA_BUSY                                            0xFDFFFFFF
+#define   S_008010_DB_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008010_DB_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008010_DB_BUSY                                            0xFBFFFFFF
+#define   S_008010_CP_COHERENCY_BUSY(x)                               (((x) & 0x1) << 28)
+#define   G_008010_CP_COHERENCY_BUSY(x)                               (((x) >> 28) & 0x1)
+#define   C_008010_CP_COHERENCY_BUSY                                  0xEFFFFFFF
+#define   S_008010_CP_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008010_CP_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008010_CP_BUSY                                            0xDFFFFFFF
+#define   S_008010_CB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008010_CB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008010_CB_BUSY                                            0xBFFFFFFF
+#define   S_008010_GUI_ACTIVE(x)                                      (((x) & 0x1) << 31)
+#define   G_008010_GUI_ACTIVE(x)                                      (((x) >> 31) & 0x1)
+#define   C_008010_GUI_ACTIVE                                         0x7FFFFFFF
 #define GRBM_GFX_INDEX                                                  0x802C
 #define         INSTANCE_INDEX(x)                                     ((x) << 0)
 #define         SH_INDEX(x)                                           ((x) << 8)
@@ -278,10 +352,18 @@
 #define R_0085F8_CP_COHER_BASE                                          0x0085F8
 
 /* CIK */
+#define R_0300FC_CP_STRMOUT_CNTL                                        0x0300FC
+#define   S_0300FC_OFFSET_UPDATE_DONE(x)                              (((x) & 0x1) << 0)
+#define   G_0300FC_OFFSET_UPDATE_DONE(x)                              (((x) >> 0) & 0x1)
+#define   C_0300FC_OFFSET_UPDATE_DONE                                 0xFFFFFFFE
 #define R_0301E4_CP_COHER_BASE_HI                                       0x0301E4
 #define   S_0301E4_COHER_BASE_HI_256B(x)                              (((x) & 0xFF) << 0)
 #define   G_0301E4_COHER_BASE_HI_256B(x)                              (((x) >> 0) & 0xFF)
 #define   C_0301E4_COHER_BASE_HI_256B                                 0xFFFFFF00
+#define R_0301EC_CP_COHER_START_DELAY                                   0x0301EC
+#define   S_0301EC_START_DELAY_COUNT(x)                               (((x) & 0x3F) << 0)
+#define   G_0301EC_START_DELAY_COUNT(x)                               (((x) >> 0) & 0x3F)
+#define   C_0301EC_START_DELAY_COUNT                                  0xFFFFFFC0
 #define R_0301F0_CP_COHER_CNTL                                          0x0301F0
 #define   S_0301F0_DEST_BASE_0_ENA(x)                                 (((x) & 0x1) << 0)
 #define   G_0301F0_DEST_BASE_0_ENA(x)                                 (((x) >> 0) & 0x1)
@@ -289,6 +371,14 @@
 #define   S_0301F0_DEST_BASE_1_ENA(x)                                 (((x) & 0x1) << 1)
 #define   G_0301F0_DEST_BASE_1_ENA(x)                                 (((x) >> 1) & 0x1)
 #define   C_0301F0_DEST_BASE_1_ENA                                    0xFFFFFFFD
+/* VI */
+#define   S_0301F0_TC_SD_ACTION_ENA(x)                                (((x) & 0x1) << 2)
+#define   G_0301F0_TC_SD_ACTION_ENA(x)                                (((x) >> 2) & 0x1)
+#define   C_0301F0_TC_SD_ACTION_ENA                                   0xFFFFFFFB
+#define   S_0301F0_TC_NC_ACTION_ENA(x)                                (((x) & 0x1) << 3)
+#define   G_0301F0_TC_NC_ACTION_ENA(x)                                (((x) >> 3) & 0x1)
+#define   C_0301F0_TC_NC_ACTION_ENA                                   0xFFFFFFF7
+/*    */
 #define   S_0301F0_CB0_DEST_BASE_ENA(x)                               (((x) & 0x1) << 6)
 #define   G_0301F0_CB0_DEST_BASE_ENA(x)                               (((x) >> 6) & 0x1)
 #define   C_0301F0_CB0_DEST_BASE_ENA                                  0xFFFFFFBF
@@ -319,7 +409,7 @@
 #define   S_0301F0_TCL1_VOL_ACTION_ENA(x)                             (((x) & 0x1) << 15)
 #define   G_0301F0_TCL1_VOL_ACTION_ENA(x)                             (((x) >> 15) & 0x1)
 #define   C_0301F0_TCL1_VOL_ACTION_ENA                                0xFFFF7FFF
-#define   S_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) & 0x1) << 16)
+#define   S_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) & 0x1) << 16) /* not on VI */
 #define   G_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) >> 16) & 0x1)
 #define   C_0301F0_TC_VOL_ACTION_ENA                                  0xFFFEFFFF
 #define   S_0301F0_TC_WB_ACTION_ENA(x)                                (((x) & 0x1) << 18)
@@ -352,8 +442,29 @@
 #define   S_0301F0_SH_ICACHE_ACTION_ENA(x)                            (((x) & 0x1) << 29)
 #define   G_0301F0_SH_ICACHE_ACTION_ENA(x)                            (((x) >> 29) & 0x1)
 #define   C_0301F0_SH_ICACHE_ACTION_ENA                               0xDFFFFFFF
+/* VI */
+#define   S_0301F0_SH_KCACHE_WB_ACTION_ENA(x)                         (((x) & 0x1) << 30)
+#define   G_0301F0_SH_KCACHE_WB_ACTION_ENA(x)                         (((x) >> 30) & 0x1)
+#define   C_0301F0_SH_KCACHE_WB_ACTION_ENA                            0xBFFFFFFF
+#define   S_0301F0_SH_SD_ACTION_ENA(x)                                (((x) & 0x1) << 31)
+#define   G_0301F0_SH_SD_ACTION_ENA(x)                                (((x) >> 31) & 0x1)
+#define   C_0301F0_SH_SD_ACTION_ENA                                   0x7FFFFFFF
+/*    */
 #define R_0301F4_CP_COHER_SIZE                                          0x0301F4
 #define R_0301F8_CP_COHER_BASE                                          0x0301F8
+#define R_0301FC_CP_COHER_STATUS                                        0x0301FC
+#define   S_0301FC_MATCHING_GFX_CNTX(x)                               (((x) & 0xFF) << 0)
+#define   G_0301FC_MATCHING_GFX_CNTX(x)                               (((x) >> 0) & 0xFF)
+#define   C_0301FC_MATCHING_GFX_CNTX                                  0xFFFFFF00
+#define   S_0301FC_MEID(x)                                            (((x) & 0x03) << 24)
+#define   G_0301FC_MEID(x)                                            (((x) >> 24) & 0x03)
+#define   C_0301FC_MEID                                               0xFCFFFFFF
+#define   S_0301FC_PHASE1_STATUS(x)                                   (((x) & 0x1) << 30)
+#define   G_0301FC_PHASE1_STATUS(x)                                   (((x) >> 30) & 0x1)
+#define   C_0301FC_PHASE1_STATUS                                      0xBFFFFFFF
+#define   S_0301FC_STATUS(x)                                          (((x) & 0x1) << 31)
+#define   G_0301FC_STATUS(x)                                          (((x) >> 31) & 0x1)
+#define   C_0301FC_STATUS                                             0x7FFFFFFF
 #define R_030230_CP_COHER_SIZE_HI                                       0x030230
 #define   S_030230_COHER_SIZE_HI_256B(x)                              (((x) & 0xFF) << 0)
 #define   G_030230_COHER_SIZE_HI_256B(x)                              (((x) >> 0) & 0xFF)
@@ -375,10 +486,6 @@
 #define   C_0088C4_ES_LIMIT                                           0xFFE0FFFF
 #define R_0088C8_VGT_ESGS_RING_SIZE                                     0x0088C8
 #define R_0088CC_VGT_GSVS_RING_SIZE                                     0x0088CC
-/* CIK */
-#define R_030900_VGT_ESGS_RING_SIZE                                     0x030900
-#define R_030904_VGT_GSVS_RING_SIZE                                     0x030904
-/*     */
 #define R_0088D4_VGT_GS_VERTEX_REUSE                                    0x0088D4
 #define   S_0088D4_VERT_REUSE(x)                                      (((x) & 0x1F) << 0)
 #define   G_0088D4_VERT_REUSE(x)                                      (((x) >> 0) & 0x1F)
@@ -462,6 +569,27 @@
 #define   G_008B10_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
 #define   C_008B10_CURRENT_COUNT                                      0xFFFF00FF
 /* CIK */
+#define R_030800_GRBM_GFX_INDEX                                         0x030800
+#define   S_030800_INSTANCE_INDEX(x)                                  (((x) & 0xFF) << 0)
+#define   G_030800_INSTANCE_INDEX(x)                                  (((x) >> 0) & 0xFF)
+#define   C_030800_INSTANCE_INDEX                                     0xFFFFFF00
+#define   S_030800_SH_INDEX(x)                                        (((x) & 0xFF) << 8)
+#define   G_030800_SH_INDEX(x)                                        (((x) >> 8) & 0xFF)
+#define   C_030800_SH_INDEX                                           0xFFFF00FF
+#define   S_030800_SE_INDEX(x)                                        (((x) & 0xFF) << 16)
+#define   G_030800_SE_INDEX(x)                                        (((x) >> 16) & 0xFF)
+#define   C_030800_SE_INDEX                                           0xFF00FFFF
+#define   S_030800_SH_BROADCAST_WRITES(x)                             (((x) & 0x1) << 29)
+#define   G_030800_SH_BROADCAST_WRITES(x)                             (((x) >> 29) & 0x1)
+#define   C_030800_SH_BROADCAST_WRITES                                0xDFFFFFFF
+#define   S_030800_INSTANCE_BROADCAST_WRITES(x)                       (((x) & 0x1) << 30)
+#define   G_030800_INSTANCE_BROADCAST_WRITES(x)                       (((x) >> 30) & 0x1)
+#define   C_030800_INSTANCE_BROADCAST_WRITES                          0xBFFFFFFF
+#define   S_030800_SE_BROADCAST_WRITES(x)                             (((x) & 0x1) << 31)
+#define   G_030800_SE_BROADCAST_WRITES(x)                             (((x) >> 31) & 0x1)
+#define   C_030800_SE_BROADCAST_WRITES                                0x7FFFFFFF
+#define R_030900_VGT_ESGS_RING_SIZE                                     0x030900
+#define R_030904_VGT_GSVS_RING_SIZE                                     0x030904
 #define R_030908_VGT_PRIMITIVE_TYPE                                     0x030908
 #define   S_030908_PRIM_TYPE(x)                                       (((x) & 0x3F) << 0)
 #define   G_030908_PRIM_TYPE(x)                                       (((x) >> 0) & 0x3F)
@@ -530,6 +658,34 @@
 #define   S_030A04_CURRENT_COUNT(x)                                   (((x) & 0xFF) << 8)
 #define   G_030A04_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
 #define   C_030A04_CURRENT_COUNT                                      0xFFFF00FF
+#define R_030A10_PA_SC_SCREEN_EXTENT_MIN_0                              0x030A10
+#define   S_030A10_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A10_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A10_X                                                  0xFFFF0000
+#define   S_030A10_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A10_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A10_Y                                                  0x0000FFFF
+#define R_030A14_PA_SC_SCREEN_EXTENT_MAX_0                              0x030A14
+#define   S_030A14_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A14_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A14_X                                                  0xFFFF0000
+#define   S_030A14_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A14_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A14_Y                                                  0x0000FFFF
+#define R_030A18_PA_SC_SCREEN_EXTENT_MIN_1                              0x030A18
+#define   S_030A18_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A18_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A18_X                                                  0xFFFF0000
+#define   S_030A18_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A18_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A18_Y                                                  0x0000FFFF
+#define R_030A2C_PA_SC_SCREEN_EXTENT_MAX_1                              0x030A2C
+#define   S_030A2C_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A2C_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A2C_X                                                  0xFFFF0000
+#define   S_030A2C_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A2C_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A2C_Y                                                  0x0000FFFF
 /*     */
 #define R_008BF0_PA_SC_ENHANCE                                          0x008BF0
 #define   S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x)                       (((x) & 0x1) << 0)
@@ -608,6 +764,32 @@
 #define     V_008DFC_SQ_VGPR                                        0x00
 /*     */
 #define R_008DFC_SQ_INST                                                0x008DFC
+#define R_030D20_SQC_CACHES                                             0x030D20
+#define   S_030D20_TARGET_INST(x)                                     (((x) & 0x1) << 0)
+#define   G_030D20_TARGET_INST(x)                                     (((x) >> 0) & 0x1)
+#define   C_030D20_TARGET_INST                                        0xFFFFFFFE
+#define   S_030D20_TARGET_DATA(x)                                     (((x) & 0x1) << 1)
+#define   G_030D20_TARGET_DATA(x)                                     (((x) >> 1) & 0x1)
+#define   C_030D20_TARGET_DATA                                        0xFFFFFFFD
+#define   S_030D20_INVALIDATE(x)                                      (((x) & 0x1) << 2)
+#define   G_030D20_INVALIDATE(x)                                      (((x) >> 2) & 0x1)
+#define   C_030D20_INVALIDATE                                         0xFFFFFFFB
+#define   S_030D20_WRITEBACK(x)                                       (((x) & 0x1) << 3)
+#define   G_030D20_WRITEBACK(x)                                       (((x) >> 3) & 0x1)
+#define   C_030D20_WRITEBACK                                          0xFFFFFFF7
+#define   S_030D20_VOL(x)                                             (((x) & 0x1) << 4)
+#define   G_030D20_VOL(x)                                             (((x) >> 4) & 0x1)
+#define   C_030D20_VOL                                                0xFFFFFFEF
+#define   S_030D20_COMPLETE(x)                                        (((x) & 0x1) << 16)
+#define   G_030D20_COMPLETE(x)                                        (((x) >> 16) & 0x1)
+#define   C_030D20_COMPLETE                                           0xFFFEFFFF
+#define R_030D24_SQC_WRITEBACK                                          0x030D24
+#define   S_030D24_DWB(x)                                             (((x) & 0x1) << 0)
+#define   G_030D24_DWB(x)                                             (((x) >> 0) & 0x1)
+#define   C_030D24_DWB                                                0xFFFFFFFE
+#define   S_030D24_DIRTY(x)                                           (((x) & 0x1) << 1)
+#define   G_030D24_DIRTY(x)                                           (((x) >> 1) & 0x1)
+#define   C_030D24_DIRTY                                              0xFFFFFFFD
 #define R_008DFC_SQ_VOP1                                                0x008DFC
 #define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
 #define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
@@ -3740,7 +3922,17 @@
 #define   C_008DFC_ENCODING                                           0x03FFFFFF
 #define     V_008DFC_SQ_ENC_MUBUF_FIELD                             0x38
 #endif
+#define R_030E00_TA_CS_BC_BASE_ADDR                                     0x030E00
+#define R_030E04_TA_CS_BC_BASE_ADDR_HI                                  0x030E04
+#define   S_030E04_ADDRESS(x)                                         (((x) & 0xFF) << 0)
+#define   G_030E04_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
+#define   C_030E04_ADDRESS                                            0xFFFFFF00
+#define R_030F00_DB_OCCLUSION_COUNT0_LOW                                0x030F00
 #define R_008F00_SQ_BUF_RSRC_WORD0                                      0x008F00
+#define R_030F04_DB_OCCLUSION_COUNT0_HI                                 0x030F04
+#define   S_030F04_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F04_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F04_COUNT_HI                                           0x80000000
 #define R_008F04_SQ_BUF_RSRC_WORD1                                      0x008F04
 #define   S_008F04_BASE_ADDRESS_HI(x)                                 (((x) & 0xFFFF) << 0)
 #define   G_008F04_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFFFF)
@@ -3754,7 +3946,12 @@
 #define   S_008F04_SWIZZLE_ENABLE(x)                                  (((x) & 0x1) << 31)
 #define   G_008F04_SWIZZLE_ENABLE(x)                                  (((x) >> 31) & 0x1)
 #define   C_008F04_SWIZZLE_ENABLE                                     0x7FFFFFFF
+#define R_030F08_DB_OCCLUSION_COUNT1_LOW                                0x030F08
 #define R_008F08_SQ_BUF_RSRC_WORD2                                      0x008F08
+#define R_030F0C_DB_OCCLUSION_COUNT1_HI                                 0x030F0C
+#define   S_030F0C_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F0C_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F0C_COUNT_HI                                           0x80000000
 #define R_008F0C_SQ_BUF_RSRC_WORD3                                      0x008F0C
 #define   S_008F0C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
 #define   G_008F0C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
@@ -3862,7 +4059,12 @@
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_1                             0x01
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_2                             0x02
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_3                             0x03
+#define R_030F10_DB_OCCLUSION_COUNT2_LOW                                0x030F10
 #define R_008F10_SQ_IMG_RSRC_WORD0                                      0x008F10
+#define R_030F14_DB_OCCLUSION_COUNT2_HI                                 0x030F14
+#define   S_030F14_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F14_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F14_COUNT_HI                                           0x80000000
 #define R_008F14_SQ_IMG_RSRC_WORD1                                      0x008F14
 #define   S_008F14_BASE_ADDRESS_HI(x)                                 (((x) & 0xFF) << 0)
 #define   G_008F14_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFF)
@@ -3961,6 +4163,7 @@
 #define   G_008F14_MTYPE(x)                                           (((x) >> 30) & 0x03)
 #define   C_008F14_MTYPE                                              0x3FFFFFFF
 /*     */
+#define R_030F18_DB_OCCLUSION_COUNT3_LOW                                0x030F18
 #define R_008F18_SQ_IMG_RSRC_WORD2                                      0x008F18
 #define   S_008F18_WIDTH(x)                                           (((x) & 0x3FFF) << 0)
 #define   G_008F18_WIDTH(x)                                           (((x) >> 0) & 0x3FFF)
@@ -3974,6 +4177,10 @@
 #define   S_008F18_INTERLACED(x)                                      (((x) & 0x1) << 31)
 #define   G_008F18_INTERLACED(x)                                      (((x) >> 31) & 0x1)
 #define   C_008F18_INTERLACED                                         0x7FFFFFFF
+#define R_030F1C_DB_OCCLUSION_COUNT3_HI                                 0x030F1C
+#define   S_030F1C_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F1C_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F1C_COUNT_HI                                           0x80000000
 #define R_008F1C_SQ_IMG_RSRC_WORD3                                      0x008F1C
 #define   S_008F1C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
 #define   G_008F1C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
@@ -4084,6 +4291,23 @@
 #define   G_008F28_LOD_HDW_CNT_EN(x)                                  (((x) >> 20) & 0x1)
 #define   C_008F28_LOD_HDW_CNT_EN                                     0xFFEFFFFF
 /*     */
+/* VI */
+#define   S_008F28_COMPRESSION_EN(x)                                  (((x) & 0x1) << 21)
+#define   G_008F28_COMPRESSION_EN(x)                                  (((x) >> 21) & 0x1)
+#define   C_008F28_COMPRESSION_EN                                     0xFFDFFFFF
+#define   S_008F28_ALPHA_IS_ON_MSB(x)                                 (((x) & 0x1) << 22)
+#define   G_008F28_ALPHA_IS_ON_MSB(x)                                 (((x) >> 22) & 0x1)
+#define   C_008F28_ALPHA_IS_ON_MSB                                    0xFFBFFFFF
+#define   S_008F28_COLOR_TRANSFORM(x)                                 (((x) & 0x1) << 23)
+#define   G_008F28_COLOR_TRANSFORM(x)                                 (((x) >> 23) & 0x1)
+#define   C_008F28_COLOR_TRANSFORM                                    0xFF7FFFFF
+#define   S_008F28_LOST_ALPHA_BITS(x)                                 (((x) & 0x0F) << 24)
+#define   G_008F28_LOST_ALPHA_BITS(x)                                 (((x) >> 24) & 0x0F)
+#define   C_008F28_LOST_ALPHA_BITS                                    0xF0FFFFFF
+#define   S_008F28_LOST_COLOR_BITS(x)                                 (((x) & 0x0F) << 28)
+#define   G_008F28_LOST_COLOR_BITS(x)                                 (((x) >> 28) & 0x0F)
+#define   C_008F28_LOST_COLOR_BITS                                    0x0FFFFFFF
+/*    */
 #define R_008F2C_SQ_IMG_RSRC_WORD7                                      0x008F2C
 #define R_008F30_SQ_IMG_SAMP_WORD0                                      0x008F30
 #define   S_008F30_CLAMP_X(x)                                         (((x) & 0x07) << 0)
@@ -4148,6 +4372,11 @@
 #define   S_008F30_FILTER_MODE(x)                                     (((x) & 0x03) << 29)
 #define   G_008F30_FILTER_MODE(x)                                     (((x) >> 29) & 0x03)
 #define   C_008F30_FILTER_MODE                                        0x9FFFFFFF
+/* VI */
+#define   S_008F30_COMPAT_MODE(x)                                     (((x) & 0x1) << 31)
+#define   G_008F30_COMPAT_MODE(x)                                     (((x) >> 31) & 0x1)
+#define   C_008F30_COMPAT_MODE                                        0x7FFFFFFF
+/*    */
 #define R_008F34_SQ_IMG_SAMP_WORD1                                      0x008F34
 #define   S_008F34_MIN_LOD(x)                                         (((x) & 0xFFF) << 0)
 #define   G_008F34_MIN_LOD(x)                                         (((x) >> 0) & 0xFFF)
@@ -4313,6 +4542,11 @@
 #define   G_008F44_OFFSET(x)                                          (((x) >> 0) & 0xFFFFFF)
 #define   C_008F44_OFFSET                                             0xFF000000
 /*     */
+#define R_030FF8_DB_ZPASS_COUNT_LOW                                     0x030FF8
+#define R_030FFC_DB_ZPASS_COUNT_HI                                      0x030FFC
+#define   S_030FFC_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030FFC_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030FFC_COUNT_HI                                           0x80000000
 #define R_009100_SPI_CONFIG_CNTL                                        0x009100
 #define   S_009100_GPR_WRITE_PRIORITY(x)                              (((x) & 0x1FFFFF) << 0)
 #define   G_009100_GPR_WRITE_PRIORITY(x)                              (((x) >> 0) & 0x1FFFFF)
@@ -4437,6 +4671,34 @@
 #define   S_009858_MSAA16_Y(x)                                        (((x) & 0x03) << 18)
 #define   G_009858_MSAA16_Y(x)                                        (((x) >> 18) & 0x03)
 #define   C_009858_MSAA16_Y                                           0xFFF3FFFF
+#define R_0098F8_GB_ADDR_CONFIG                                         0x0098F8
+#define   S_0098F8_NUM_PIPES(x)                                       (((x) & 0x07) << 0)
+#define   G_0098F8_NUM_PIPES(x)                                       (((x) >> 0) & 0x07)
+#define   C_0098F8_NUM_PIPES                                          0xFFFFFFF8
+#define   S_0098F8_PIPE_INTERLEAVE_SIZE(x)                            (((x) & 0x07) << 4)
+#define   G_0098F8_PIPE_INTERLEAVE_SIZE(x)                            (((x) >> 4) & 0x07)
+#define   C_0098F8_PIPE_INTERLEAVE_SIZE                               0xFFFFFF8F
+#define   S_0098F8_BANK_INTERLEAVE_SIZE(x)                            (((x) & 0x07) << 8)
+#define   G_0098F8_BANK_INTERLEAVE_SIZE(x)                            (((x) >> 8) & 0x07)
+#define   C_0098F8_BANK_INTERLEAVE_SIZE                               0xFFFFF8FF
+#define   S_0098F8_NUM_SHADER_ENGINES(x)                              (((x) & 0x03) << 12)
+#define   G_0098F8_NUM_SHADER_ENGINES(x)                              (((x) >> 12) & 0x03)
+#define   C_0098F8_NUM_SHADER_ENGINES                                 0xFFFFCFFF
+#define   S_0098F8_SHADER_ENGINE_TILE_SIZE(x)                         (((x) & 0x07) << 16)
+#define   G_0098F8_SHADER_ENGINE_TILE_SIZE(x)                         (((x) >> 16) & 0x07)
+#define   C_0098F8_SHADER_ENGINE_TILE_SIZE                            0xFFF8FFFF
+#define   S_0098F8_NUM_GPUS(x)                                        (((x) & 0x07) << 20)
+#define   G_0098F8_NUM_GPUS(x)                                        (((x) >> 20) & 0x07)
+#define   C_0098F8_NUM_GPUS                                           0xFF8FFFFF
+#define   S_0098F8_MULTI_GPU_TILE_SIZE(x)                             (((x) & 0x03) << 24)
+#define   G_0098F8_MULTI_GPU_TILE_SIZE(x)                             (((x) >> 24) & 0x03)
+#define   C_0098F8_MULTI_GPU_TILE_SIZE                                0xFCFFFFFF
+#define   S_0098F8_ROW_SIZE(x)                                        (((x) & 0x03) << 28)
+#define   G_0098F8_ROW_SIZE(x)                                        (((x) >> 28) & 0x03)
+#define   C_0098F8_ROW_SIZE                                           0xCFFFFFFF
+#define   S_0098F8_NUM_LOWER_PIPES(x)                                 (((x) & 0x1) << 30)
+#define   G_0098F8_NUM_LOWER_PIPES(x)                                 (((x) >> 30) & 0x1)
+#define   C_0098F8_NUM_LOWER_PIPES                                    0xBFFFFFFF
 #define R_009910_GB_TILE_MODE0                                          0x009910
 #define   S_009910_MICRO_TILE_MODE(x)                                 (((x) & 0x03) << 0)
 #define   G_009910_MICRO_TILE_MODE(x)                                 (((x) >> 0) & 0x03)
@@ -4515,14 +4777,88 @@
 #define     V_009910_ADDR_SURF_4_BANK                               0x01
 #define     V_009910_ADDR_SURF_8_BANK                               0x02
 #define     V_009910_ADDR_SURF_16_BANK                              0x03
-/* CIK */
 #define   S_009910_MICRO_TILE_MODE_NEW(x)                             (((x) & 0x07) << 22)
 #define   G_009910_MICRO_TILE_MODE_NEW(x)                             (((x) >> 22) & 0x07)
-#define   C_009910_MICRO_TILE_MODE_NEW(x)                             0xFE3FFFFF
+#define   C_009910_MICRO_TILE_MODE_NEW                                0xFE3FFFFF
 #define     V_009910_ADDR_SURF_DISPLAY_MICRO_TILING                 0x00
 #define     V_009910_ADDR_SURF_THIN_MICRO_TILING                    0x01
 #define     V_009910_ADDR_SURF_DEPTH_MICRO_TILING                   0x02
 #define     V_009910_ADDR_SURF_ROTATED_MICRO_TILING                 0x03
+#define   S_009910_SAMPLE_SPLIT(x)                                    (((x) & 0x03) << 25)
+#define   G_009910_SAMPLE_SPLIT(x)                                    (((x) >> 25) & 0x03)
+#define   C_009910_SAMPLE_SPLIT                                       0xF9FFFFFF
+#define R_009914_GB_TILE_MODE1                                          0x009914
+#define R_009918_GB_TILE_MODE2                                          0x009918
+#define R_00991C_GB_TILE_MODE3                                          0x00991C
+#define R_009920_GB_TILE_MODE4                                          0x009920
+#define R_009924_GB_TILE_MODE5                                          0x009924
+#define R_009928_GB_TILE_MODE6                                          0x009928
+#define R_00992C_GB_TILE_MODE7                                          0x00992C
+#define R_009930_GB_TILE_MODE8                                          0x009930
+#define R_009934_GB_TILE_MODE9                                          0x009934
+#define R_009938_GB_TILE_MODE10                                         0x009938
+#define R_00993C_GB_TILE_MODE11                                         0x00993C
+#define R_009940_GB_TILE_MODE12                                         0x009940
+#define R_009944_GB_TILE_MODE13                                         0x009944
+#define R_009948_GB_TILE_MODE14                                         0x009948
+#define R_00994C_GB_TILE_MODE15                                         0x00994C
+#define R_009950_GB_TILE_MODE16                                         0x009950
+#define R_009954_GB_TILE_MODE17                                         0x009954
+#define R_009958_GB_TILE_MODE18                                         0x009958
+#define R_00995C_GB_TILE_MODE19                                         0x00995C
+#define R_009960_GB_TILE_MODE20                                         0x009960
+#define R_009964_GB_TILE_MODE21                                         0x009964
+#define R_009968_GB_TILE_MODE22                                         0x009968
+#define R_00996C_GB_TILE_MODE23                                         0x00996C
+#define R_009970_GB_TILE_MODE24                                         0x009970
+#define R_009974_GB_TILE_MODE25                                         0x009974
+#define R_009978_GB_TILE_MODE26                                         0x009978
+#define R_00997C_GB_TILE_MODE27                                         0x00997C
+#define R_009980_GB_TILE_MODE28                                         0x009980
+#define R_009984_GB_TILE_MODE29                                         0x009984
+#define R_009988_GB_TILE_MODE30                                         0x009988
+#define R_00998C_GB_TILE_MODE31                                         0x00998C
+/* CIK */
+#define R_009990_GB_MACROTILE_MODE0                                     0x009990
+#define   S_009990_BANK_WIDTH(x)                                      (((x) & 0x03) << 0)
+#define   G_009990_BANK_WIDTH(x)                                      (((x) >> 0) & 0x03)
+#define   C_009990_BANK_WIDTH                                         0xFFFFFFFC
+#define   S_009990_BANK_HEIGHT(x)                                     (((x) & 0x03) << 2)
+#define   G_009990_BANK_HEIGHT(x)                                     (((x) >> 2) & 0x03)
+#define   C_009990_BANK_HEIGHT                                        0xFFFFFFF3
+#define   S_009990_MACRO_TILE_ASPECT(x)                               (((x) & 0x03) << 4)
+#define   G_009990_MACRO_TILE_ASPECT(x)                               (((x) >> 4) & 0x03)
+#define   C_009990_MACRO_TILE_ASPECT                                  0xFFFFFFCF
+#define   S_009990_NUM_BANKS(x)                                       (((x) & 0x03) << 6)
+#define   G_009990_NUM_BANKS(x)                                       (((x) >> 6) & 0x03)
+#define   C_009990_NUM_BANKS                                          0xFFFFFF3F
+#define R_009994_GB_MACROTILE_MODE1                                     0x009994
+#define R_009998_GB_MACROTILE_MODE2                                     0x009998
+#define R_00999C_GB_MACROTILE_MODE3                                     0x00999C
+#define R_0099A0_GB_MACROTILE_MODE4                                     0x0099A0
+#define R_0099A4_GB_MACROTILE_MODE5                                     0x0099A4
+#define R_0099A8_GB_MACROTILE_MODE6                                     0x0099A8
+#define R_0099AC_GB_MACROTILE_MODE7                                     0x0099AC
+#define R_0099B0_GB_MACROTILE_MODE8                                     0x0099B0
+#define R_0099B4_GB_MACROTILE_MODE9                                     0x0099B4
+#define R_0099B8_GB_MACROTILE_MODE10                                    0x0099B8
+#define R_0099BC_GB_MACROTILE_MODE11                                    0x0099BC
+#define R_0099C0_GB_MACROTILE_MODE12                                    0x0099C0
+#define R_0099C4_GB_MACROTILE_MODE13                                    0x0099C4
+#define R_0099C8_GB_MACROTILE_MODE14                                    0x0099C8
+#define R_0099CC_GB_MACROTILE_MODE15                                    0x0099CC
+/*     */
+#define R_00B000_SPI_SHADER_TBA_LO_PS                                   0x00B000
+#define R_00B004_SPI_SHADER_TBA_HI_PS                                   0x00B004
+#define   S_00B004_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B004_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B004_MEM_BASE                                           0xFFFFFF00
+#define R_00B008_SPI_SHADER_TMA_LO_PS                                   0x00B008
+#define R_00B00C_SPI_SHADER_TMA_HI_PS                                   0x00B00C
+#define   S_00B00C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B00C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B00C_MEM_BASE                                           0xFFFFFF00
+/* CIK */
 #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS                                0x00B01C
 #define   S_00B01C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
 #define   G_00B01C_CU_EN(x)                                           (((x) >> 0) & 0xFFFF)
@@ -4582,6 +4918,9 @@
 #define   S_00B02C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B02C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B02C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B02C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B02C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B02C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B02C_WAVE_CNT_EN(x)                                     (((x) & 0x1) << 7)
 #define   G_00B02C_WAVE_CNT_EN(x)                                     (((x) >> 7) & 0x1)
 #define   C_00B02C_WAVE_CNT_EN                                        0xFFFFFF7F
@@ -4591,6 +4930,9 @@
 #define   S_00B02C_EXCP_EN(x)                                         (((x) & 0x7F) << 16) /* mask is 0x1FF on CIK */
 #define   G_00B02C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B02C_EXCP_EN                                            0xFF80FFFF /* mask is 0x1FF on CIK */
+#define   S_00B02C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 16)
+#define   G_00B02C_EXCP_EN_CIK(x)                                     (((x) >> 16) & 0x1FF)
+#define   C_00B02C_EXCP_EN_CIK                                        0xFE00FFFF
 #define R_00B030_SPI_SHADER_USER_DATA_PS_0                              0x00B030
 #define R_00B034_SPI_SHADER_USER_DATA_PS_1                              0x00B034
 #define R_00B038_SPI_SHADER_USER_DATA_PS_2                              0x00B038
@@ -4607,6 +4949,16 @@
 #define R_00B064_SPI_SHADER_USER_DATA_PS_13                             0x00B064
 #define R_00B068_SPI_SHADER_USER_DATA_PS_14                             0x00B068
 #define R_00B06C_SPI_SHADER_USER_DATA_PS_15                             0x00B06C
+#define R_00B100_SPI_SHADER_TBA_LO_VS                                   0x00B100
+#define R_00B104_SPI_SHADER_TBA_HI_VS                                   0x00B104
+#define   S_00B104_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B104_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B104_MEM_BASE                                           0xFFFFFF00
+#define R_00B108_SPI_SHADER_TMA_LO_VS                                   0x00B108
+#define R_00B10C_SPI_SHADER_TMA_HI_VS                                   0x00B10C
+#define   S_00B10C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B10C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B10C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B118_SPI_SHADER_PGM_RSRC3_VS                                0x00B118
 #define   S_00B118_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4674,6 +5026,9 @@
 #define   S_00B12C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B12C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B12C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B12C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B12C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B12C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B12C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B12C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B12C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4695,6 +5050,14 @@
 #define   S_00B12C_EXCP_EN(x)                                         (((x) & 0x7F) << 13) /* mask is 0x1FF on CIK */
 #define   G_00B12C_EXCP_EN(x)                                         (((x) >> 13) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B12C_EXCP_EN                                            0xFFF01FFF /* mask is 0x1FF on CIK */
+#define   S_00B12C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 13)
+#define   G_00B12C_EXCP_EN_CIK(x)                                     (((x) >> 13) & 0x1FF)
+#define   C_00B12C_EXCP_EN_CIK                                        0xFFC01FFF
+/* VI */
+#define   S_00B12C_DISPATCH_DRAW_EN(x)                                (((x) & 0x1) << 24)
+#define   G_00B12C_DISPATCH_DRAW_EN(x)                                (((x) >> 24) & 0x1)
+#define   C_00B12C_DISPATCH_DRAW_EN                                   0xFEFFFFFF
+/*    */
 #define R_00B130_SPI_SHADER_USER_DATA_VS_0                              0x00B130
 #define R_00B134_SPI_SHADER_USER_DATA_VS_1                              0x00B134
 #define R_00B138_SPI_SHADER_USER_DATA_VS_2                              0x00B138
@@ -4711,6 +5074,16 @@
 #define R_00B164_SPI_SHADER_USER_DATA_VS_13                             0x00B164
 #define R_00B168_SPI_SHADER_USER_DATA_VS_14                             0x00B168
 #define R_00B16C_SPI_SHADER_USER_DATA_VS_15                             0x00B16C
+#define R_00B200_SPI_SHADER_TBA_LO_GS                                   0x00B200
+#define R_00B204_SPI_SHADER_TBA_HI_GS                                   0x00B204
+#define   S_00B204_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B204_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B204_MEM_BASE                                           0xFFFFFF00
+#define R_00B208_SPI_SHADER_TMA_LO_GS                                   0x00B208
+#define R_00B20C_SPI_SHADER_TMA_HI_GS                                   0x00B20C
+#define   S_00B20C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B20C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B20C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B21C_SPI_SHADER_PGM_RSRC3_GS                                0x00B21C
 #define   S_00B21C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4723,6 +5096,11 @@
 #define   G_00B21C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B21C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B21C_GROUP_FIFO_DEPTH(x)                                (((x) & 0x3F) << 26)
+#define   G_00B21C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F)
+#define   C_00B21C_GROUP_FIFO_DEPTH                                   0x03FFFFFF
+/*    */
 #define R_00B220_SPI_SHADER_PGM_LO_GS                                   0x00B220
 #define R_00B224_SPI_SHADER_PGM_HI_GS                                   0x00B224
 #define   S_00B224_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4771,10 +5149,41 @@
 #define   S_00B22C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B22C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B22C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B22C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B22C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B22C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B22C_EXCP_EN(x)                                         (((x) & 0x7F) << 7) /* mask is 0x1FF on CIK */
 #define   G_00B22C_EXCP_EN(x)                                         (((x) >> 7) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B22C_EXCP_EN                                            0xFFFFC07F /* mask is 0x1FF on CIK */
+#define   S_00B22C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 7)
+#define   G_00B22C_EXCP_EN_CIK(x)                                     (((x) >> 7) & 0x1FF)
+#define   C_00B22C_EXCP_EN_CIK                                        0xFFFF007F
 #define R_00B230_SPI_SHADER_USER_DATA_GS_0                              0x00B230
+#define R_00B234_SPI_SHADER_USER_DATA_GS_1                              0x00B234
+#define R_00B238_SPI_SHADER_USER_DATA_GS_2                              0x00B238
+#define R_00B23C_SPI_SHADER_USER_DATA_GS_3                              0x00B23C
+#define R_00B240_SPI_SHADER_USER_DATA_GS_4                              0x00B240
+#define R_00B244_SPI_SHADER_USER_DATA_GS_5                              0x00B244
+#define R_00B248_SPI_SHADER_USER_DATA_GS_6                              0x00B248
+#define R_00B24C_SPI_SHADER_USER_DATA_GS_7                              0x00B24C
+#define R_00B250_SPI_SHADER_USER_DATA_GS_8                              0x00B250
+#define R_00B254_SPI_SHADER_USER_DATA_GS_9                              0x00B254
+#define R_00B258_SPI_SHADER_USER_DATA_GS_10                             0x00B258
+#define R_00B25C_SPI_SHADER_USER_DATA_GS_11                             0x00B25C
+#define R_00B260_SPI_SHADER_USER_DATA_GS_12                             0x00B260
+#define R_00B264_SPI_SHADER_USER_DATA_GS_13                             0x00B264
+#define R_00B268_SPI_SHADER_USER_DATA_GS_14                             0x00B268
+#define R_00B26C_SPI_SHADER_USER_DATA_GS_15                             0x00B26C
+#define R_00B300_SPI_SHADER_TBA_LO_ES                                   0x00B300
+#define R_00B304_SPI_SHADER_TBA_HI_ES                                   0x00B304
+#define   S_00B304_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B304_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B304_MEM_BASE                                           0xFFFFFF00
+#define R_00B308_SPI_SHADER_TMA_LO_ES                                   0x00B308
+#define R_00B30C_SPI_SHADER_TMA_HI_ES                                   0x00B30C
+#define   S_00B30C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B30C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B30C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B31C_SPI_SHADER_PGM_RSRC3_ES                                0x00B31C
 #define   S_00B31C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4787,6 +5196,11 @@
 #define   G_00B31C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B31C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B31C_GROUP_FIFO_DEPTH(x)                                (((x) & 0x3F) << 26)
+#define   G_00B31C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F)
+#define   C_00B31C_GROUP_FIFO_DEPTH                                   0x03FFFFFF
+/*    */
 #define R_00B320_SPI_SHADER_PGM_LO_ES                                   0x00B320
 #define R_00B324_SPI_SHADER_PGM_HI_ES                                   0x00B324
 #define   S_00B324_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4838,6 +5252,9 @@
 #define   S_00B32C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B32C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B32C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B32C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B32C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B32C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B32C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B32C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B32C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4848,6 +5265,31 @@
 #define   G_00B32C_LDS_SIZE(x)                                        (((x) >> 20) & 0x1FF) /* CIK, for on-chip GS */
 #define   C_00B32C_LDS_SIZE                                           0xE00FFFFF /* CIK, for on-chip GS */
 #define R_00B330_SPI_SHADER_USER_DATA_ES_0                              0x00B330
+#define R_00B334_SPI_SHADER_USER_DATA_ES_1                              0x00B334
+#define R_00B338_SPI_SHADER_USER_DATA_ES_2                              0x00B338
+#define R_00B33C_SPI_SHADER_USER_DATA_ES_3                              0x00B33C
+#define R_00B340_SPI_SHADER_USER_DATA_ES_4                              0x00B340
+#define R_00B344_SPI_SHADER_USER_DATA_ES_5                              0x00B344
+#define R_00B348_SPI_SHADER_USER_DATA_ES_6                              0x00B348
+#define R_00B34C_SPI_SHADER_USER_DATA_ES_7                              0x00B34C
+#define R_00B350_SPI_SHADER_USER_DATA_ES_8                              0x00B350
+#define R_00B354_SPI_SHADER_USER_DATA_ES_9                              0x00B354
+#define R_00B358_SPI_SHADER_USER_DATA_ES_10                             0x00B358
+#define R_00B35C_SPI_SHADER_USER_DATA_ES_11                             0x00B35C
+#define R_00B360_SPI_SHADER_USER_DATA_ES_12                             0x00B360
+#define R_00B364_SPI_SHADER_USER_DATA_ES_13                             0x00B364
+#define R_00B368_SPI_SHADER_USER_DATA_ES_14                             0x00B368
+#define R_00B36C_SPI_SHADER_USER_DATA_ES_15                             0x00B36C
+#define R_00B400_SPI_SHADER_TBA_LO_HS                                   0x00B400
+#define R_00B404_SPI_SHADER_TBA_HI_HS                                   0x00B404
+#define   S_00B404_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B404_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B404_MEM_BASE                                           0xFFFFFF00
+#define R_00B408_SPI_SHADER_TMA_LO_HS                                   0x00B408
+#define R_00B40C_SPI_SHADER_TMA_HI_HS                                   0x00B40C
+#define   S_00B40C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B40C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B40C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B41C_SPI_SHADER_PGM_RSRC3_HS                                0x00B41C
 #define   S_00B41C_WAVE_LIMIT(x)                                      (((x) & 0x3F) << 0)
@@ -4857,6 +5299,11 @@
 #define   G_00B41C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 6) & 0x0F)
 #define   C_00B41C_LOCK_LOW_THRESHOLD                                 0xFFFFFC3F
 /*     */
+/* VI */
+#define   S_00B41C_GROUP_FIFO_DEPTH(x)                                (((x) & 0x3F) << 10)
+#define   G_00B41C_GROUP_FIFO_DEPTH(x)                                (((x) >> 10) & 0x3F)
+#define   C_00B41C_GROUP_FIFO_DEPTH                                   0xFFFF03FF
+/*    */
 #define R_00B420_SPI_SHADER_PGM_LO_HS                                   0x00B420
 #define R_00B424_SPI_SHADER_PGM_HI_HS                                   0x00B424
 #define   S_00B424_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4902,6 +5349,9 @@
 #define   S_00B42C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B42C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B42C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B42C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B42C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B42C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B42C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B42C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B42C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4912,6 +5362,31 @@
 #define   G_00B42C_EXCP_EN(x)                                         (((x) >> 9) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B42C_EXCP_EN                                            0xFFFF01FF /* mask is 0x1FF on CIK */
 #define R_00B430_SPI_SHADER_USER_DATA_HS_0                              0x00B430
+#define R_00B434_SPI_SHADER_USER_DATA_HS_1                              0x00B434
+#define R_00B438_SPI_SHADER_USER_DATA_HS_2                              0x00B438
+#define R_00B43C_SPI_SHADER_USER_DATA_HS_3                              0x00B43C
+#define R_00B440_SPI_SHADER_USER_DATA_HS_4                              0x00B440
+#define R_00B444_SPI_SHADER_USER_DATA_HS_5                              0x00B444
+#define R_00B448_SPI_SHADER_USER_DATA_HS_6                              0x00B448
+#define R_00B44C_SPI_SHADER_USER_DATA_HS_7                              0x00B44C
+#define R_00B450_SPI_SHADER_USER_DATA_HS_8                              0x00B450
+#define R_00B454_SPI_SHADER_USER_DATA_HS_9                              0x00B454
+#define R_00B458_SPI_SHADER_USER_DATA_HS_10                             0x00B458
+#define R_00B45C_SPI_SHADER_USER_DATA_HS_11                             0x00B45C
+#define R_00B460_SPI_SHADER_USER_DATA_HS_12                             0x00B460
+#define R_00B464_SPI_SHADER_USER_DATA_HS_13                             0x00B464
+#define R_00B468_SPI_SHADER_USER_DATA_HS_14                             0x00B468
+#define R_00B46C_SPI_SHADER_USER_DATA_HS_15                             0x00B46C
+#define R_00B500_SPI_SHADER_TBA_LO_LS                                   0x00B500
+#define R_00B504_SPI_SHADER_TBA_HI_LS                                   0x00B504
+#define   S_00B504_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B504_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B504_MEM_BASE                                           0xFFFFFF00
+#define R_00B508_SPI_SHADER_TMA_LO_LS                                   0x00B508
+#define R_00B50C_SPI_SHADER_TMA_HI_LS                                   0x00B50C
+#define   S_00B50C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B50C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B50C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B51C_SPI_SHADER_PGM_RSRC3_LS                                0x00B51C
 #define   S_00B51C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4924,6 +5399,11 @@
 #define   G_00B51C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B51C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B51C_GROUP_FIFO_DEPTH(x)                                (((x) & 0x3F) << 26)
+#define   G_00B51C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F)
+#define   C_00B51C_GROUP_FIFO_DEPTH                                   0x03FFFFFF
+/*    */
 #define R_00B520_SPI_SHADER_PGM_LO_LS                                   0x00B520
 #define R_00B524_SPI_SHADER_PGM_HI_LS                                   0x00B524
 #define   S_00B524_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4972,6 +5452,9 @@
 #define   S_00B52C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B52C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B52C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B52C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B52C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B52C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B52C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 7)
 #define   G_00B52C_LDS_SIZE(x)                                        (((x) >> 7) & 0x1FF)
 #define   C_00B52C_LDS_SIZE                                           0xFFFF007F
@@ -4979,6 +5462,21 @@
 #define   G_00B52C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B52C_EXCP_EN                                            0xFF80FFFF /* mask is 0x1FF on CIK */
 #define R_00B530_SPI_SHADER_USER_DATA_LS_0                              0x00B530
+#define R_00B534_SPI_SHADER_USER_DATA_LS_1                              0x00B534
+#define R_00B538_SPI_SHADER_USER_DATA_LS_2                              0x00B538
+#define R_00B53C_SPI_SHADER_USER_DATA_LS_3                              0x00B53C
+#define R_00B540_SPI_SHADER_USER_DATA_LS_4                              0x00B540
+#define R_00B544_SPI_SHADER_USER_DATA_LS_5                              0x00B544
+#define R_00B548_SPI_SHADER_USER_DATA_LS_6                              0x00B548
+#define R_00B54C_SPI_SHADER_USER_DATA_LS_7                              0x00B54C
+#define R_00B550_SPI_SHADER_USER_DATA_LS_8                              0x00B550
+#define R_00B554_SPI_SHADER_USER_DATA_LS_9                              0x00B554
+#define R_00B558_SPI_SHADER_USER_DATA_LS_10                             0x00B558
+#define R_00B55C_SPI_SHADER_USER_DATA_LS_11                             0x00B55C
+#define R_00B560_SPI_SHADER_USER_DATA_LS_12                             0x00B560
+#define R_00B564_SPI_SHADER_USER_DATA_LS_13                             0x00B564
+#define R_00B568_SPI_SHADER_USER_DATA_LS_14                             0x00B568
+#define R_00B56C_SPI_SHADER_USER_DATA_LS_15                             0x00B56C
 #define R_00B800_COMPUTE_DISPATCH_INITIATOR                             0x00B800
 #define   S_00B800_COMPUTE_SHADER_EN(x)                               (((x) & 0x1) << 0)
 #define   G_00B800_COMPUTE_SHADER_EN(x)                               (((x) >> 0) & 0x1)
@@ -5049,6 +5547,16 @@
 #define   S_00B82C_MAX_WAVE_ID(x)                                     (((x) & 0xFFF) << 0)
 #define   G_00B82C_MAX_WAVE_ID(x)                                     (((x) >> 0) & 0xFFF)
 #define   C_00B82C_MAX_WAVE_ID                                        0xFFFFF000
+/* CIK */
+#define R_00B828_COMPUTE_PIPELINESTAT_ENABLE                            0x00B828
+#define   S_00B828_PIPELINESTAT_ENABLE(x)                             (((x) & 0x1) << 0)
+#define   G_00B828_PIPELINESTAT_ENABLE(x)                             (((x) >> 0) & 0x1)
+#define   C_00B828_PIPELINESTAT_ENABLE                                0xFFFFFFFE
+#define R_00B82C_COMPUTE_PERFCOUNT_ENABLE                               0x00B82C
+#define   S_00B82C_PERFCOUNT_ENABLE(x)                                (((x) & 0x1) << 0)
+#define   G_00B82C_PERFCOUNT_ENABLE(x)                                (((x) >> 0) & 0x1)
+#define   C_00B82C_PERFCOUNT_ENABLE                                   0xFFFFFFFE
+/*     */
 #define R_00B830_COMPUTE_PGM_LO                                         0x00B830
 #define R_00B834_COMPUTE_PGM_HI                                         0x00B834
 #define   S_00B834_DATA(x)                                            (((x) & 0xFF) << 0)
@@ -5059,6 +5567,16 @@
 #define   G_00B834_INST_ATC(x)                                        (((x) >> 8) & 0x1)
 #define   C_00B834_INST_ATC                                           0xFFFFFEFF
 /*     */
+#define R_00B838_COMPUTE_TBA_LO                                         0x00B838
+#define R_00B83C_COMPUTE_TBA_HI                                         0x00B83C
+#define   S_00B83C_DATA(x)                                            (((x) & 0xFF) << 0)
+#define   G_00B83C_DATA(x)                                            (((x) >> 0) & 0xFF)
+#define   C_00B83C_DATA                                               0xFFFFFF00
+#define R_00B840_COMPUTE_TMA_LO                                         0x00B840
+#define R_00B844_COMPUTE_TMA_HI                                         0x00B844
+#define   S_00B844_DATA(x)                                            (((x) & 0xFF) << 0)
+#define   G_00B844_DATA(x)                                            (((x) >> 0) & 0xFF)
+#define   C_00B844_DATA                                               0xFFFFFF00
 #define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
 #define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
 #define   G_00B848_VGPRS(x)                                           (((x) >> 0) & 0x3F)
@@ -5099,6 +5617,9 @@
 #define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B84C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B84C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B84C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B84C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B84C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B84C_TGID_X_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B84C_TGID_X_EN                                          0xFFFFFF7F
@@ -5125,6 +5646,10 @@
 #define   S_00B84C_EXCP_EN(x)                                         (((x) & 0x7F) << 24)
 #define   G_00B84C_EXCP_EN(x)                                         (((x) >> 24) & 0x7F)
 #define   C_00B84C_EXCP_EN                                            0x80FFFFFF
+#define R_00B850_COMPUTE_VMID                                           0x00B850
+#define   S_00B850_DATA(x)                                            (((x) & 0x0F) << 0)
+#define   G_00B850_DATA(x)                                            (((x) >> 0) & 0x0F)
+#define   C_00B850_DATA                                               0xFFFFFFF0
 #define R_00B854_COMPUTE_RESOURCE_LIMITS                                0x00B854
 #define   S_00B854_WAVES_PER_SH(x)                                    (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define   G_00B854_WAVES_PER_SH(x)                                    (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
@@ -5167,7 +5692,84 @@
 #define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
 #define   G_00B860_WAVESIZE(x)                                        (((x) >> 12) & 0x1FFF)
 #define   C_00B860_WAVESIZE                                           0xFE000FFF
+/* CIK */
+#define R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2                         0x00B864
+#define   S_00B864_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0)
+#define   G_00B864_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF)
+#define   C_00B864_SH0_CU_EN                                          0xFFFF0000
+#define   S_00B864_SH1_CU_EN(x)                                       (((x) & 0xFFFF) << 16)
+#define   G_00B864_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF)
+#define   C_00B864_SH1_CU_EN                                          0x0000FFFF
+#define R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3                         0x00B868
+#define   S_00B868_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0)
+#define   G_00B868_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF)
+#define   C_00B868_SH0_CU_EN                                          0xFFFF0000
+#define   S_00B868_SH1_CU_EN(x)                                       (((x) & 0xFFFF) << 16)
+#define   G_00B868_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF)
+#define   C_00B868_SH1_CU_EN                                          0x0000FFFF
+#define R_00B86C_COMPUTE_RESTART_X                                      0x00B86C
+#define R_00B870_COMPUTE_RESTART_Y                                      0x00B870
+#define R_00B874_COMPUTE_RESTART_Z                                      0x00B874
+#define R_00B87C_COMPUTE_MISC_RESERVED                                  0x00B87C
+#define   S_00B87C_SEND_SEID(x)                                       (((x) & 0x03) << 0)
+#define   G_00B87C_SEND_SEID(x)                                       (((x) >> 0) & 0x03)
+#define   C_00B87C_SEND_SEID                                          0xFFFFFFFC
+#define   S_00B87C_RESERVED2(x)                                       (((x) & 0x1) << 2)
+#define   G_00B87C_RESERVED2(x)                                       (((x) >> 2) & 0x1)
+#define   C_00B87C_RESERVED2                                          0xFFFFFFFB
+#define   S_00B87C_RESERVED3(x)                                       (((x) & 0x1) << 3)
+#define   G_00B87C_RESERVED3(x)                                       (((x) >> 3) & 0x1)
+#define   C_00B87C_RESERVED3                                          0xFFFFFFF7
+#define   S_00B87C_RESERVED4(x)                                       (((x) & 0x1) << 4)
+#define   G_00B87C_RESERVED4(x)                                       (((x) >> 4) & 0x1)
+#define   C_00B87C_RESERVED4                                          0xFFFFFFEF
+/* VI */
+#define   S_00B87C_WAVE_ID_BASE(x)                                    (((x) & 0xFFF) << 5)
+#define   G_00B87C_WAVE_ID_BASE(x)                                    (((x) >> 5) & 0xFFF)
+#define   C_00B87C_WAVE_ID_BASE                                       0xFFFE001F
+#define R_00B880_COMPUTE_DISPATCH_ID                                    0x00B880
+#define R_00B884_COMPUTE_THREADGROUP_ID                                 0x00B884
+#define R_00B888_COMPUTE_RELAUNCH                                       0x00B888
+#define   S_00B888_PAYLOAD(x)                                         (((x) & 0x3FFFFFFF) << 0)
+#define   G_00B888_PAYLOAD(x)                                         (((x) >> 0) & 0x3FFFFFFF)
+#define   C_00B888_PAYLOAD                                            0xC0000000
+#define   S_00B888_IS_EVENT(x)                                        (((x) & 0x1) << 30)
+#define   G_00B888_IS_EVENT(x)                                        (((x) >> 30) & 0x1)
+#define   C_00B888_IS_EVENT                                           0xBFFFFFFF
+#define   S_00B888_IS_STATE(x)                                        (((x) & 0x1) << 31)
+#define   G_00B888_IS_STATE(x)                                        (((x) >> 31) & 0x1)
+#define   C_00B888_IS_STATE                                           0x7FFFFFFF
+#define R_00B88C_COMPUTE_WAVE_RESTORE_ADDR_LO                           0x00B88C
+#define R_00B890_COMPUTE_WAVE_RESTORE_ADDR_HI                           0x00B890
+#define   S_00B890_ADDR(x)                                            (((x) & 0xFFFF) << 0)
+#define   G_00B890_ADDR(x)                                            (((x) >> 0) & 0xFFFF)
+#define   C_00B890_ADDR                                               0xFFFF0000
+#define R_00B894_COMPUTE_WAVE_RESTORE_CONTROL                           0x00B894
+#define   S_00B894_ATC(x)                                             (((x) & 0x1) << 0)
+#define   G_00B894_ATC(x)                                             (((x) >> 0) & 0x1)
+#define   C_00B894_ATC                                                0xFFFFFFFE
+#define   S_00B894_MTYPE(x)                                           (((x) & 0x03) << 1)
+#define   G_00B894_MTYPE(x)                                           (((x) >> 1) & 0x03)
+#define   C_00B894_MTYPE                                              0xFFFFFFF9
+/*    */
+/*     */
 #define R_00B900_COMPUTE_USER_DATA_0                                    0x00B900
+#define R_00B904_COMPUTE_USER_DATA_1                                    0x00B904
+#define R_00B908_COMPUTE_USER_DATA_2                                    0x00B908
+#define R_00B90C_COMPUTE_USER_DATA_3                                    0x00B90C
+#define R_00B910_COMPUTE_USER_DATA_4                                    0x00B910
+#define R_00B914_COMPUTE_USER_DATA_5                                    0x00B914
+#define R_00B918_COMPUTE_USER_DATA_6                                    0x00B918
+#define R_00B91C_COMPUTE_USER_DATA_7                                    0x00B91C
+#define R_00B920_COMPUTE_USER_DATA_8                                    0x00B920
+#define R_00B924_COMPUTE_USER_DATA_9                                    0x00B924
+#define R_00B928_COMPUTE_USER_DATA_10                                   0x00B928
+#define R_00B92C_COMPUTE_USER_DATA_11                                   0x00B92C
+#define R_00B930_COMPUTE_USER_DATA_12                                   0x00B930
+#define R_00B934_COMPUTE_USER_DATA_13                                   0x00B934
+#define R_00B938_COMPUTE_USER_DATA_14                                   0x00B938
+#define R_00B93C_COMPUTE_USER_DATA_15                                   0x00B93C
+#define R_00B9FC_COMPUTE_NOWHERE                                        0x00B9FC
 #define R_028000_DB_RENDER_CONTROL                                      0x028000
 #define   S_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) & 0x1) << 0)
 #define   G_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) >> 0) & 0x1)
@@ -5196,6 +5798,11 @@
 #define   S_028000_COPY_SAMPLE(x)                                     (((x) & 0x0F) << 8)
 #define   G_028000_COPY_SAMPLE(x)                                     (((x) >> 8) & 0x0F)
 #define   C_028000_COPY_SAMPLE                                        0xFFFFF0FF
+/* VI */
+#define   S_028000_DECOMPRESS_ENABLE(x)                               (((x) & 0x1) << 12)
+#define   G_028000_DECOMPRESS_ENABLE(x)                               (((x) >> 12) & 0x1)
+#define   C_028000_DECOMPRESS_ENABLE                                  0xFFFFEFFF
+/*    */
 #define R_028004_DB_COUNT_CONTROL                                       0x028004
 #define   S_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) & 0x1) << 0)
 #define   G_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) >> 0) & 0x1)
@@ -5474,9 +6081,6 @@
 #define   S_028040_NUM_SAMPLES(x)                                     (((x) & 0x03) << 2)
 #define   G_028040_NUM_SAMPLES(x)                                     (((x) >> 2) & 0x03)
 #define   C_028040_NUM_SAMPLES                                        0xFFFFFFF3
-#define   S_028040_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
-#define   G_028040_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
-#define   C_028040_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 /* CIK */
 #define   S_028040_TILE_SPLIT(x)                                      (((x) & 0x07) << 13)
 #define   G_028040_TILE_SPLIT(x)                                      (((x) >> 13) & 0x07)
@@ -5489,6 +6093,14 @@
 #define     V_028040_ADDR_SURF_TILE_SPLIT_2KB                       0x05
 #define     V_028040_ADDR_SURF_TILE_SPLIT_4KB                       0x06
 /*     */
+#define   S_028040_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
+#define   G_028040_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
+#define   C_028040_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
+/* VI */
+#define   S_028040_DECOMPRESS_ON_N_ZPLANES(x)                         (((x) & 0x0F) << 23)
+#define   G_028040_DECOMPRESS_ON_N_ZPLANES(x)                         (((x) >> 23) & 0x0F)
+#define   C_028040_DECOMPRESS_ON_N_ZPLANES                            0xF87FFFFF
+/*    */
 #define   S_028040_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
 #define   G_028040_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
 #define   C_028040_ALLOW_EXPCLEAR                                     0xF7FFFFFF
@@ -5498,6 +6110,11 @@
 #define   S_028040_TILE_SURFACE_ENABLE(x)                             (((x) & 0x1) << 29)
 #define   G_028040_TILE_SURFACE_ENABLE(x)                             (((x) >> 29) & 0x1)
 #define   C_028040_TILE_SURFACE_ENABLE                                0xDFFFFFFF
+/* VI */
+#define   S_028040_CLEAR_DISALLOWED(x)                                (((x) & 0x1) << 30)
+#define   G_028040_CLEAR_DISALLOWED(x)                                (((x) >> 30) & 0x1)
+#define   C_028040_CLEAR_DISALLOWED                                   0xBFFFFFFF
+/*    */
 #define   S_028040_ZRANGE_PRECISION(x)                                (((x) & 0x1) << 31)
 #define   G_028040_ZRANGE_PRECISION(x)                                (((x) >> 31) & 0x1)
 #define   C_028040_ZRANGE_PRECISION                                   0x7FFFFFFF
@@ -5507,9 +6124,6 @@
 #define   C_028044_FORMAT                                             0xFFFFFFFE
 #define     V_028044_STENCIL_INVALID                                0x00
 #define     V_028044_STENCIL_8                                      0x01
-#define   S_028044_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
-#define   G_028044_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
-#define   C_028044_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 /* CIK */
 #define   S_028044_TILE_SPLIT(x)                                      (((x) & 0x07) << 13)
 #define   G_028044_TILE_SPLIT(x)                                      (((x) >> 13) & 0x07)
@@ -5522,12 +6136,20 @@
 #define     V_028044_ADDR_SURF_TILE_SPLIT_2KB                       0x05
 #define     V_028044_ADDR_SURF_TILE_SPLIT_4KB                       0x06
 /*     */
+#define   S_028044_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
+#define   G_028044_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
+#define   C_028044_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 #define   S_028044_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
 #define   G_028044_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
 #define   C_028044_ALLOW_EXPCLEAR                                     0xF7FFFFFF
 #define   S_028044_TILE_STENCIL_DISABLE(x)                            (((x) & 0x1) << 29)
 #define   G_028044_TILE_STENCIL_DISABLE(x)                            (((x) >> 29) & 0x1)
 #define   C_028044_TILE_STENCIL_DISABLE                               0xDFFFFFFF
+/* VI */
+#define   S_028044_CLEAR_DISALLOWED(x)                                (((x) & 0x1) << 30)
+#define   G_028044_CLEAR_DISALLOWED(x)                                (((x) >> 30) & 0x1)
+#define   C_028044_CLEAR_DISALLOWED                                   0xBFFFFFFF
+/*    */
 #define R_028048_DB_Z_READ_BASE                                         0x028048
 #define R_02804C_DB_STENCIL_READ_BASE                                   0x02804C
 #define R_028050_DB_Z_WRITE_BASE                                        0x028050
@@ -5549,7 +6171,13 @@
 #define   S_028084_ADDRESS(x)                                         (((x) & 0xFF) << 0)
 #define   G_028084_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
 #define   C_028084_ADDRESS                                            0xFFFFFF00
-/* */
+#define R_0281E8_COHER_DEST_BASE_HI_0                                   0x0281E8
+#define R_0281EC_COHER_DEST_BASE_HI_1                                   0x0281EC
+#define R_0281F0_COHER_DEST_BASE_HI_2                                   0x0281F0
+#define R_0281F4_COHER_DEST_BASE_HI_3                                   0x0281F4
+/*     */
+#define R_0281F8_COHER_DEST_BASE_2                                      0x0281F8
+#define R_0281FC_COHER_DEST_BASE_3                                      0x0281FC
 #define R_028200_PA_SC_WINDOW_OFFSET                                    0x028200
 #define   S_028200_WINDOW_X_OFFSET(x)                                 (((x) & 0xFFFF) << 0)
 #define   G_028200_WINDOW_X_OFFSET(x)                                 (((x) >> 0) & 0xFFFF)
@@ -5694,6 +6322,8 @@
 #define   S_028244_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
 #define   G_028244_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
 #define   C_028244_BR_Y                                               0x8000FFFF
+#define R_028248_COHER_DEST_BASE_0                                      0x028248
+#define R_02824C_COHER_DEST_BASE_1                                      0x02824C
 #define R_028250_PA_SC_VPORT_SCISSOR_0_TL                               0x028250
 #define   S_028250_TL_X(x)                                            (((x) & 0x7FFF) << 0)
 #define   G_028250_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
@@ -5711,8 +6341,68 @@
 #define   S_028254_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
 #define   G_028254_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
 #define   C_028254_BR_Y                                               0x8000FFFF
+#define R_028258_PA_SC_VPORT_SCISSOR_1_TL                               0x028258
+#define R_02825C_PA_SC_VPORT_SCISSOR_1_BR                               0x02825C
+#define R_028260_PA_SC_VPORT_SCISSOR_2_TL                               0x028260
+#define R_028264_PA_SC_VPORT_SCISSOR_2_BR                               0x028264
+#define R_028268_PA_SC_VPORT_SCISSOR_3_TL                               0x028268
+#define R_02826C_PA_SC_VPORT_SCISSOR_3_BR                               0x02826C
+#define R_028270_PA_SC_VPORT_SCISSOR_4_TL                               0x028270
+#define R_028274_PA_SC_VPORT_SCISSOR_4_BR                               0x028274
+#define R_028278_PA_SC_VPORT_SCISSOR_5_TL                               0x028278
+#define R_02827C_PA_SC_VPORT_SCISSOR_5_BR                               0x02827C
+#define R_028280_PA_SC_VPORT_SCISSOR_6_TL                               0x028280
+#define R_028284_PA_SC_VPORT_SCISSOR_6_BR                               0x028284
+#define R_028288_PA_SC_VPORT_SCISSOR_7_TL                               0x028288
+#define R_02828C_PA_SC_VPORT_SCISSOR_7_BR                               0x02828C
+#define R_028290_PA_SC_VPORT_SCISSOR_8_TL                               0x028290
+#define R_028294_PA_SC_VPORT_SCISSOR_8_BR                               0x028294
+#define R_028298_PA_SC_VPORT_SCISSOR_9_TL                               0x028298
+#define R_02829C_PA_SC_VPORT_SCISSOR_9_BR                               0x02829C
+#define R_0282A0_PA_SC_VPORT_SCISSOR_10_TL                              0x0282A0
+#define R_0282A4_PA_SC_VPORT_SCISSOR_10_BR                              0x0282A4
+#define R_0282A8_PA_SC_VPORT_SCISSOR_11_TL                              0x0282A8
+#define R_0282AC_PA_SC_VPORT_SCISSOR_11_BR                              0x0282AC
+#define R_0282B0_PA_SC_VPORT_SCISSOR_12_TL                              0x0282B0
+#define R_0282B4_PA_SC_VPORT_SCISSOR_12_BR                              0x0282B4
+#define R_0282B8_PA_SC_VPORT_SCISSOR_13_TL                              0x0282B8
+#define R_0282BC_PA_SC_VPORT_SCISSOR_13_BR                              0x0282BC
+#define R_0282C0_PA_SC_VPORT_SCISSOR_14_TL                              0x0282C0
+#define R_0282C4_PA_SC_VPORT_SCISSOR_14_BR                              0x0282C4
+#define R_0282C8_PA_SC_VPORT_SCISSOR_15_TL                              0x0282C8
+#define R_0282CC_PA_SC_VPORT_SCISSOR_15_BR                              0x0282CC
 #define R_0282D0_PA_SC_VPORT_ZMIN_0                                     0x0282D0
 #define R_0282D4_PA_SC_VPORT_ZMAX_0                                     0x0282D4
+#define R_0282D8_PA_SC_VPORT_ZMIN_1                                     0x0282D8
+#define R_0282DC_PA_SC_VPORT_ZMAX_1                                     0x0282DC
+#define R_0282E0_PA_SC_VPORT_ZMIN_2                                     0x0282E0
+#define R_0282E4_PA_SC_VPORT_ZMAX_2                                     0x0282E4
+#define R_0282E8_PA_SC_VPORT_ZMIN_3                                     0x0282E8
+#define R_0282EC_PA_SC_VPORT_ZMAX_3                                     0x0282EC
+#define R_0282F0_PA_SC_VPORT_ZMIN_4                                     0x0282F0
+#define R_0282F4_PA_SC_VPORT_ZMAX_4                                     0x0282F4
+#define R_0282F8_PA_SC_VPORT_ZMIN_5                                     0x0282F8
+#define R_0282FC_PA_SC_VPORT_ZMAX_5                                     0x0282FC
+#define R_028300_PA_SC_VPORT_ZMIN_6                                     0x028300
+#define R_028304_PA_SC_VPORT_ZMAX_6                                     0x028304
+#define R_028308_PA_SC_VPORT_ZMIN_7                                     0x028308
+#define R_02830C_PA_SC_VPORT_ZMAX_7                                     0x02830C
+#define R_028310_PA_SC_VPORT_ZMIN_8                                     0x028310
+#define R_028314_PA_SC_VPORT_ZMAX_8                                     0x028314
+#define R_028318_PA_SC_VPORT_ZMIN_9                                     0x028318
+#define R_02831C_PA_SC_VPORT_ZMAX_9                                     0x02831C
+#define R_028320_PA_SC_VPORT_ZMIN_10                                    0x028320
+#define R_028324_PA_SC_VPORT_ZMAX_10                                    0x028324
+#define R_028328_PA_SC_VPORT_ZMIN_11                                    0x028328
+#define R_02832C_PA_SC_VPORT_ZMAX_11                                    0x02832C
+#define R_028330_PA_SC_VPORT_ZMIN_12                                    0x028330
+#define R_028334_PA_SC_VPORT_ZMAX_12                                    0x028334
+#define R_028338_PA_SC_VPORT_ZMIN_13                                    0x028338
+#define R_02833C_PA_SC_VPORT_ZMAX_13                                    0x02833C
+#define R_028340_PA_SC_VPORT_ZMIN_14                                    0x028340
+#define R_028344_PA_SC_VPORT_ZMAX_14                                    0x028344
+#define R_028348_PA_SC_VPORT_ZMIN_15                                    0x028348
+#define R_02834C_PA_SC_VPORT_ZMAX_15                                    0x02834C
 #define R_028350_PA_SC_RASTER_CONFIG                                    0x028350
 #define   S_028350_RB_MAP_PKR0(x)                                     (((x) & 0x03) << 0)
 #define   G_028350_RB_MAP_PKR0(x)                                     (((x) >> 0) & 0x03)
@@ -5834,6 +6524,13 @@
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_16_WIDE_TILE        0x01
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_32_WIDE_TILE        0x02
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_64_WIDE_TILE        0x03
+#define R_028358_PA_SC_SCREEN_EXTENT_CONTROL                            0x028358
+#define   S_028358_SLICE_EVEN_ENABLE(x)                               (((x) & 0x03) << 0)
+#define   G_028358_SLICE_EVEN_ENABLE(x)                               (((x) >> 0) & 0x03)
+#define   C_028358_SLICE_EVEN_ENABLE                                  0xFFFFFFFC
+#define   S_028358_SLICE_ODD_ENABLE(x)                                (((x) & 0x03) << 2)
+#define   G_028358_SLICE_ODD_ENABLE(x)                                (((x) >> 2) & 0x03)
+#define   C_028358_SLICE_ODD_ENABLE                                   0xFFFFFFF3
 /*     */
 #define R_028400_VGT_MAX_VTX_INDX                                       0x028400
 #define R_028404_VGT_MIN_VTX_INDX                                       0x028404
@@ -5843,6 +6540,18 @@
 #define R_028418_CB_BLEND_GREEN                                         0x028418
 #define R_02841C_CB_BLEND_BLUE                                          0x02841C
 #define R_028420_CB_BLEND_ALPHA                                         0x028420
+/* VI */
+#define R_028424_CB_DCC_CONTROL                                         0x028424
+#define   S_028424_OVERWRITE_COMBINER_DISABLE(x)                      (((x) & 0x1) << 0)
+#define   G_028424_OVERWRITE_COMBINER_DISABLE(x)                      (((x) >> 0) & 0x1)
+#define   C_028424_OVERWRITE_COMBINER_DISABLE                         0xFFFFFFFE
+#define   S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x)          (((x) & 0x1) << 1)
+#define   G_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x)          (((x) >> 1) & 0x1)
+#define   C_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE             0xFFFFFFFD
+#define   S_028424_OVERWRITE_COMBINER_WATERMARK(x)                    (((x) & 0x1F) << 2)
+#define   G_028424_OVERWRITE_COMBINER_WATERMARK(x)                    (((x) >> 2) & 0x1F)
+#define   C_028424_OVERWRITE_COMBINER_WATERMARK                       0xFFFFFF83
+/*    */
 #define R_02842C_DB_STENCIL_CONTROL                                     0x02842C
 #define   S_02842C_STENCILFAIL(x)                                     (((x) & 0x0F) << 0)
 #define   G_02842C_STENCILFAIL(x)                                     (((x) >> 0) & 0x0F)
@@ -5984,12 +6693,102 @@
 #define   S_028434_STENCILOPVAL_BF(x)                                 (((x) & 0xFF) << 24)
 #define   G_028434_STENCILOPVAL_BF(x)                                 (((x) >> 24) & 0xFF)
 #define   C_028434_STENCILOPVAL_BF                                    0x00FFFFFF
-#define R_02843C_PA_CL_VPORT_XSCALE_0                                   0x02843C
-#define R_028440_PA_CL_VPORT_XOFFSET_0                                  0x028440
-#define R_028444_PA_CL_VPORT_YSCALE_0                                   0x028444
-#define R_028448_PA_CL_VPORT_YOFFSET_0                                  0x028448
-#define R_02844C_PA_CL_VPORT_ZSCALE_0                                   0x02844C
-#define R_028450_PA_CL_VPORT_ZOFFSET_0                                  0x028450
+#define R_02843C_PA_CL_VPORT_XSCALE                                     0x02843C
+#define R_028440_PA_CL_VPORT_XOFFSET                                    0x028440
+#define R_028444_PA_CL_VPORT_YSCALE                                     0x028444
+#define R_028448_PA_CL_VPORT_YOFFSET                                    0x028448
+#define R_02844C_PA_CL_VPORT_ZSCALE                                     0x02844C
+#define R_028450_PA_CL_VPORT_ZOFFSET                                    0x028450
+#define R_028454_PA_CL_VPORT_XSCALE_1                                   0x028454
+#define R_028458_PA_CL_VPORT_XOFFSET_1                                  0x028458
+#define R_02845C_PA_CL_VPORT_YSCALE_1                                   0x02845C
+#define R_028460_PA_CL_VPORT_YOFFSET_1                                  0x028460
+#define R_028464_PA_CL_VPORT_ZSCALE_1                                   0x028464
+#define R_028468_PA_CL_VPORT_ZOFFSET_1                                  0x028468
+#define R_02846C_PA_CL_VPORT_XSCALE_2                                   0x02846C
+#define R_028470_PA_CL_VPORT_XOFFSET_2                                  0x028470
+#define R_028474_PA_CL_VPORT_YSCALE_2                                   0x028474
+#define R_028478_PA_CL_VPORT_YOFFSET_2                                  0x028478
+#define R_02847C_PA_CL_VPORT_ZSCALE_2                                   0x02847C
+#define R_028480_PA_CL_VPORT_ZOFFSET_2                                  0x028480
+#define R_028484_PA_CL_VPORT_XSCALE_3                                   0x028484
+#define R_028488_PA_CL_VPORT_XOFFSET_3                                  0x028488
+#define R_02848C_PA_CL_VPORT_YSCALE_3                                   0x02848C
+#define R_028490_PA_CL_VPORT_YOFFSET_3                                  0x028490
+#define R_028494_PA_CL_VPORT_ZSCALE_3                                   0x028494
+#define R_028498_PA_CL_VPORT_ZOFFSET_3                                  0x028498
+#define R_02849C_PA_CL_VPORT_XSCALE_4                                   0x02849C
+#define R_0284A0_PA_CL_VPORT_XOFFSET_4                                  0x0284A0
+#define R_0284A4_PA_CL_VPORT_YSCALE_4                                   0x0284A4
+#define R_0284A8_PA_CL_VPORT_YOFFSET_4                                  0x0284A8
+#define R_0284AC_PA_CL_VPORT_ZSCALE_4                                   0x0284AC
+#define R_0284B0_PA_CL_VPORT_ZOFFSET_4                                  0x0284B0
+#define R_0284B4_PA_CL_VPORT_XSCALE_5                                   0x0284B4
+#define R_0284B8_PA_CL_VPORT_XOFFSET_5                                  0x0284B8
+#define R_0284BC_PA_CL_VPORT_YSCALE_5                                   0x0284BC
+#define R_0284C0_PA_CL_VPORT_YOFFSET_5                                  0x0284C0
+#define R_0284C4_PA_CL_VPORT_ZSCALE_5                                   0x0284C4
+#define R_0284C8_PA_CL_VPORT_ZOFFSET_5                                  0x0284C8
+#define R_0284CC_PA_CL_VPORT_XSCALE_6                                   0x0284CC
+#define R_0284D0_PA_CL_VPORT_XOFFSET_6                                  0x0284D0
+#define R_0284D4_PA_CL_VPORT_YSCALE_6                                   0x0284D4
+#define R_0284D8_PA_CL_VPORT_YOFFSET_6                                  0x0284D8
+#define R_0284DC_PA_CL_VPORT_ZSCALE_6                                   0x0284DC
+#define R_0284E0_PA_CL_VPORT_ZOFFSET_6                                  0x0284E0
+#define R_0284E4_PA_CL_VPORT_XSCALE_7                                   0x0284E4
+#define R_0284E8_PA_CL_VPORT_XOFFSET_7                                  0x0284E8
+#define R_0284EC_PA_CL_VPORT_YSCALE_7                                   0x0284EC
+#define R_0284F0_PA_CL_VPORT_YOFFSET_7                                  0x0284F0
+#define R_0284F4_PA_CL_VPORT_ZSCALE_7                                   0x0284F4
+#define R_0284F8_PA_CL_VPORT_ZOFFSET_7                                  0x0284F8
+#define R_0284FC_PA_CL_VPORT_XSCALE_8                                   0x0284FC
+#define R_028500_PA_CL_VPORT_XOFFSET_8                                  0x028500
+#define R_028504_PA_CL_VPORT_YSCALE_8                                   0x028504
+#define R_028508_PA_CL_VPORT_YOFFSET_8                                  0x028508
+#define R_02850C_PA_CL_VPORT_ZSCALE_8                                   0x02850C
+#define R_028510_PA_CL_VPORT_ZOFFSET_8                                  0x028510
+#define R_028514_PA_CL_VPORT_XSCALE_9                                   0x028514
+#define R_028518_PA_CL_VPORT_XOFFSET_9                                  0x028518
+#define R_02851C_PA_CL_VPORT_YSCALE_9                                   0x02851C
+#define R_028520_PA_CL_VPORT_YOFFSET_9                                  0x028520
+#define R_028524_PA_CL_VPORT_ZSCALE_9                                   0x028524
+#define R_028528_PA_CL_VPORT_ZOFFSET_9                                  0x028528
+#define R_02852C_PA_CL_VPORT_XSCALE_10                                  0x02852C
+#define R_028530_PA_CL_VPORT_XOFFSET_10                                 0x028530
+#define R_028534_PA_CL_VPORT_YSCALE_10                                  0x028534
+#define R_028538_PA_CL_VPORT_YOFFSET_10                                 0x028538
+#define R_02853C_PA_CL_VPORT_ZSCALE_10                                  0x02853C
+#define R_028540_PA_CL_VPORT_ZOFFSET_10                                 0x028540
+#define R_028544_PA_CL_VPORT_XSCALE_11                                  0x028544
+#define R_028548_PA_CL_VPORT_XOFFSET_11                                 0x028548
+#define R_02854C_PA_CL_VPORT_YSCALE_11                                  0x02854C
+#define R_028550_PA_CL_VPORT_YOFFSET_11                                 0x028550
+#define R_028554_PA_CL_VPORT_ZSCALE_11                                  0x028554
+#define R_028558_PA_CL_VPORT_ZOFFSET_11                                 0x028558
+#define R_02855C_PA_CL_VPORT_XSCALE_12                                  0x02855C
+#define R_028560_PA_CL_VPORT_XOFFSET_12                                 0x028560
+#define R_028564_PA_CL_VPORT_YSCALE_12                                  0x028564
+#define R_028568_PA_CL_VPORT_YOFFSET_12                                 0x028568
+#define R_02856C_PA_CL_VPORT_ZSCALE_12                                  0x02856C
+#define R_028570_PA_CL_VPORT_ZOFFSET_12                                 0x028570
+#define R_028574_PA_CL_VPORT_XSCALE_13                                  0x028574
+#define R_028578_PA_CL_VPORT_XOFFSET_13                                 0x028578
+#define R_02857C_PA_CL_VPORT_YSCALE_13                                  0x02857C
+#define R_028580_PA_CL_VPORT_YOFFSET_13                                 0x028580
+#define R_028584_PA_CL_VPORT_ZSCALE_13                                  0x028584
+#define R_028588_PA_CL_VPORT_ZOFFSET_13                                 0x028588
+#define R_02858C_PA_CL_VPORT_XSCALE_14                                  0x02858C
+#define R_028590_PA_CL_VPORT_XOFFSET_14                                 0x028590
+#define R_028594_PA_CL_VPORT_YSCALE_14                                  0x028594
+#define R_028598_PA_CL_VPORT_YOFFSET_14                                 0x028598
+#define R_02859C_PA_CL_VPORT_ZSCALE_14                                  0x02859C
+#define R_0285A0_PA_CL_VPORT_ZOFFSET_14                                 0x0285A0
+#define R_0285A4_PA_CL_VPORT_XSCALE_15                                  0x0285A4
+#define R_0285A8_PA_CL_VPORT_XOFFSET_15                                 0x0285A8
+#define R_0285AC_PA_CL_VPORT_YSCALE_15                                  0x0285AC
+#define R_0285B0_PA_CL_VPORT_YOFFSET_15                                 0x0285B0
+#define R_0285B4_PA_CL_VPORT_ZSCALE_15                                  0x0285B4
+#define R_0285B8_PA_CL_VPORT_ZOFFSET_15                                 0x0285B8
 #define R_0285BC_PA_CL_UCP_0_X                                          0x0285BC
 #define R_0285C0_PA_CL_UCP_0_Y                                          0x0285C0
 #define R_0285C4_PA_CL_UCP_0_Z                                          0x0285C4
@@ -6036,6 +6835,26 @@
 #define   G_028644_DUP(x)                                             (((x) >> 18) & 0x1)
 #define   C_028644_DUP                                                0xFFFBFFFF
 /*     */
+/* VI */
+#define   S_028644_FP16_INTERP_MODE(x)                                (((x) & 0x1) << 19)
+#define   G_028644_FP16_INTERP_MODE(x)                                (((x) >> 19) & 0x1)
+#define   C_028644_FP16_INTERP_MODE                                   0xFFF7FFFF
+#define   S_028644_USE_DEFAULT_ATTR1(x)                               (((x) & 0x1) << 20)
+#define   G_028644_USE_DEFAULT_ATTR1(x)                               (((x) >> 20) & 0x1)
+#define   C_028644_USE_DEFAULT_ATTR1                                  0xFFEFFFFF
+#define   S_028644_DEFAULT_VAL_ATTR1(x)                               (((x) & 0x03) << 21)
+#define   G_028644_DEFAULT_VAL_ATTR1(x)                               (((x) >> 21) & 0x03)
+#define   C_028644_DEFAULT_VAL_ATTR1                                  0xFF9FFFFF
+#define   S_028644_PT_SPRITE_TEX_ATTR1(x)                             (((x) & 0x1) << 23)
+#define   G_028644_PT_SPRITE_TEX_ATTR1(x)                             (((x) >> 23) & 0x1)
+#define   C_028644_PT_SPRITE_TEX_ATTR1                                0xFF7FFFFF
+#define   S_028644_ATTR0_VALID(x)                                     (((x) & 0x1) << 24)
+#define   G_028644_ATTR0_VALID(x)                                     (((x) >> 24) & 0x1)
+#define   C_028644_ATTR0_VALID                                        0xFEFFFFFF
+#define   S_028644_ATTR1_VALID(x)                                     (((x) & 0x1) << 25)
+#define   G_028644_ATTR1_VALID(x)                                     (((x) >> 25) & 0x1)
+#define   C_028644_ATTR1_VALID                                        0xFDFFFFFF
+/*    */
 #define R_028648_SPI_PS_INPUT_CNTL_1                                    0x028648
 #define R_02864C_SPI_PS_INPUT_CNTL_2                                    0x02864C
 #define R_028650_SPI_PS_INPUT_CNTL_3                                    0x028650
@@ -6559,6 +7378,10 @@
 #define R_028794_CB_BLEND5_CONTROL                                      0x028794
 #define R_028798_CB_BLEND6_CONTROL                                      0x028798
 #define R_02879C_CB_BLEND7_CONTROL                                      0x02879C
+#define R_0287CC_CS_COPY_STATE                                          0x0287CC
+#define   S_0287CC_SRC_STATE_ID(x)                                    (((x) & 0x07) << 0)
+#define   G_0287CC_SRC_STATE_ID(x)                                    (((x) >> 0) & 0x07)
+#define   C_0287CC_SRC_STATE_ID                                       0xFFFFFFF8
 #define R_0287D4_PA_CL_POINT_X_RAD                                      0x0287D4
 #define R_0287D8_PA_CL_POINT_Y_RAD                                      0x0287D8
 #define R_0287DC_PA_CL_POINT_SIZE                                       0x0287DC
@@ -6588,6 +7411,10 @@
 #define   G_0287F0_USE_OPAQUE(x)                                      (((x) >> 6) & 0x1)
 #define   C_0287F0_USE_OPAQUE                                         0xFFFFFFBF
 #define R_0287F4_VGT_IMMED_DATA                                         0x0287F4 /* not on CIK */
+#define R_0287F8_VGT_EVENT_ADDRESS_REG                                  0x0287F8
+#define   S_0287F8_ADDRESS_LOW(x)                                     (((x) & 0xFFFFFFF) << 0)
+#define   G_0287F8_ADDRESS_LOW(x)                                     (((x) >> 0) & 0xFFFFFFF)
+#define   C_0287F8_ADDRESS_LOW                                        0xF0000000
 #define R_028800_DB_DEPTH_CONTROL                                       0x028800
 #define   S_028800_STENCIL_ENABLE(x)                                  (((x) & 0x1) << 0)
 #define   G_028800_STENCIL_ENABLE(x)                                  (((x) >> 0) & 0x1)
@@ -6644,36 +7471,42 @@
 #define   G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x)              (((x) >> 31) & 0x1)
 #define   C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS                 0x7FFFFFFF
 #define R_028804_DB_EQAA                                                0x028804
-#define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
-#define   G_028804_MAX_ANCHOR_SAMPLES(x)		(((x) >> 0) & 0x7)
-#define   C_028804_MAX_ANCHOR_SAMPLES			(~(((~0) & 0x7) << 0))
-#define   S_028804_PS_ITER_SAMPLES(x)			(((x) & 0x7) << 4)
-#define   G_028804_PS_ITER_SAMPLES(x)			(((x) >> 4) & 0x7)
-#define   C_028804_PS_ITER_SAMPLES			(~(((~0) & 0x7) << 4))
-#define   S_028804_MASK_EXPORT_NUM_SAMPLES(x)		(((x) & 0x7) << 8)
-#define   G_028804_MASK_EXPORT_NUM_SAMPLES(x)		(((x) >> 8) & 0x7)
-#define   C_028804_MASK_EXPORT_NUM_SAMPLES		(~(((~0) & 0x7) << 8))
-#define   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)		(((x) & 0x7) << 12)
-#define   G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)		(((x) >> 12) & 0x7)
-#define   C_028804_ALPHA_TO_MASK_NUM_SAMPLES		(~(((~0) & 0x7) << 12))
-#define   S_028804_HIGH_QUALITY_INTERSECTIONS(x)	(((x) & 0x1) << 16)
-#define   G_028804_HIGH_QUALITY_INTERSECTIONS(x)	(((x) >> 16) & 0x1)
-#define   C_028804_HIGH_QUALITY_INTERSECTIONS		(~(((~0) & 0x1) << 16))
-#define   S_028804_INCOHERENT_EQAA_READS(x)		(((x) & 0x1) << 17)
-#define   G_028804_INCOHERENT_EQAA_READS(x)		(((x) >> 17) & 0x1)
-#define   C_028804_INCOHERENT_EQAA_READS		(~(((~0) & 0x1) << 17))
-#define   S_028804_INTERPOLATE_COMP_Z(x)		(((x) & 0x1) << 18)
-#define   G_028804_INTERPOLATE_COMP_Z(x)		(((x) >> 18) & 0x1)
-#define   C_028804_INTERPOLATE_COMP_Z			(~(((~0) >> 18) & 0x1))
-#define   S_028804_INTERPOLATE_SRC_Z(x)			(((x) & 0x1) << 19)
-#define   G_028804_INTERPOLATE_SRC_Z(x)			(((x) >> 19) & 0x1)
-#define   C_028804_INTERPOLATE_SRC_Z			(~(((~0) & 0x1) << 19))
-#define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) & 0x1) << 20)
-#define   G_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) >> 20) & 0x1)
-#define   C_028804_STATIC_ANCHOR_ASSOCIATIONS		(~(((~0) & 0x1) << 20))
-#define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) & 0x1) << 21)
-#define   G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) >> 21) & 0x1)
-#define   C_028804_ALPHA_TO_MASK_EQAA_DISABLE		(~(((~0) & 0x1) << 21))
+#define   S_028804_MAX_ANCHOR_SAMPLES(x)                              (((x) & 0x7) << 0)
+#define   G_028804_MAX_ANCHOR_SAMPLES(x)                              (((x) >> 0) & 0x07)
+#define   C_028804_MAX_ANCHOR_SAMPLES                                 0xFFFFFFF8
+#define   S_028804_PS_ITER_SAMPLES(x)                                 (((x) & 0x7) << 4)
+#define   G_028804_PS_ITER_SAMPLES(x)                                 (((x) >> 4) & 0x07)
+#define   C_028804_PS_ITER_SAMPLES                                    0xFFFFFF8F
+#define   S_028804_MASK_EXPORT_NUM_SAMPLES(x)                         (((x) & 0x7) << 8)
+#define   G_028804_MASK_EXPORT_NUM_SAMPLES(x)                         (((x) >> 8) & 0x07)
+#define   C_028804_MASK_EXPORT_NUM_SAMPLES                            0xFFFFF8FF
+#define   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)                       (((x) & 0x7) << 12)
+#define   G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)                       (((x) >> 12) & 0x07)
+#define   C_028804_ALPHA_TO_MASK_NUM_SAMPLES                          0xFFFF8FFF
+#define   S_028804_HIGH_QUALITY_INTERSECTIONS(x)                      (((x) & 0x1) << 16)
+#define   G_028804_HIGH_QUALITY_INTERSECTIONS(x)                      (((x) >> 16) & 0x1)
+#define   C_028804_HIGH_QUALITY_INTERSECTIONS                         0xFFFEFFFF
+#define   S_028804_INCOHERENT_EQAA_READS(x)                           (((x) & 0x1) << 17)
+#define   G_028804_INCOHERENT_EQAA_READS(x)                           (((x) >> 17) & 0x1)
+#define   C_028804_INCOHERENT_EQAA_READS                              0xFFFDFFFF
+#define   S_028804_INTERPOLATE_COMP_Z(x)                              (((x) & 0x1) << 18)
+#define   G_028804_INTERPOLATE_COMP_Z(x)                              (((x) >> 18) & 0x1)
+#define   C_028804_INTERPOLATE_COMP_Z                                 0xFFFBFFFF
+#define   S_028804_INTERPOLATE_SRC_Z(x)                               (((x) & 0x1) << 19)
+#define   G_028804_INTERPOLATE_SRC_Z(x)                               (((x) >> 19) & 0x1)
+#define   C_028804_INTERPOLATE_SRC_Z                                  0xFFF7FFFF
+#define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)                      (((x) & 0x1) << 20)
+#define   G_028804_STATIC_ANCHOR_ASSOCIATIONS(x)                      (((x) >> 20) & 0x1)
+#define   C_028804_STATIC_ANCHOR_ASSOCIATIONS                         0xFFEFFFFF
+#define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)                      (((x) & 0x1) << 21)
+#define   G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)                      (((x) >> 21) & 0x1)
+#define   C_028804_ALPHA_TO_MASK_EQAA_DISABLE                         0xFFDFFFFF
+#define   S_028804_OVERRASTERIZATION_AMOUNT(x)                        (((x) & 0x07) << 24)
+#define   G_028804_OVERRASTERIZATION_AMOUNT(x)                        (((x) >> 24) & 0x07)
+#define   C_028804_OVERRASTERIZATION_AMOUNT                           0xF8FFFFFF
+#define   S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)                  (((x) & 0x1) << 27)
+#define   G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)                  (((x) >> 27) & 0x1)
+#define   C_028804_ENABLE_POSTZ_OVERRASTERIZATION                     0xF7FFFFFF
 #define R_028808_CB_COLOR_CONTROL                                       0x028808
 #define   S_028808_DEGAMMA_ENABLE(x)                                  (((x) & 0x1) << 3)
 #define   G_028808_DEGAMMA_ENABLE(x)                                  (((x) >> 3) & 0x1)
@@ -6977,6 +7810,11 @@
 #define   S_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) & 0x1) << 25)
 #define   G_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) >> 25) & 0x1)
 #define   C_02881C_USE_VTX_GS_CUT_FLAG                                0xFDFFFFFF
+/* VI */
+#define   S_02881C_USE_VTX_LINE_WIDTH(x)                              (((x) & 0x1) << 26)
+#define   G_02881C_USE_VTX_LINE_WIDTH(x)                              (((x) >> 26) & 0x1)
+#define   C_02881C_USE_VTX_LINE_WIDTH                                 0xFBFFFFFF
+/*    */
 #define R_028820_PA_CL_NANINF_CNTL                                      0x028820
 #define   S_028820_VTE_XY_INF_DISCARD(x)                              (((x) & 0x1) << 0)
 #define   G_028820_VTE_XY_INF_DISCARD(x)                              (((x) >> 0) & 0x1)
@@ -7447,9 +8285,21 @@
 #define   S_028A4C_PS_ITER_SAMPLE(x)                                  (((x) & 0x1) << 16)
 #define   G_028A4C_PS_ITER_SAMPLE(x)                                  (((x) >> 16) & 0x1)
 #define   C_028A4C_PS_ITER_SAMPLE                                     0xFFFEFFFF
-#define   S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) & 0x1) << 17)
-#define   G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) >> 17) & 0x1)
-#define   C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC                      0xFFFDFFFF
+#define   S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x)         (((x) & 0x1) << 17)
+#define   G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x)         (((x) >> 17) & 0x1)
+#define   C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE            0xFFFDFFFF
+#define   S_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x)                      (((x) & 0x1) << 18)
+#define   G_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x)                      (((x) >> 18) & 0x1)
+#define   C_028A4C_MULTI_GPU_SUPERTILE_ENABLE                         0xFFFBFFFF
+#define   S_028A4C_GPU_ID_OVERRIDE_ENABLE(x)                          (((x) & 0x1) << 19)
+#define   G_028A4C_GPU_ID_OVERRIDE_ENABLE(x)                          (((x) >> 19) & 0x1)
+#define   C_028A4C_GPU_ID_OVERRIDE_ENABLE                             0xFFF7FFFF
+#define   S_028A4C_GPU_ID_OVERRIDE(x)                                 (((x) & 0x0F) << 20)
+#define   G_028A4C_GPU_ID_OVERRIDE(x)                                 (((x) >> 20) & 0x0F)
+#define   C_028A4C_GPU_ID_OVERRIDE                                    0xFF0FFFFF
+#define   S_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x)                   (((x) & 0x1) << 24)
+#define   G_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x)                   (((x) >> 24) & 0x1)
+#define   C_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE                      0xFEFFFFFF
 #define   S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) & 0x1) << 25)
 #define   G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) >> 25) & 0x1)
 #define   C_028A4C_FORCE_EOV_CNTDWN_ENABLE                            0xFDFFFFFF
@@ -7515,6 +8365,7 @@
 #define   C_028A7C_INDEX_TYPE                                         0xFFFFFFFC
 #define     V_028A7C_VGT_INDEX_16                                   0x00
 #define     V_028A7C_VGT_INDEX_32                                   0x01
+#define     V_028A7C_VGT_INDEX_8                                    0x02 /* VI */
 #define   S_028A7C_SWAP_MODE(x)                                       (((x) & 0x03) << 2)
 #define   G_028A7C_SWAP_MODE(x)                                       (((x) >> 2) & 0x03)
 #define   C_028A7C_SWAP_MODE                                          0xFFFFFFF3
@@ -7544,6 +8395,12 @@
 #define   G_028A7C_REQ_PATH(x)                                        (((x) >> 10) & 0x1)
 #define   C_028A7C_REQ_PATH                                           0xFFFFFBFF
 /*     */
+/* VI */
+#define   S_028A7C_MTYPE(x)                                           (((x) & 0x03) << 11)
+#define   G_028A7C_MTYPE(x)                                           (((x) >> 11) & 0x03)
+#define   C_028A7C_MTYPE                                              0xFFFFE7FF
+/*    */
+#define R_028A80_WD_ENHANCE                                             0x028A80
 #define R_028A84_VGT_PRIMITIVEID_EN                                     0x028A84
 #define   S_028A84_PRIMITIVEID_EN(x)                                  (((x) & 0x1) << 0)
 #define   G_028A84_PRIMITIVEID_EN(x)                                  (((x) >> 0) & 0x1)
@@ -7642,6 +8499,10 @@
 #define   S_028AA8_WD_SWITCH_ON_EOP(x)                                (((x) & 0x1) << 20)
 #define   G_028AA8_WD_SWITCH_ON_EOP(x)                                (((x) >> 20) & 0x1)
 #define   C_028AA8_WD_SWITCH_ON_EOP                                   0xFFEFFFFF
+/* VI */
+#define   S_028AA8_MAX_PRIMGRP_IN_WAVE(x)                             (((x) & 0x0F) << 28)
+#define   G_028AA8_MAX_PRIMGRP_IN_WAVE(x)                             (((x) >> 28) & 0x0F)
+#define   C_028AA8_MAX_PRIMGRP_IN_WAVE                                0x0FFFFFFF
 /*     */
 #define R_028AAC_VGT_ESGS_RING_ITEMSIZE                                 0x028AAC
 #define   S_028AAC_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
@@ -7681,6 +8542,11 @@
 #define   S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) & 0x1) << 16)
 #define   G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) >> 16) & 0x1)
 #define   C_028ABC_DST_OUTSIDE_ZERO_TO_ONE                            0xFFFEFFFF
+/* VI */
+#define   S_028ABC_TC_COMPATIBLE(x)                                   (((x) & 0x1) << 17)
+#define   G_028ABC_TC_COMPATIBLE(x)                                   (((x) >> 17) & 0x1)
+#define   C_028ABC_TC_COMPATIBLE                                      0xFFFDFFFF
+/*    */
 #define R_028AC0_DB_SRESULTS_COMPARE_STATE0                             0x028AC0
 #define   S_028AC0_COMPAREFUNC0(x)                                    (((x) & 0x07) << 0)
 #define   G_028AC0_COMPAREFUNC0(x)                                    (((x) >> 0) & 0x07)
@@ -7770,6 +8636,21 @@
 #define   S_028B38_MAX_VERT_OUT(x)                                    (((x) & 0x7FF) << 0)
 #define   G_028B38_MAX_VERT_OUT(x)                                    (((x) >> 0) & 0x7FF)
 #define   C_028B38_MAX_VERT_OUT                                       0xFFFFF800
+/* VI */
+#define R_028B50_VGT_TESS_DISTRIBUTION                                  0x028B50
+#define   S_028B50_ACCUM_ISOLINE(x)                                   (((x) & 0xFF) << 0)
+#define   G_028B50_ACCUM_ISOLINE(x)                                   (((x) >> 0) & 0xFF)
+#define   C_028B50_ACCUM_ISOLINE                                      0xFFFFFF00
+#define   S_028B50_ACCUM_TRI(x)                                       (((x) & 0xFF) << 8)
+#define   G_028B50_ACCUM_TRI(x)                                       (((x) >> 8) & 0xFF)
+#define   C_028B50_ACCUM_TRI                                          0xFFFF00FF
+#define   S_028B50_ACCUM_QUAD(x)                                      (((x) & 0xFF) << 16)
+#define   G_028B50_ACCUM_QUAD(x)                                      (((x) >> 16) & 0xFF)
+#define   C_028B50_ACCUM_QUAD                                         0xFF00FFFF
+#define   S_028B50_DONUT_SPLIT(x)                                     (((x) & 0xFF) << 24)
+#define   G_028B50_DONUT_SPLIT(x)                                     (((x) >> 24) & 0xFF)
+#define   C_028B50_DONUT_SPLIT                                        0x00FFFFFF
+/*    */
 #define R_028B54_VGT_SHADER_STAGES_EN                                   0x028B54
 #define   S_028B54_LS_EN(x)                                           (((x) & 0x03) << 0)
 #define   G_028B54_LS_EN(x)                                           (((x) >> 0) & 0x03)
@@ -7798,6 +8679,20 @@
 #define   S_028B54_DYNAMIC_HS(x)                                      (((x) & 0x1) << 8)
 #define   G_028B54_DYNAMIC_HS(x)                                      (((x) >> 8) & 0x1)
 #define   C_028B54_DYNAMIC_HS                                         0xFFFFFEFF
+/* VI */
+#define   S_028B54_DISPATCH_DRAW_EN(x)                                (((x) & 0x1) << 9)
+#define   G_028B54_DISPATCH_DRAW_EN(x)                                (((x) >> 9) & 0x1)
+#define   C_028B54_DISPATCH_DRAW_EN                                   0xFFFFFDFF
+#define   S_028B54_DIS_DEALLOC_ACCUM_0(x)                             (((x) & 0x1) << 10)
+#define   G_028B54_DIS_DEALLOC_ACCUM_0(x)                             (((x) >> 10) & 0x1)
+#define   C_028B54_DIS_DEALLOC_ACCUM_0                                0xFFFFFBFF
+#define   S_028B54_DIS_DEALLOC_ACCUM_1(x)                             (((x) & 0x1) << 11)
+#define   G_028B54_DIS_DEALLOC_ACCUM_1(x)                             (((x) >> 11) & 0x1)
+#define   C_028B54_DIS_DEALLOC_ACCUM_1                                0xFFFFF7FF
+#define   S_028B54_VS_WAVE_ID_EN(x)                                   (((x) & 0x1) << 12)
+#define   G_028B54_VS_WAVE_ID_EN(x)                                   (((x) >> 12) & 0x1)
+#define   C_028B54_VS_WAVE_ID_EN                                      0xFFFFEFFF
+/*    */
 #define R_028B58_VGT_LS_HS_CONFIG                                       0x028B58
 #define   S_028B58_NUM_PATCHES(x)                                     (((x) & 0xFF) << 0)
 #define   G_028B58_NUM_PATCHES(x)                                     (((x) >> 0) & 0xFF)
@@ -7848,6 +8743,9 @@
 #define   S_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) & 0x1) << 8) /* not on CIK */
 #define   G_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) >> 8) & 0x1) /* not on CIK */
 #define   C_028B6C_RESERVED_REDUC_AXIS                                0xFFFFFEFF /* not on CIK */
+#define   S_028B6C_DEPRECATED(x)                                      (((x) & 0x1) << 9)
+#define   G_028B6C_DEPRECATED(x)                                      (((x) >> 9) & 0x1)
+#define   C_028B6C_DEPRECATED                                         0xFFFFFDFF
 #define   S_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) & 0x0F) << 10)
 #define   G_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) >> 10) & 0x0F)
 #define   C_028B6C_NUM_DS_WAVES_PER_SIMD                              0xFFFFC3FF
@@ -7862,6 +8760,14 @@
 #define     V_028B6C_VGT_POLICY_STREAM                              0x01
 #define     V_028B6C_VGT_POLICY_BYPASS                              0x02
 /*     */
+/* VI */
+#define   S_028B6C_DISTRIBUTION_MODE(x)                               (((x) & 0x03) << 17)
+#define   G_028B6C_DISTRIBUTION_MODE(x)                               (((x) >> 17) & 0x03)
+#define   C_028B6C_DISTRIBUTION_MODE                                  0xFFF9FFFF
+#define   S_028B6C_MTYPE(x)                                           (((x) & 0x03) << 19)
+#define   G_028B6C_MTYPE(x)                                           (((x) >> 19) & 0x03)
+#define   C_028B6C_MTYPE                                              0xFFE7FFFF
+/*    */
 #define R_028B70_DB_ALPHA_TO_MASK                                       0x028B70
 #define   S_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) & 0x1) << 0)
 #define   G_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) >> 0) & 0x1)
@@ -8001,6 +8907,22 @@
 #define   S_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) & 0x1) << 12)
 #define   G_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) >> 12) & 0x1)
 #define   C_028BDC_DX10_DIAMOND_TEST_ENA                              0xFFFFEFFF
+#define R_028BE0_PA_SC_AA_CONFIG                                        0x028BE0
+#define   S_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) & 0x7) << 0)
+#define   G_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) >> 0) & 0x07)
+#define   C_028BE0_MSAA_NUM_SAMPLES                                   0xFFFFFFF8
+#define   S_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) & 0x1) << 4)
+#define   G_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) >> 4) & 0x1)
+#define   C_028BE0_AA_MASK_CENTROID_DTMN                              0xFFFFFFEF
+#define   S_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) & 0xf) << 13)
+#define   G_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) >> 13) & 0x0F)
+#define   C_028BE0_MAX_SAMPLE_DIST                                    0xFFFE1FFF
+#define   S_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) & 0x7) << 20)
+#define   G_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) >> 20) & 0x07)
+#define   C_028BE0_MSAA_EXPOSED_SAMPLES                               0xFF8FFFFF
+#define   S_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) & 0x3) << 24)
+#define   G_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) >> 24) & 0x03)
+#define   C_028BE0_DETAIL_TO_EXPOSED_MODE                             0xFCFFFFFF
 #define R_028BE4_PA_SU_VTX_CNTL                                         0x028BE4
 #define   S_028BE4_PIX_CENTER(x)                                      (((x) & 0x1) << 0)
 #define   G_028BE4_PIX_CENTER(x)                                      (((x) >> 0) & 0x1)
@@ -8569,6 +9491,17 @@
 #define   G_028C70_FMASK_COMPRESSION_DISABLE(x)                       (((x) >> 26) & 0x1)
 #define   C_028C70_FMASK_COMPRESSION_DISABLE                          0xFBFFFFFF
 /*     */
+/* VI */
+#define   S_028C70_FMASK_COMPRESS_1FRAG_ONLY(x)                       (((x) & 0x1) << 27)
+#define   G_028C70_FMASK_COMPRESS_1FRAG_ONLY(x)                       (((x) >> 27) & 0x1)
+#define   C_028C70_FMASK_COMPRESS_1FRAG_ONLY                          0xF7FFFFFF
+#define   S_028C70_DCC_ENABLE(x)                                      (((x) & 0x1) << 28)
+#define   G_028C70_DCC_ENABLE(x)                                      (((x) >> 28) & 0x1)
+#define   C_028C70_DCC_ENABLE                                         0xEFFFFFFF
+#define   S_028C70_CMASK_ADDR_TYPE(x)                                 (((x) & 0x03) << 29)
+#define   G_028C70_CMASK_ADDR_TYPE(x)                                 (((x) >> 29) & 0x03)
+#define   C_028C70_CMASK_ADDR_TYPE                                    0x9FFFFFFF
+/*    */
 #define R_028C74_CB_COLOR0_ATTRIB                                       0x028C74
 #define   S_028C74_TILE_MODE_INDEX(x)                                 (((x) & 0x1F) << 0)
 #define   G_028C74_TILE_MODE_INDEX(x)                                 (((x) >> 0) & 0x1F)
@@ -8576,7 +9509,9 @@
 #define   S_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) & 0x1F) << 5)
 #define   G_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) >> 5) & 0x1F)
 #define   C_028C74_FMASK_TILE_MODE_INDEX                              0xFFFFFC1F
-#define   S_028C74_FMASK_BANK_HEIGHT(x)				      (((x) & 0x3) << 10) /* SI errata */
+#define   S_028C74_FMASK_BANK_HEIGHT(x)                               (((x) & 0x03) << 10)
+#define   G_028C74_FMASK_BANK_HEIGHT(x)                               (((x) >> 10) & 0x03)
+#define   C_028C74_FMASK_BANK_HEIGHT                                  0xFFFFF3FF
 #define   S_028C74_NUM_SAMPLES(x)                                     (((x) & 0x07) << 12)
 #define   G_028C74_NUM_SAMPLES(x)                                     (((x) >> 12) & 0x07)
 #define   C_028C74_NUM_SAMPLES                                        0xFFFF8FFF
@@ -8586,6 +9521,36 @@
 #define   S_028C74_FORCE_DST_ALPHA_1(x)                               (((x) & 0x1) << 17)
 #define   G_028C74_FORCE_DST_ALPHA_1(x)                               (((x) >> 17) & 0x1)
 #define   C_028C74_FORCE_DST_ALPHA_1                                  0xFFFDFFFF
+/* VI */
+#define R_028C78_CB_COLOR0_DCC_CONTROL                                  0x028C78
+#define   S_028C78_OVERWRITE_COMBINER_DISABLE(x)                      (((x) & 0x1) << 0)
+#define   G_028C78_OVERWRITE_COMBINER_DISABLE(x)                      (((x) >> 0) & 0x1)
+#define   C_028C78_OVERWRITE_COMBINER_DISABLE                         0xFFFFFFFE
+#define   S_028C78_KEY_CLEAR_ENABLE(x)                                (((x) & 0x1) << 1)
+#define   G_028C78_KEY_CLEAR_ENABLE(x)                                (((x) >> 1) & 0x1)
+#define   C_028C78_KEY_CLEAR_ENABLE                                   0xFFFFFFFD
+#define   S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x)                     (((x) & 0x03) << 2)
+#define   G_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x)                     (((x) >> 2) & 0x03)
+#define   C_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE                        0xFFFFFFF3
+#define   S_028C78_MIN_COMPRESSED_BLOCK_SIZE(x)                       (((x) & 0x1) << 4)
+#define   G_028C78_MIN_COMPRESSED_BLOCK_SIZE(x)                       (((x) >> 4) & 0x1)
+#define   C_028C78_MIN_COMPRESSED_BLOCK_SIZE                          0xFFFFFFEF
+#define   S_028C78_MAX_COMPRESSED_BLOCK_SIZE(x)                       (((x) & 0x03) << 5)
+#define   G_028C78_MAX_COMPRESSED_BLOCK_SIZE(x)                       (((x) >> 5) & 0x03)
+#define   C_028C78_MAX_COMPRESSED_BLOCK_SIZE                          0xFFFFFF9F
+#define   S_028C78_COLOR_TRANSFORM(x)                                 (((x) & 0x03) << 7)
+#define   G_028C78_COLOR_TRANSFORM(x)                                 (((x) >> 7) & 0x03)
+#define   C_028C78_COLOR_TRANSFORM                                    0xFFFFFE7F
+#define   S_028C78_INDEPENDENT_64B_BLOCKS(x)                          (((x) & 0x1) << 9)
+#define   G_028C78_INDEPENDENT_64B_BLOCKS(x)                          (((x) >> 9) & 0x1)
+#define   C_028C78_INDEPENDENT_64B_BLOCKS                             0xFFFFFDFF
+#define   S_028C78_LOSSY_RGB_PRECISION(x)                             (((x) & 0x0F) << 10)
+#define   G_028C78_LOSSY_RGB_PRECISION(x)                             (((x) >> 10) & 0x0F)
+#define   C_028C78_LOSSY_RGB_PRECISION                                0xFFFFC3FF
+#define   S_028C78_LOSSY_ALPHA_PRECISION(x)                           (((x) & 0x0F) << 14)
+#define   G_028C78_LOSSY_ALPHA_PRECISION(x)                           (((x) >> 14) & 0x0F)
+#define   C_028C78_LOSSY_ALPHA_PRECISION                              0xFFFC3FFF
+/*    */
 #define R_028C7C_CB_COLOR0_CMASK                                        0x028C7C
 #define R_028C80_CB_COLOR0_CMASK_SLICE                                  0x028C80
 #define   S_028C80_TILE_MAX(x)                                        (((x) & 0x3FFF) << 0)
@@ -8598,90 +9563,105 @@
 #define   C_028C88_TILE_MAX                                           0xFFC00000
 #define R_028C8C_CB_COLOR0_CLEAR_WORD0                                  0x028C8C
 #define R_028C90_CB_COLOR0_CLEAR_WORD1                                  0x028C90
+#define R_028C94_CB_COLOR0_DCC_BASE                                     0x028C94 /* VI */
 #define R_028C9C_CB_COLOR1_BASE                                         0x028C9C
 #define R_028CA0_CB_COLOR1_PITCH                                        0x028CA0
 #define R_028CA4_CB_COLOR1_SLICE                                        0x028CA4
 #define R_028CA8_CB_COLOR1_VIEW                                         0x028CA8
 #define R_028CAC_CB_COLOR1_INFO                                         0x028CAC
 #define R_028CB0_CB_COLOR1_ATTRIB                                       0x028CB0
-#define R_028CD4_CB_COLOR1_CMASK                                        0x028CB8
+#define R_028CB4_CB_COLOR1_DCC_CONTROL                                  0x028CB4 /* VI */
+#define R_028CB8_CB_COLOR1_CMASK                                        0x028CB8
 #define R_028CBC_CB_COLOR1_CMASK_SLICE                                  0x028CBC
 #define R_028CC0_CB_COLOR1_FMASK                                        0x028CC0
 #define R_028CC4_CB_COLOR1_FMASK_SLICE                                  0x028CC4
 #define R_028CC8_CB_COLOR1_CLEAR_WORD0                                  0x028CC8
 #define R_028CCC_CB_COLOR1_CLEAR_WORD1                                  0x028CCC
+#define R_028CD0_CB_COLOR1_DCC_BASE                                     0x028CD0 /* VI */
 #define R_028CD8_CB_COLOR2_BASE                                         0x028CD8
 #define R_028CDC_CB_COLOR2_PITCH                                        0x028CDC
 #define R_028CE0_CB_COLOR2_SLICE                                        0x028CE0
 #define R_028CE4_CB_COLOR2_VIEW                                         0x028CE4
 #define R_028CE8_CB_COLOR2_INFO                                         0x028CE8
 #define R_028CEC_CB_COLOR2_ATTRIB                                       0x028CEC
+#define R_028CF0_CB_COLOR2_DCC_CONTROL                                  0x028CF0 /* VI */
 #define R_028CF4_CB_COLOR2_CMASK                                        0x028CF4
 #define R_028CF8_CB_COLOR2_CMASK_SLICE                                  0x028CF8
 #define R_028CFC_CB_COLOR2_FMASK                                        0x028CFC
 #define R_028D00_CB_COLOR2_FMASK_SLICE                                  0x028D00
 #define R_028D04_CB_COLOR2_CLEAR_WORD0                                  0x028D04
 #define R_028D08_CB_COLOR2_CLEAR_WORD1                                  0x028D08
+#define R_028D0C_CB_COLOR2_DCC_BASE                                     0x028D0C /* VI */
 #define R_028D14_CB_COLOR3_BASE                                         0x028D14
 #define R_028D18_CB_COLOR3_PITCH                                        0x028D18
 #define R_028D1C_CB_COLOR3_SLICE                                        0x028D1C
 #define R_028D20_CB_COLOR3_VIEW                                         0x028D20
 #define R_028D24_CB_COLOR3_INFO                                         0x028D24
 #define R_028D28_CB_COLOR3_ATTRIB                                       0x028D28
+#define R_028D2C_CB_COLOR3_DCC_CONTROL                                  0x028D2C /* VI */
 #define R_028D30_CB_COLOR3_CMASK                                        0x028D30
 #define R_028D34_CB_COLOR3_CMASK_SLICE                                  0x028D34
 #define R_028D38_CB_COLOR3_FMASK                                        0x028D38
 #define R_028D3C_CB_COLOR3_FMASK_SLICE                                  0x028D3C
 #define R_028D40_CB_COLOR3_CLEAR_WORD0                                  0x028D40
 #define R_028D44_CB_COLOR3_CLEAR_WORD1                                  0x028D44
+#define R_028D48_CB_COLOR3_DCC_BASE                                     0x028D48 /* VI */
 #define R_028D50_CB_COLOR4_BASE                                         0x028D50
 #define R_028D54_CB_COLOR4_PITCH                                        0x028D54
 #define R_028D58_CB_COLOR4_SLICE                                        0x028D58
 #define R_028D5C_CB_COLOR4_VIEW                                         0x028D5C
 #define R_028D60_CB_COLOR4_INFO                                         0x028D60
 #define R_028D64_CB_COLOR4_ATTRIB                                       0x028D64
+#define R_028D68_CB_COLOR4_DCC_CONTROL                                  0x028D68 /* VI */
 #define R_028D6C_CB_COLOR4_CMASK                                        0x028D6C
 #define R_028D70_CB_COLOR4_CMASK_SLICE                                  0x028D70
 #define R_028D74_CB_COLOR4_FMASK                                        0x028D74
 #define R_028D78_CB_COLOR4_FMASK_SLICE                                  0x028D78
 #define R_028D7C_CB_COLOR4_CLEAR_WORD0                                  0x028D7C
 #define R_028D80_CB_COLOR4_CLEAR_WORD1                                  0x028D80
+#define R_028D84_CB_COLOR4_DCC_BASE                                     0x028D84 /* VI */
 #define R_028D8C_CB_COLOR5_BASE                                         0x028D8C
 #define R_028D90_CB_COLOR5_PITCH                                        0x028D90
 #define R_028D94_CB_COLOR5_SLICE                                        0x028D94
 #define R_028D98_CB_COLOR5_VIEW                                         0x028D98
 #define R_028D9C_CB_COLOR5_INFO                                         0x028D9C
 #define R_028DA0_CB_COLOR5_ATTRIB                                       0x028DA0
+#define R_028DA4_CB_COLOR5_DCC_CONTROL                                  0x028DA4 /* VI */
 #define R_028DA8_CB_COLOR5_CMASK                                        0x028DA8
 #define R_028DAC_CB_COLOR5_CMASK_SLICE                                  0x028DAC
 #define R_028DB0_CB_COLOR5_FMASK                                        0x028DB0
 #define R_028DB4_CB_COLOR5_FMASK_SLICE                                  0x028DB4
 #define R_028DB8_CB_COLOR5_CLEAR_WORD0                                  0x028DB8
 #define R_028DBC_CB_COLOR5_CLEAR_WORD1                                  0x028DBC
+#define R_028DC0_CB_COLOR5_DCC_BASE                                     0x028DC0 /* VI */
 #define R_028DC8_CB_COLOR6_BASE                                         0x028DC8
 #define R_028DCC_CB_COLOR6_PITCH                                        0x028DCC
 #define R_028DD0_CB_COLOR6_SLICE                                        0x028DD0
 #define R_028DD4_CB_COLOR6_VIEW                                         0x028DD4
 #define R_028DD8_CB_COLOR6_INFO                                         0x028DD8
 #define R_028DDC_CB_COLOR6_ATTRIB                                       0x028DDC
+#define R_028DE0_CB_COLOR6_DCC_CONTROL                                  0x028DE0 /* VI */
 #define R_028DE4_CB_COLOR6_CMASK                                        0x028DE4
 #define R_028DE8_CB_COLOR6_CMASK_SLICE                                  0x028DE8
 #define R_028DEC_CB_COLOR6_FMASK                                        0x028DEC
 #define R_028DF0_CB_COLOR6_FMASK_SLICE                                  0x028DF0
 #define R_028DF4_CB_COLOR6_CLEAR_WORD0                                  0x028DF4
 #define R_028DF8_CB_COLOR6_CLEAR_WORD1                                  0x028DF8
+#define R_028DFC_CB_COLOR6_DCC_BASE                                     0x028DFC /* VI */
 #define R_028E04_CB_COLOR7_BASE                                         0x028E04
 #define R_028E08_CB_COLOR7_PITCH                                        0x028E08
 #define R_028E0C_CB_COLOR7_SLICE                                        0x028E0C
 #define R_028E10_CB_COLOR7_VIEW                                         0x028E10
 #define R_028E14_CB_COLOR7_INFO                                         0x028E14
 #define R_028E18_CB_COLOR7_ATTRIB                                       0x028E18
+#define R_028E1C_CB_COLOR7_DCC_CONTROL                                  0x028E1C /* VI */
 #define R_028E20_CB_COLOR7_CMASK                                        0x028E20
 #define R_028E24_CB_COLOR7_CMASK_SLICE                                  0x028E24
 #define R_028E28_CB_COLOR7_FMASK                                        0x028E28
 #define R_028E2C_CB_COLOR7_FMASK_SLICE                                  0x028E2C
 #define R_028E30_CB_COLOR7_CLEAR_WORD0                                  0x028E30
 #define R_028E34_CB_COLOR7_CLEAR_WORD1                                  0x028E34
+#define R_028E38_CB_COLOR7_DCC_BASE                                     0x028E38 /* VI */
 
 /* SI async DMA packets */
 #define SI_DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) |    \

From 943a4b5e963a3bbeb3a0a39d0123e359fdf3ec07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 11 Jul 2015 13:22:22 +0200
Subject: [PATCH 1170/1208] radeonsi: add definitions for VI status registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Useful for debugging hangs with the read-register interface.
I checked that this adds the same register fields as the kernel driver.

Acked-by: Michel Dänzer <michel.daenzer@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeonsi/sid.h | 1080 +++++++++++++++++++++++++++-
 1 file changed, 1079 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 921d5335554..66fdf35c8af 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -207,6 +207,324 @@
  */
 
 
+#define R_000E4C_SRBM_STATUS2                                           0x000E4C
+#define   S_000E4C_SDMA_RQ_PENDING(x)                                 (((x) & 0x1) << 0)
+#define   G_000E4C_SDMA_RQ_PENDING(x)                                 (((x) >> 0) & 0x1)
+#define   C_000E4C_SDMA_RQ_PENDING                                    0xFFFFFFFE
+#define   S_000E4C_TST_RQ_PENDING(x)                                  (((x) & 0x1) << 1)
+#define   G_000E4C_TST_RQ_PENDING(x)                                  (((x) >> 1) & 0x1)
+#define   C_000E4C_TST_RQ_PENDING                                     0xFFFFFFFD
+#define   S_000E4C_SDMA1_RQ_PENDING(x)                                (((x) & 0x1) << 2)
+#define   G_000E4C_SDMA1_RQ_PENDING(x)                                (((x) >> 2) & 0x1)
+#define   C_000E4C_SDMA1_RQ_PENDING                                   0xFFFFFFFB
+#define   S_000E4C_VCE0_RQ_PENDING(x)                                 (((x) & 0x1) << 3)
+#define   G_000E4C_VCE0_RQ_PENDING(x)                                 (((x) >> 3) & 0x1)
+#define   C_000E4C_VCE0_RQ_PENDING                                    0xFFFFFFF7
+#define   S_000E4C_VP8_BUSY(x)                                        (((x) & 0x1) << 4)
+#define   G_000E4C_VP8_BUSY(x)                                        (((x) >> 4) & 0x1)
+#define   C_000E4C_VP8_BUSY                                           0xFFFFFFEF
+#define   S_000E4C_SDMA_BUSY(x)                                       (((x) & 0x1) << 5)
+#define   G_000E4C_SDMA_BUSY(x)                                       (((x) >> 5) & 0x1)
+#define   C_000E4C_SDMA_BUSY                                          0xFFFFFFDF
+#define   S_000E4C_SDMA1_BUSY(x)                                      (((x) & 0x1) << 6)
+#define   G_000E4C_SDMA1_BUSY(x)                                      (((x) >> 6) & 0x1)
+#define   C_000E4C_SDMA1_BUSY                                         0xFFFFFFBF
+#define   S_000E4C_VCE0_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_000E4C_VCE0_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_000E4C_VCE0_BUSY                                          0xFFFFFF7F
+#define   S_000E4C_XDMA_BUSY(x)                                       (((x) & 0x1) << 8)
+#define   G_000E4C_XDMA_BUSY(x)                                       (((x) >> 8) & 0x1)
+#define   C_000E4C_XDMA_BUSY                                          0xFFFFFEFF
+#define   S_000E4C_CHUB_BUSY(x)                                       (((x) & 0x1) << 9)
+#define   G_000E4C_CHUB_BUSY(x)                                       (((x) >> 9) & 0x1)
+#define   C_000E4C_CHUB_BUSY                                          0xFFFFFDFF
+#define   S_000E4C_SDMA2_BUSY(x)                                      (((x) & 0x1) << 10)
+#define   G_000E4C_SDMA2_BUSY(x)                                      (((x) >> 10) & 0x1)
+#define   C_000E4C_SDMA2_BUSY                                         0xFFFFFBFF
+#define   S_000E4C_SDMA3_BUSY(x)                                      (((x) & 0x1) << 11)
+#define   G_000E4C_SDMA3_BUSY(x)                                      (((x) >> 11) & 0x1)
+#define   C_000E4C_SDMA3_BUSY                                         0xFFFFF7FF
+#define   S_000E4C_SAMSCP_BUSY(x)                                     (((x) & 0x1) << 12)
+#define   G_000E4C_SAMSCP_BUSY(x)                                     (((x) >> 12) & 0x1)
+#define   C_000E4C_SAMSCP_BUSY                                        0xFFFFEFFF
+#define   S_000E4C_ISP_BUSY(x)                                        (((x) & 0x1) << 13)
+#define   G_000E4C_ISP_BUSY(x)                                        (((x) >> 13) & 0x1)
+#define   C_000E4C_ISP_BUSY                                           0xFFFFDFFF
+#define   S_000E4C_VCE1_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_000E4C_VCE1_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_000E4C_VCE1_BUSY                                          0xFFFFBFFF
+#define   S_000E4C_ODE_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_000E4C_ODE_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_000E4C_ODE_BUSY                                           0xFFFF7FFF
+#define   S_000E4C_SDMA2_RQ_PENDING(x)                                (((x) & 0x1) << 16)
+#define   G_000E4C_SDMA2_RQ_PENDING(x)                                (((x) >> 16) & 0x1)
+#define   C_000E4C_SDMA2_RQ_PENDING                                   0xFFFEFFFF
+#define   S_000E4C_SDMA3_RQ_PENDING(x)                                (((x) & 0x1) << 17)
+#define   G_000E4C_SDMA3_RQ_PENDING(x)                                (((x) >> 17) & 0x1)
+#define   C_000E4C_SDMA3_RQ_PENDING                                   0xFFFDFFFF
+#define   S_000E4C_SAMSCP_RQ_PENDING(x)                               (((x) & 0x1) << 18)
+#define   G_000E4C_SAMSCP_RQ_PENDING(x)                               (((x) >> 18) & 0x1)
+#define   C_000E4C_SAMSCP_RQ_PENDING                                  0xFFFBFFFF
+#define   S_000E4C_ISP_RQ_PENDING(x)                                  (((x) & 0x1) << 19)
+#define   G_000E4C_ISP_RQ_PENDING(x)                                  (((x) >> 19) & 0x1)
+#define   C_000E4C_ISP_RQ_PENDING                                     0xFFF7FFFF
+#define   S_000E4C_VCE1_RQ_PENDING(x)                                 (((x) & 0x1) << 20)
+#define   G_000E4C_VCE1_RQ_PENDING(x)                                 (((x) >> 20) & 0x1)
+#define   C_000E4C_VCE1_RQ_PENDING                                    0xFFEFFFFF
+#define R_000E50_SRBM_STATUS                                            0x000E50
+#define   S_000E50_UVD_RQ_PENDING(x)                                  (((x) & 0x1) << 1)
+#define   G_000E50_UVD_RQ_PENDING(x)                                  (((x) >> 1) & 0x1)
+#define   C_000E50_UVD_RQ_PENDING                                     0xFFFFFFFD
+#define   S_000E50_SAMMSP_RQ_PENDING(x)                               (((x) & 0x1) << 2)
+#define   G_000E50_SAMMSP_RQ_PENDING(x)                               (((x) >> 2) & 0x1)
+#define   C_000E50_SAMMSP_RQ_PENDING                                  0xFFFFFFFB
+#define   S_000E50_ACP_RQ_PENDING(x)                                  (((x) & 0x1) << 3)
+#define   G_000E50_ACP_RQ_PENDING(x)                                  (((x) >> 3) & 0x1)
+#define   C_000E50_ACP_RQ_PENDING                                     0xFFFFFFF7
+#define   S_000E50_SMU_RQ_PENDING(x)                                  (((x) & 0x1) << 4)
+#define   G_000E50_SMU_RQ_PENDING(x)                                  (((x) >> 4) & 0x1)
+#define   C_000E50_SMU_RQ_PENDING                                     0xFFFFFFEF
+#define   S_000E50_GRBM_RQ_PENDING(x)                                 (((x) & 0x1) << 5)
+#define   G_000E50_GRBM_RQ_PENDING(x)                                 (((x) >> 5) & 0x1)
+#define   C_000E50_GRBM_RQ_PENDING                                    0xFFFFFFDF
+#define   S_000E50_HI_RQ_PENDING(x)                                   (((x) & 0x1) << 6)
+#define   G_000E50_HI_RQ_PENDING(x)                                   (((x) >> 6) & 0x1)
+#define   C_000E50_HI_RQ_PENDING                                      0xFFFFFFBF
+#define   S_000E50_VMC_BUSY(x)                                        (((x) & 0x1) << 8)
+#define   G_000E50_VMC_BUSY(x)                                        (((x) >> 8) & 0x1)
+#define   C_000E50_VMC_BUSY                                           0xFFFFFEFF
+#define   S_000E50_MCB_BUSY(x)                                        (((x) & 0x1) << 9)
+#define   G_000E50_MCB_BUSY(x)                                        (((x) >> 9) & 0x1)
+#define   C_000E50_MCB_BUSY                                           0xFFFFFDFF
+#define   S_000E50_MCB_NON_DISPLAY_BUSY(x)                            (((x) & 0x1) << 10)
+#define   G_000E50_MCB_NON_DISPLAY_BUSY(x)                            (((x) >> 10) & 0x1)
+#define   C_000E50_MCB_NON_DISPLAY_BUSY                               0xFFFFFBFF
+#define   S_000E50_MCC_BUSY(x)                                        (((x) & 0x1) << 11)
+#define   G_000E50_MCC_BUSY(x)                                        (((x) >> 11) & 0x1)
+#define   C_000E50_MCC_BUSY                                           0xFFFFF7FF
+#define   S_000E50_MCD_BUSY(x)                                        (((x) & 0x1) << 12)
+#define   G_000E50_MCD_BUSY(x)                                        (((x) >> 12) & 0x1)
+#define   C_000E50_MCD_BUSY                                           0xFFFFEFFF
+#define   S_000E50_VMC1_BUSY(x)                                       (((x) & 0x1) << 13)
+#define   G_000E50_VMC1_BUSY(x)                                       (((x) >> 13) & 0x1)
+#define   C_000E50_VMC1_BUSY                                          0xFFFFDFFF
+#define   S_000E50_SEM_BUSY(x)                                        (((x) & 0x1) << 14)
+#define   G_000E50_SEM_BUSY(x)                                        (((x) >> 14) & 0x1)
+#define   C_000E50_SEM_BUSY                                           0xFFFFBFFF
+#define   S_000E50_ACP_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_000E50_ACP_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_000E50_ACP_BUSY                                           0xFFFEFFFF
+#define   S_000E50_IH_BUSY(x)                                         (((x) & 0x1) << 17)
+#define   G_000E50_IH_BUSY(x)                                         (((x) >> 17) & 0x1)
+#define   C_000E50_IH_BUSY                                            0xFFFDFFFF
+#define   S_000E50_UVD_BUSY(x)                                        (((x) & 0x1) << 19)
+#define   G_000E50_UVD_BUSY(x)                                        (((x) >> 19) & 0x1)
+#define   C_000E50_UVD_BUSY                                           0xFFF7FFFF
+#define   S_000E50_SAMMSP_BUSY(x)                                     (((x) & 0x1) << 20)
+#define   G_000E50_SAMMSP_BUSY(x)                                     (((x) >> 20) & 0x1)
+#define   C_000E50_SAMMSP_BUSY                                        0xFFEFFFFF
+#define   S_000E50_GCATCL2_BUSY(x)                                    (((x) & 0x1) << 21)
+#define   G_000E50_GCATCL2_BUSY(x)                                    (((x) >> 21) & 0x1)
+#define   C_000E50_GCATCL2_BUSY                                       0xFFDFFFFF
+#define   S_000E50_OSATCL2_BUSY(x)                                    (((x) & 0x1) << 22)
+#define   G_000E50_OSATCL2_BUSY(x)                                    (((x) >> 22) & 0x1)
+#define   C_000E50_OSATCL2_BUSY                                       0xFFBFFFFF
+#define   S_000E50_BIF_BUSY(x)                                        (((x) & 0x1) << 29)
+#define   G_000E50_BIF_BUSY(x)                                        (((x) >> 29) & 0x1)
+#define   C_000E50_BIF_BUSY                                           0xDFFFFFFF
+#define R_000E54_SRBM_STATUS3                                           0x000E54
+#define   S_000E54_MCC0_BUSY(x)                                       (((x) & 0x1) << 0)
+#define   G_000E54_MCC0_BUSY(x)                                       (((x) >> 0) & 0x1)
+#define   C_000E54_MCC0_BUSY                                          0xFFFFFFFE
+#define   S_000E54_MCC1_BUSY(x)                                       (((x) & 0x1) << 1)
+#define   G_000E54_MCC1_BUSY(x)                                       (((x) >> 1) & 0x1)
+#define   C_000E54_MCC1_BUSY                                          0xFFFFFFFD
+#define   S_000E54_MCC2_BUSY(x)                                       (((x) & 0x1) << 2)
+#define   G_000E54_MCC2_BUSY(x)                                       (((x) >> 2) & 0x1)
+#define   C_000E54_MCC2_BUSY                                          0xFFFFFFFB
+#define   S_000E54_MCC3_BUSY(x)                                       (((x) & 0x1) << 3)
+#define   G_000E54_MCC3_BUSY(x)                                       (((x) >> 3) & 0x1)
+#define   C_000E54_MCC3_BUSY                                          0xFFFFFFF7
+#define   S_000E54_MCC4_BUSY(x)                                       (((x) & 0x1) << 4)
+#define   G_000E54_MCC4_BUSY(x)                                       (((x) >> 4) & 0x1)
+#define   C_000E54_MCC4_BUSY                                          0xFFFFFFEF
+#define   S_000E54_MCC5_BUSY(x)                                       (((x) & 0x1) << 5)
+#define   G_000E54_MCC5_BUSY(x)                                       (((x) >> 5) & 0x1)
+#define   C_000E54_MCC5_BUSY                                          0xFFFFFFDF
+#define   S_000E54_MCC6_BUSY(x)                                       (((x) & 0x1) << 6)
+#define   G_000E54_MCC6_BUSY(x)                                       (((x) >> 6) & 0x1)
+#define   C_000E54_MCC6_BUSY                                          0xFFFFFFBF
+#define   S_000E54_MCC7_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_000E54_MCC7_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_000E54_MCC7_BUSY                                          0xFFFFFF7F
+#define   S_000E54_MCD0_BUSY(x)                                       (((x) & 0x1) << 8)
+#define   G_000E54_MCD0_BUSY(x)                                       (((x) >> 8) & 0x1)
+#define   C_000E54_MCD0_BUSY                                          0xFFFFFEFF
+#define   S_000E54_MCD1_BUSY(x)                                       (((x) & 0x1) << 9)
+#define   G_000E54_MCD1_BUSY(x)                                       (((x) >> 9) & 0x1)
+#define   C_000E54_MCD1_BUSY                                          0xFFFFFDFF
+#define   S_000E54_MCD2_BUSY(x)                                       (((x) & 0x1) << 10)
+#define   G_000E54_MCD2_BUSY(x)                                       (((x) >> 10) & 0x1)
+#define   C_000E54_MCD2_BUSY                                          0xFFFFFBFF
+#define   S_000E54_MCD3_BUSY(x)                                       (((x) & 0x1) << 11)
+#define   G_000E54_MCD3_BUSY(x)                                       (((x) >> 11) & 0x1)
+#define   C_000E54_MCD3_BUSY                                          0xFFFFF7FF
+#define   S_000E54_MCD4_BUSY(x)                                       (((x) & 0x1) << 12)
+#define   G_000E54_MCD4_BUSY(x)                                       (((x) >> 12) & 0x1)
+#define   C_000E54_MCD4_BUSY                                          0xFFFFEFFF
+#define   S_000E54_MCD5_BUSY(x)                                       (((x) & 0x1) << 13)
+#define   G_000E54_MCD5_BUSY(x)                                       (((x) >> 13) & 0x1)
+#define   C_000E54_MCD5_BUSY                                          0xFFFFDFFF
+#define   S_000E54_MCD6_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_000E54_MCD6_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_000E54_MCD6_BUSY                                          0xFFFFBFFF
+#define   S_000E54_MCD7_BUSY(x)                                       (((x) & 0x1) << 15)
+#define   G_000E54_MCD7_BUSY(x)                                       (((x) >> 15) & 0x1)
+#define   C_000E54_MCD7_BUSY                                          0xFFFF7FFF
+#define R_00D034_SDMA0_STATUS_REG                                       0x00D034
+#define   S_00D034_IDLE(x)                                            (((x) & 0x1) << 0)
+#define   G_00D034_IDLE(x)                                            (((x) >> 0) & 0x1)
+#define   C_00D034_IDLE                                               0xFFFFFFFE
+#define   S_00D034_REG_IDLE(x)                                        (((x) & 0x1) << 1)
+#define   G_00D034_REG_IDLE(x)                                        (((x) >> 1) & 0x1)
+#define   C_00D034_REG_IDLE                                           0xFFFFFFFD
+#define   S_00D034_RB_EMPTY(x)                                        (((x) & 0x1) << 2)
+#define   G_00D034_RB_EMPTY(x)                                        (((x) >> 2) & 0x1)
+#define   C_00D034_RB_EMPTY                                           0xFFFFFFFB
+#define   S_00D034_RB_FULL(x)                                         (((x) & 0x1) << 3)
+#define   G_00D034_RB_FULL(x)                                         (((x) >> 3) & 0x1)
+#define   C_00D034_RB_FULL                                            0xFFFFFFF7
+#define   S_00D034_RB_CMD_IDLE(x)                                     (((x) & 0x1) << 4)
+#define   G_00D034_RB_CMD_IDLE(x)                                     (((x) >> 4) & 0x1)
+#define   C_00D034_RB_CMD_IDLE                                        0xFFFFFFEF
+#define   S_00D034_RB_CMD_FULL(x)                                     (((x) & 0x1) << 5)
+#define   G_00D034_RB_CMD_FULL(x)                                     (((x) >> 5) & 0x1)
+#define   C_00D034_RB_CMD_FULL                                        0xFFFFFFDF
+#define   S_00D034_IB_CMD_IDLE(x)                                     (((x) & 0x1) << 6)
+#define   G_00D034_IB_CMD_IDLE(x)                                     (((x) >> 6) & 0x1)
+#define   C_00D034_IB_CMD_IDLE                                        0xFFFFFFBF
+#define   S_00D034_IB_CMD_FULL(x)                                     (((x) & 0x1) << 7)
+#define   G_00D034_IB_CMD_FULL(x)                                     (((x) >> 7) & 0x1)
+#define   C_00D034_IB_CMD_FULL                                        0xFFFFFF7F
+#define   S_00D034_BLOCK_IDLE(x)                                      (((x) & 0x1) << 8)
+#define   G_00D034_BLOCK_IDLE(x)                                      (((x) >> 8) & 0x1)
+#define   C_00D034_BLOCK_IDLE                                         0xFFFFFEFF
+#define   S_00D034_INSIDE_IB(x)                                       (((x) & 0x1) << 9)
+#define   G_00D034_INSIDE_IB(x)                                       (((x) >> 9) & 0x1)
+#define   C_00D034_INSIDE_IB                                          0xFFFFFDFF
+#define   S_00D034_EX_IDLE(x)                                         (((x) & 0x1) << 10)
+#define   G_00D034_EX_IDLE(x)                                         (((x) >> 10) & 0x1)
+#define   C_00D034_EX_IDLE                                            0xFFFFFBFF
+#define   S_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x)                       (((x) & 0x1) << 11)
+#define   G_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x)                       (((x) >> 11) & 0x1)
+#define   C_00D034_EX_IDLE_POLL_TIMER_EXPIRE                          0xFFFFF7FF
+#define   S_00D034_PACKET_READY(x)                                    (((x) & 0x1) << 12)
+#define   G_00D034_PACKET_READY(x)                                    (((x) >> 12) & 0x1)
+#define   C_00D034_PACKET_READY                                       0xFFFFEFFF
+#define   S_00D034_MC_WR_IDLE(x)                                      (((x) & 0x1) << 13)
+#define   G_00D034_MC_WR_IDLE(x)                                      (((x) >> 13) & 0x1)
+#define   C_00D034_MC_WR_IDLE                                         0xFFFFDFFF
+#define   S_00D034_SRBM_IDLE(x)                                       (((x) & 0x1) << 14)
+#define   G_00D034_SRBM_IDLE(x)                                       (((x) >> 14) & 0x1)
+#define   C_00D034_SRBM_IDLE                                          0xFFFFBFFF
+#define   S_00D034_CONTEXT_EMPTY(x)                                   (((x) & 0x1) << 15)
+#define   G_00D034_CONTEXT_EMPTY(x)                                   (((x) >> 15) & 0x1)
+#define   C_00D034_CONTEXT_EMPTY                                      0xFFFF7FFF
+#define   S_00D034_DELTA_RPTR_FULL(x)                                 (((x) & 0x1) << 16)
+#define   G_00D034_DELTA_RPTR_FULL(x)                                 (((x) >> 16) & 0x1)
+#define   C_00D034_DELTA_RPTR_FULL                                    0xFFFEFFFF
+#define   S_00D034_RB_MC_RREQ_IDLE(x)                                 (((x) & 0x1) << 17)
+#define   G_00D034_RB_MC_RREQ_IDLE(x)                                 (((x) >> 17) & 0x1)
+#define   C_00D034_RB_MC_RREQ_IDLE                                    0xFFFDFFFF
+#define   S_00D034_IB_MC_RREQ_IDLE(x)                                 (((x) & 0x1) << 18)
+#define   G_00D034_IB_MC_RREQ_IDLE(x)                                 (((x) >> 18) & 0x1)
+#define   C_00D034_IB_MC_RREQ_IDLE                                    0xFFFBFFFF
+#define   S_00D034_MC_RD_IDLE(x)                                      (((x) & 0x1) << 19)
+#define   G_00D034_MC_RD_IDLE(x)                                      (((x) >> 19) & 0x1)
+#define   C_00D034_MC_RD_IDLE                                         0xFFF7FFFF
+#define   S_00D034_DELTA_RPTR_EMPTY(x)                                (((x) & 0x1) << 20)
+#define   G_00D034_DELTA_RPTR_EMPTY(x)                                (((x) >> 20) & 0x1)
+#define   C_00D034_DELTA_RPTR_EMPTY                                   0xFFEFFFFF
+#define   S_00D034_MC_RD_RET_STALL(x)                                 (((x) & 0x1) << 21)
+#define   G_00D034_MC_RD_RET_STALL(x)                                 (((x) >> 21) & 0x1)
+#define   C_00D034_MC_RD_RET_STALL                                    0xFFDFFFFF
+#define   S_00D034_MC_RD_NO_POLL_IDLE(x)                              (((x) & 0x1) << 22)
+#define   G_00D034_MC_RD_NO_POLL_IDLE(x)                              (((x) >> 22) & 0x1)
+#define   C_00D034_MC_RD_NO_POLL_IDLE                                 0xFFBFFFFF
+#define   S_00D034_PREV_CMD_IDLE(x)                                   (((x) & 0x1) << 25)
+#define   G_00D034_PREV_CMD_IDLE(x)                                   (((x) >> 25) & 0x1)
+#define   C_00D034_PREV_CMD_IDLE                                      0xFDFFFFFF
+#define   S_00D034_SEM_IDLE(x)                                        (((x) & 0x1) << 26)
+#define   G_00D034_SEM_IDLE(x)                                        (((x) >> 26) & 0x1)
+#define   C_00D034_SEM_IDLE                                           0xFBFFFFFF
+#define   S_00D034_SEM_REQ_STALL(x)                                   (((x) & 0x1) << 27)
+#define   G_00D034_SEM_REQ_STALL(x)                                   (((x) >> 27) & 0x1)
+#define   C_00D034_SEM_REQ_STALL                                      0xF7FFFFFF
+#define   S_00D034_SEM_RESP_STATE(x)                                  (((x) & 0x03) << 28)
+#define   G_00D034_SEM_RESP_STATE(x)                                  (((x) >> 28) & 0x03)
+#define   C_00D034_SEM_RESP_STATE                                     0xCFFFFFFF
+#define   S_00D034_INT_IDLE(x)                                        (((x) & 0x1) << 30)
+#define   G_00D034_INT_IDLE(x)                                        (((x) >> 30) & 0x1)
+#define   C_00D034_INT_IDLE                                           0xBFFFFFFF
+#define   S_00D034_INT_REQ_STALL(x)                                   (((x) & 0x1) << 31)
+#define   G_00D034_INT_REQ_STALL(x)                                   (((x) >> 31) & 0x1)
+#define   C_00D034_INT_REQ_STALL                                      0x7FFFFFFF
+#define R_00D834_SDMA1_STATUS_REG                                       0x00D834
+#define R_008008_GRBM_STATUS2                                           0x008008
+#define   S_008008_ME0PIPE1_CMDFIFO_AVAIL(x)                          (((x) & 0x0F) << 0)
+#define   G_008008_ME0PIPE1_CMDFIFO_AVAIL(x)                          (((x) >> 0) & 0x0F)
+#define   C_008008_ME0PIPE1_CMDFIFO_AVAIL                             0xFFFFFFF0
+#define   S_008008_ME0PIPE1_CF_RQ_PENDING(x)                          (((x) & 0x1) << 4)
+#define   G_008008_ME0PIPE1_CF_RQ_PENDING(x)                          (((x) >> 4) & 0x1)
+#define   C_008008_ME0PIPE1_CF_RQ_PENDING                             0xFFFFFFEF
+#define   S_008008_ME0PIPE1_PF_RQ_PENDING(x)                          (((x) & 0x1) << 5)
+#define   G_008008_ME0PIPE1_PF_RQ_PENDING(x)                          (((x) >> 5) & 0x1)
+#define   C_008008_ME0PIPE1_PF_RQ_PENDING                             0xFFFFFFDF
+#define   S_008008_ME1PIPE0_RQ_PENDING(x)                             (((x) & 0x1) << 6)
+#define   G_008008_ME1PIPE0_RQ_PENDING(x)                             (((x) >> 6) & 0x1)
+#define   C_008008_ME1PIPE0_RQ_PENDING                                0xFFFFFFBF
+#define   S_008008_ME1PIPE1_RQ_PENDING(x)                             (((x) & 0x1) << 7)
+#define   G_008008_ME1PIPE1_RQ_PENDING(x)                             (((x) >> 7) & 0x1)
+#define   C_008008_ME1PIPE1_RQ_PENDING                                0xFFFFFF7F
+#define   S_008008_ME1PIPE2_RQ_PENDING(x)                             (((x) & 0x1) << 8)
+#define   G_008008_ME1PIPE2_RQ_PENDING(x)                             (((x) >> 8) & 0x1)
+#define   C_008008_ME1PIPE2_RQ_PENDING                                0xFFFFFEFF
+#define   S_008008_ME1PIPE3_RQ_PENDING(x)                             (((x) & 0x1) << 9)
+#define   G_008008_ME1PIPE3_RQ_PENDING(x)                             (((x) >> 9) & 0x1)
+#define   C_008008_ME1PIPE3_RQ_PENDING                                0xFFFFFDFF
+#define   S_008008_ME2PIPE0_RQ_PENDING(x)                             (((x) & 0x1) << 10)
+#define   G_008008_ME2PIPE0_RQ_PENDING(x)                             (((x) >> 10) & 0x1)
+#define   C_008008_ME2PIPE0_RQ_PENDING                                0xFFFFFBFF
+#define   S_008008_ME2PIPE1_RQ_PENDING(x)                             (((x) & 0x1) << 11)
+#define   G_008008_ME2PIPE1_RQ_PENDING(x)                             (((x) >> 11) & 0x1)
+#define   C_008008_ME2PIPE1_RQ_PENDING                                0xFFFFF7FF
+#define   S_008008_ME2PIPE2_RQ_PENDING(x)                             (((x) & 0x1) << 12)
+#define   G_008008_ME2PIPE2_RQ_PENDING(x)                             (((x) >> 12) & 0x1)
+#define   C_008008_ME2PIPE2_RQ_PENDING                                0xFFFFEFFF
+#define   S_008008_ME2PIPE3_RQ_PENDING(x)                             (((x) & 0x1) << 13)
+#define   G_008008_ME2PIPE3_RQ_PENDING(x)                             (((x) >> 13) & 0x1)
+#define   C_008008_ME2PIPE3_RQ_PENDING                                0xFFFFDFFF
+#define   S_008008_RLC_RQ_PENDING(x)                                  (((x) & 0x1) << 14)
+#define   G_008008_RLC_RQ_PENDING(x)                                  (((x) >> 14) & 0x1)
+#define   C_008008_RLC_RQ_PENDING                                     0xFFFFBFFF
+#define   S_008008_RLC_BUSY(x)                                        (((x) & 0x1) << 24)
+#define   G_008008_RLC_BUSY(x)                                        (((x) >> 24) & 0x1)
+#define   C_008008_RLC_BUSY                                           0xFEFFFFFF
+#define   S_008008_TC_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008008_TC_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008008_TC_BUSY                                            0xFDFFFFFF
+#define   S_008008_TCC_CC_RESIDENT(x)                                 (((x) & 0x1) << 26)
+#define   G_008008_TCC_CC_RESIDENT(x)                                 (((x) >> 26) & 0x1)
+#define   C_008008_TCC_CC_RESIDENT                                    0xFBFFFFFF
+#define   S_008008_CPF_BUSY(x)                                        (((x) & 0x1) << 28)
+#define   G_008008_CPF_BUSY(x)                                        (((x) >> 28) & 0x1)
+#define   C_008008_CPF_BUSY                                           0xEFFFFFFF
+#define   S_008008_CPC_BUSY(x)                                        (((x) & 0x1) << 29)
+#define   G_008008_CPC_BUSY(x)                                        (((x) >> 29) & 0x1)
+#define   C_008008_CPC_BUSY                                           0xDFFFFFFF
+#define   S_008008_CPG_BUSY(x)                                        (((x) & 0x1) << 30)
+#define   G_008008_CPG_BUSY(x)                                        (((x) >> 30) & 0x1)
+#define   C_008008_CPG_BUSY                                           0xBFFFFFFF
 #define R_008010_GRBM_STATUS                                            0x008010
 #define   S_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) & 0x0F) << 0)
 #define   G_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) >> 0) & 0x0F)
@@ -350,7 +668,142 @@
 #define   C_0085F0_SH_ICACHE_ACTION_ENA                               0xDFFFFFFF
 #define R_0085F4_CP_COHER_SIZE                                          0x0085F4
 #define R_0085F8_CP_COHER_BASE                                          0x0085F8
-
+#define R_008014_GRBM_STATUS_SE0                                        0x008014
+#define   S_008014_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008014_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008014_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008014_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008014_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008014_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008014_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008014_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008014_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008014_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008014_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008014_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008014_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008014_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008014_PA_BUSY                                            0xFEFFFFFF
+#define   S_008014_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008014_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008014_TA_BUSY                                            0xFDFFFFFF
+#define   S_008014_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008014_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008014_SX_BUSY                                            0xFBFFFFFF
+#define   S_008014_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008014_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008014_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008014_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008014_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008014_SC_BUSY                                            0xDFFFFFFF
+#define   S_008014_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008014_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008014_DB_BUSY                                            0xBFFFFFFF
+#define   S_008014_CB_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_008014_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008014_CB_BUSY                                            0x7FFFFFFF
+#define R_008018_GRBM_STATUS_SE1                                        0x008018
+#define   S_008018_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008018_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008018_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008018_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008018_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008018_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008018_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008018_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008018_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008018_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008018_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008018_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008018_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008018_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008018_PA_BUSY                                            0xFEFFFFFF
+#define   S_008018_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008018_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008018_TA_BUSY                                            0xFDFFFFFF
+#define   S_008018_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008018_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008018_SX_BUSY                                            0xFBFFFFFF
+#define   S_008018_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008018_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008018_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008018_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008018_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008018_SC_BUSY                                            0xDFFFFFFF
+#define   S_008018_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008018_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008018_DB_BUSY                                            0xBFFFFFFF
+#define   S_008018_CB_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_008018_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008018_CB_BUSY                                            0x7FFFFFFF
+#define R_008038_GRBM_STATUS_SE2                                        0x008038
+#define   S_008038_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008038_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008038_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008038_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008038_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008038_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008038_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008038_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008038_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008038_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008038_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008038_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008038_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008038_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008038_PA_BUSY                                            0xFEFFFFFF
+#define   S_008038_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008038_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008038_TA_BUSY                                            0xFDFFFFFF
+#define   S_008038_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008038_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008038_SX_BUSY                                            0xFBFFFFFF
+#define   S_008038_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008038_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008038_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008038_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008038_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008038_SC_BUSY                                            0xDFFFFFFF
+#define   S_008038_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008038_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008038_DB_BUSY                                            0xBFFFFFFF
+#define   S_008038_CB_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_008038_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008038_CB_BUSY                                            0x7FFFFFFF
+#define R_00803C_GRBM_STATUS_SE3                                        0x00803C
+#define   S_00803C_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_00803C_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_00803C_DB_CLEAN                                           0xFFFFFFFD
+#define   S_00803C_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_00803C_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_00803C_CB_CLEAN                                           0xFFFFFFFB
+#define   S_00803C_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_00803C_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_00803C_BCI_BUSY                                           0xFFBFFFFF
+#define   S_00803C_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_00803C_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_00803C_VGT_BUSY                                           0xFF7FFFFF
+#define   S_00803C_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_00803C_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_00803C_PA_BUSY                                            0xFEFFFFFF
+#define   S_00803C_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_00803C_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_00803C_TA_BUSY                                            0xFDFFFFFF
+#define   S_00803C_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_00803C_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_00803C_SX_BUSY                                            0xFBFFFFFF
+#define   S_00803C_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_00803C_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_00803C_SPI_BUSY                                           0xF7FFFFFF
+#define   S_00803C_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_00803C_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_00803C_SC_BUSY                                            0xDFFFFFFF
+#define   S_00803C_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_00803C_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_00803C_DB_BUSY                                            0xBFFFFFFF
+#define   S_00803C_CB_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_00803C_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_00803C_CB_BUSY                                            0x7FFFFFFF
 /* CIK */
 #define R_0300FC_CP_STRMOUT_CNTL                                        0x0300FC
 #define   S_0300FC_OFFSET_UPDATE_DONE(x)                              (((x) & 0x1) << 0)
@@ -465,6 +918,366 @@
 #define   S_0301FC_STATUS(x)                                          (((x) & 0x1) << 31)
 #define   G_0301FC_STATUS(x)                                          (((x) >> 31) & 0x1)
 #define   C_0301FC_STATUS                                             0x7FFFFFFF
+#define R_008210_CP_CPC_STATUS                                          0x008210
+#define   S_008210_MEC1_BUSY(x)                                       (((x) & 0x1) << 0)
+#define   G_008210_MEC1_BUSY(x)                                       (((x) >> 0) & 0x1)
+#define   C_008210_MEC1_BUSY                                          0xFFFFFFFE
+#define   S_008210_MEC2_BUSY(x)                                       (((x) & 0x1) << 1)
+#define   G_008210_MEC2_BUSY(x)                                       (((x) >> 1) & 0x1)
+#define   C_008210_MEC2_BUSY                                          0xFFFFFFFD
+#define   S_008210_DC0_BUSY(x)                                        (((x) & 0x1) << 2)
+#define   G_008210_DC0_BUSY(x)                                        (((x) >> 2) & 0x1)
+#define   C_008210_DC0_BUSY                                           0xFFFFFFFB
+#define   S_008210_DC1_BUSY(x)                                        (((x) & 0x1) << 3)
+#define   G_008210_DC1_BUSY(x)                                        (((x) >> 3) & 0x1)
+#define   C_008210_DC1_BUSY                                           0xFFFFFFF7
+#define   S_008210_RCIU1_BUSY(x)                                      (((x) & 0x1) << 4)
+#define   G_008210_RCIU1_BUSY(x)                                      (((x) >> 4) & 0x1)
+#define   C_008210_RCIU1_BUSY                                         0xFFFFFFEF
+#define   S_008210_RCIU2_BUSY(x)                                      (((x) & 0x1) << 5)
+#define   G_008210_RCIU2_BUSY(x)                                      (((x) >> 5) & 0x1)
+#define   C_008210_RCIU2_BUSY                                         0xFFFFFFDF
+#define   S_008210_ROQ1_BUSY(x)                                       (((x) & 0x1) << 6)
+#define   G_008210_ROQ1_BUSY(x)                                       (((x) >> 6) & 0x1)
+#define   C_008210_ROQ1_BUSY                                          0xFFFFFFBF
+#define   S_008210_ROQ2_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_008210_ROQ2_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_008210_ROQ2_BUSY                                          0xFFFFFF7F
+#define   S_008210_TCIU_BUSY(x)                                       (((x) & 0x1) << 10)
+#define   G_008210_TCIU_BUSY(x)                                       (((x) >> 10) & 0x1)
+#define   C_008210_TCIU_BUSY                                          0xFFFFFBFF
+#define   S_008210_SCRATCH_RAM_BUSY(x)                                (((x) & 0x1) << 11)
+#define   G_008210_SCRATCH_RAM_BUSY(x)                                (((x) >> 11) & 0x1)
+#define   C_008210_SCRATCH_RAM_BUSY                                   0xFFFFF7FF
+#define   S_008210_QU_BUSY(x)                                         (((x) & 0x1) << 12)
+#define   G_008210_QU_BUSY(x)                                         (((x) >> 12) & 0x1)
+#define   C_008210_QU_BUSY                                            0xFFFFEFFF
+#define   S_008210_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 13)
+#define   G_008210_ATCL2IU_BUSY(x)                                    (((x) >> 13) & 0x1)
+#define   C_008210_ATCL2IU_BUSY                                       0xFFFFDFFF
+#define   S_008210_CPG_CPC_BUSY(x)                                    (((x) & 0x1) << 29)
+#define   G_008210_CPG_CPC_BUSY(x)                                    (((x) >> 29) & 0x1)
+#define   C_008210_CPG_CPC_BUSY                                       0xDFFFFFFF
+#define   S_008210_CPF_CPC_BUSY(x)                                    (((x) & 0x1) << 30)
+#define   G_008210_CPF_CPC_BUSY(x)                                    (((x) >> 30) & 0x1)
+#define   C_008210_CPF_CPC_BUSY                                       0xBFFFFFFF
+#define   S_008210_CPC_BUSY(x)                                        (((x) & 0x1) << 31)
+#define   G_008210_CPC_BUSY(x)                                        (((x) >> 31) & 0x1)
+#define   C_008210_CPC_BUSY                                           0x7FFFFFFF
+#define R_008214_CP_CPC_BUSY_STAT                                       0x008214
+#define   S_008214_MEC1_LOAD_BUSY(x)                                  (((x) & 0x1) << 0)
+#define   G_008214_MEC1_LOAD_BUSY(x)                                  (((x) >> 0) & 0x1)
+#define   C_008214_MEC1_LOAD_BUSY                                     0xFFFFFFFE
+#define   S_008214_MEC1_SEMAPOHRE_BUSY(x)                             (((x) & 0x1) << 1)
+#define   G_008214_MEC1_SEMAPOHRE_BUSY(x)                             (((x) >> 1) & 0x1)
+#define   C_008214_MEC1_SEMAPOHRE_BUSY                                0xFFFFFFFD
+#define   S_008214_MEC1_MUTEX_BUSY(x)                                 (((x) & 0x1) << 2)
+#define   G_008214_MEC1_MUTEX_BUSY(x)                                 (((x) >> 2) & 0x1)
+#define   C_008214_MEC1_MUTEX_BUSY                                    0xFFFFFFFB
+#define   S_008214_MEC1_MESSAGE_BUSY(x)                               (((x) & 0x1) << 3)
+#define   G_008214_MEC1_MESSAGE_BUSY(x)                               (((x) >> 3) & 0x1)
+#define   C_008214_MEC1_MESSAGE_BUSY                                  0xFFFFFFF7
+#define   S_008214_MEC1_EOP_QUEUE_BUSY(x)                             (((x) & 0x1) << 4)
+#define   G_008214_MEC1_EOP_QUEUE_BUSY(x)                             (((x) >> 4) & 0x1)
+#define   C_008214_MEC1_EOP_QUEUE_BUSY                                0xFFFFFFEF
+#define   S_008214_MEC1_IQ_QUEUE_BUSY(x)                              (((x) & 0x1) << 5)
+#define   G_008214_MEC1_IQ_QUEUE_BUSY(x)                              (((x) >> 5) & 0x1)
+#define   C_008214_MEC1_IQ_QUEUE_BUSY                                 0xFFFFFFDF
+#define   S_008214_MEC1_IB_QUEUE_BUSY(x)                              (((x) & 0x1) << 6)
+#define   G_008214_MEC1_IB_QUEUE_BUSY(x)                              (((x) >> 6) & 0x1)
+#define   C_008214_MEC1_IB_QUEUE_BUSY                                 0xFFFFFFBF
+#define   S_008214_MEC1_TC_BUSY(x)                                    (((x) & 0x1) << 7)
+#define   G_008214_MEC1_TC_BUSY(x)                                    (((x) >> 7) & 0x1)
+#define   C_008214_MEC1_TC_BUSY                                       0xFFFFFF7F
+#define   S_008214_MEC1_DMA_BUSY(x)                                   (((x) & 0x1) << 8)
+#define   G_008214_MEC1_DMA_BUSY(x)                                   (((x) >> 8) & 0x1)
+#define   C_008214_MEC1_DMA_BUSY                                      0xFFFFFEFF
+#define   S_008214_MEC1_PARTIAL_FLUSH_BUSY(x)                         (((x) & 0x1) << 9)
+#define   G_008214_MEC1_PARTIAL_FLUSH_BUSY(x)                         (((x) >> 9) & 0x1)
+#define   C_008214_MEC1_PARTIAL_FLUSH_BUSY                            0xFFFFFDFF
+#define   S_008214_MEC1_PIPE0_BUSY(x)                                 (((x) & 0x1) << 10)
+#define   G_008214_MEC1_PIPE0_BUSY(x)                                 (((x) >> 10) & 0x1)
+#define   C_008214_MEC1_PIPE0_BUSY                                    0xFFFFFBFF
+#define   S_008214_MEC1_PIPE1_BUSY(x)                                 (((x) & 0x1) << 11)
+#define   G_008214_MEC1_PIPE1_BUSY(x)                                 (((x) >> 11) & 0x1)
+#define   C_008214_MEC1_PIPE1_BUSY                                    0xFFFFF7FF
+#define   S_008214_MEC1_PIPE2_BUSY(x)                                 (((x) & 0x1) << 12)
+#define   G_008214_MEC1_PIPE2_BUSY(x)                                 (((x) >> 12) & 0x1)
+#define   C_008214_MEC1_PIPE2_BUSY                                    0xFFFFEFFF
+#define   S_008214_MEC1_PIPE3_BUSY(x)                                 (((x) & 0x1) << 13)
+#define   G_008214_MEC1_PIPE3_BUSY(x)                                 (((x) >> 13) & 0x1)
+#define   C_008214_MEC1_PIPE3_BUSY                                    0xFFFFDFFF
+#define   S_008214_MEC2_LOAD_BUSY(x)                                  (((x) & 0x1) << 16)
+#define   G_008214_MEC2_LOAD_BUSY(x)                                  (((x) >> 16) & 0x1)
+#define   C_008214_MEC2_LOAD_BUSY                                     0xFFFEFFFF
+#define   S_008214_MEC2_SEMAPOHRE_BUSY(x)                             (((x) & 0x1) << 17)
+#define   G_008214_MEC2_SEMAPOHRE_BUSY(x)                             (((x) >> 17) & 0x1)
+#define   C_008214_MEC2_SEMAPOHRE_BUSY                                0xFFFDFFFF
+#define   S_008214_MEC2_MUTEX_BUSY(x)                                 (((x) & 0x1) << 18)
+#define   G_008214_MEC2_MUTEX_BUSY(x)                                 (((x) >> 18) & 0x1)
+#define   C_008214_MEC2_MUTEX_BUSY                                    0xFFFBFFFF
+#define   S_008214_MEC2_MESSAGE_BUSY(x)                               (((x) & 0x1) << 19)
+#define   G_008214_MEC2_MESSAGE_BUSY(x)                               (((x) >> 19) & 0x1)
+#define   C_008214_MEC2_MESSAGE_BUSY                                  0xFFF7FFFF
+#define   S_008214_MEC2_EOP_QUEUE_BUSY(x)                             (((x) & 0x1) << 20)
+#define   G_008214_MEC2_EOP_QUEUE_BUSY(x)                             (((x) >> 20) & 0x1)
+#define   C_008214_MEC2_EOP_QUEUE_BUSY                                0xFFEFFFFF
+#define   S_008214_MEC2_IQ_QUEUE_BUSY(x)                              (((x) & 0x1) << 21)
+#define   G_008214_MEC2_IQ_QUEUE_BUSY(x)                              (((x) >> 21) & 0x1)
+#define   C_008214_MEC2_IQ_QUEUE_BUSY                                 0xFFDFFFFF
+#define   S_008214_MEC2_IB_QUEUE_BUSY(x)                              (((x) & 0x1) << 22)
+#define   G_008214_MEC2_IB_QUEUE_BUSY(x)                              (((x) >> 22) & 0x1)
+#define   C_008214_MEC2_IB_QUEUE_BUSY                                 0xFFBFFFFF
+#define   S_008214_MEC2_TC_BUSY(x)                                    (((x) & 0x1) << 23)
+#define   G_008214_MEC2_TC_BUSY(x)                                    (((x) >> 23) & 0x1)
+#define   C_008214_MEC2_TC_BUSY                                       0xFF7FFFFF
+#define   S_008214_MEC2_DMA_BUSY(x)                                   (((x) & 0x1) << 24)
+#define   G_008214_MEC2_DMA_BUSY(x)                                   (((x) >> 24) & 0x1)
+#define   C_008214_MEC2_DMA_BUSY                                      0xFEFFFFFF
+#define   S_008214_MEC2_PARTIAL_FLUSH_BUSY(x)                         (((x) & 0x1) << 25)
+#define   G_008214_MEC2_PARTIAL_FLUSH_BUSY(x)                         (((x) >> 25) & 0x1)
+#define   C_008214_MEC2_PARTIAL_FLUSH_BUSY                            0xFDFFFFFF
+#define   S_008214_MEC2_PIPE0_BUSY(x)                                 (((x) & 0x1) << 26)
+#define   G_008214_MEC2_PIPE0_BUSY(x)                                 (((x) >> 26) & 0x1)
+#define   C_008214_MEC2_PIPE0_BUSY                                    0xFBFFFFFF
+#define   S_008214_MEC2_PIPE1_BUSY(x)                                 (((x) & 0x1) << 27)
+#define   G_008214_MEC2_PIPE1_BUSY(x)                                 (((x) >> 27) & 0x1)
+#define   C_008214_MEC2_PIPE1_BUSY                                    0xF7FFFFFF
+#define   S_008214_MEC2_PIPE2_BUSY(x)                                 (((x) & 0x1) << 28)
+#define   G_008214_MEC2_PIPE2_BUSY(x)                                 (((x) >> 28) & 0x1)
+#define   C_008214_MEC2_PIPE2_BUSY                                    0xEFFFFFFF
+#define   S_008214_MEC2_PIPE3_BUSY(x)                                 (((x) & 0x1) << 29)
+#define   G_008214_MEC2_PIPE3_BUSY(x)                                 (((x) >> 29) & 0x1)
+#define   C_008214_MEC2_PIPE3_BUSY                                    0xDFFFFFFF
+#define R_008218_CP_CPC_STALLED_STAT1                                   0x008218
+#define   S_008218_RCIU_TX_FREE_STALL(x)                              (((x) & 0x1) << 3)
+#define   G_008218_RCIU_TX_FREE_STALL(x)                              (((x) >> 3) & 0x1)
+#define   C_008218_RCIU_TX_FREE_STALL                                 0xFFFFFFF7
+#define   S_008218_RCIU_PRIV_VIOLATION(x)                             (((x) & 0x1) << 4)
+#define   G_008218_RCIU_PRIV_VIOLATION(x)                             (((x) >> 4) & 0x1)
+#define   C_008218_RCIU_PRIV_VIOLATION                                0xFFFFFFEF
+#define   S_008218_TCIU_TX_FREE_STALL(x)                              (((x) & 0x1) << 6)
+#define   G_008218_TCIU_TX_FREE_STALL(x)                              (((x) >> 6) & 0x1)
+#define   C_008218_TCIU_TX_FREE_STALL                                 0xFFFFFFBF
+#define   S_008218_MEC1_DECODING_PACKET(x)                            (((x) & 0x1) << 8)
+#define   G_008218_MEC1_DECODING_PACKET(x)                            (((x) >> 8) & 0x1)
+#define   C_008218_MEC1_DECODING_PACKET                               0xFFFFFEFF
+#define   S_008218_MEC1_WAIT_ON_RCIU(x)                               (((x) & 0x1) << 9)
+#define   G_008218_MEC1_WAIT_ON_RCIU(x)                               (((x) >> 9) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_RCIU                                  0xFFFFFDFF
+#define   S_008218_MEC1_WAIT_ON_RCIU_READ(x)                          (((x) & 0x1) << 10)
+#define   G_008218_MEC1_WAIT_ON_RCIU_READ(x)                          (((x) >> 10) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_RCIU_READ                             0xFFFFFBFF
+#define   S_008218_MEC1_WAIT_ON_ROQ_DATA(x)                           (((x) & 0x1) << 13)
+#define   G_008218_MEC1_WAIT_ON_ROQ_DATA(x)                           (((x) >> 13) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_ROQ_DATA                              0xFFFFDFFF
+#define   S_008218_MEC2_DECODING_PACKET(x)                            (((x) & 0x1) << 16)
+#define   G_008218_MEC2_DECODING_PACKET(x)                            (((x) >> 16) & 0x1)
+#define   C_008218_MEC2_DECODING_PACKET                               0xFFFEFFFF
+#define   S_008218_MEC2_WAIT_ON_RCIU(x)                               (((x) & 0x1) << 17)
+#define   G_008218_MEC2_WAIT_ON_RCIU(x)                               (((x) >> 17) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_RCIU                                  0xFFFDFFFF
+#define   S_008218_MEC2_WAIT_ON_RCIU_READ(x)                          (((x) & 0x1) << 18)
+#define   G_008218_MEC2_WAIT_ON_RCIU_READ(x)                          (((x) >> 18) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_RCIU_READ                             0xFFFBFFFF
+#define   S_008218_MEC2_WAIT_ON_ROQ_DATA(x)                           (((x) & 0x1) << 21)
+#define   G_008218_MEC2_WAIT_ON_ROQ_DATA(x)                           (((x) >> 21) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_ROQ_DATA                              0xFFDFFFFF
+#define   S_008218_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 22)
+#define   G_008218_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 22) & 0x1)
+#define   C_008218_ATCL2IU_WAITING_ON_FREE                            0xFFBFFFFF
+#define   S_008218_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 23)
+#define   G_008218_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 23) & 0x1)
+#define   C_008218_ATCL2IU_WAITING_ON_TAGS                            0xFF7FFFFF
+#define   S_008218_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 24)
+#define   G_008218_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 24) & 0x1)
+#define   C_008218_ATCL1_WAITING_ON_TRANS                             0xFEFFFFFF
+#define R_00821C_CP_CPF_STATUS                                          0x00821C
+#define   S_00821C_POST_WPTR_GFX_BUSY(x)                              (((x) & 0x1) << 0)
+#define   G_00821C_POST_WPTR_GFX_BUSY(x)                              (((x) >> 0) & 0x1)
+#define   C_00821C_POST_WPTR_GFX_BUSY                                 0xFFFFFFFE
+#define   S_00821C_CSF_BUSY(x)                                        (((x) & 0x1) << 1)
+#define   G_00821C_CSF_BUSY(x)                                        (((x) >> 1) & 0x1)
+#define   C_00821C_CSF_BUSY                                           0xFFFFFFFD
+#define   S_00821C_ROQ_ALIGN_BUSY(x)                                  (((x) & 0x1) << 4)
+#define   G_00821C_ROQ_ALIGN_BUSY(x)                                  (((x) >> 4) & 0x1)
+#define   C_00821C_ROQ_ALIGN_BUSY                                     0xFFFFFFEF
+#define   S_00821C_ROQ_RING_BUSY(x)                                   (((x) & 0x1) << 5)
+#define   G_00821C_ROQ_RING_BUSY(x)                                   (((x) >> 5) & 0x1)
+#define   C_00821C_ROQ_RING_BUSY                                      0xFFFFFFDF
+#define   S_00821C_ROQ_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 6)
+#define   G_00821C_ROQ_INDIRECT1_BUSY(x)                              (((x) >> 6) & 0x1)
+#define   C_00821C_ROQ_INDIRECT1_BUSY                                 0xFFFFFFBF
+#define   S_00821C_ROQ_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 7)
+#define   G_00821C_ROQ_INDIRECT2_BUSY(x)                              (((x) >> 7) & 0x1)
+#define   C_00821C_ROQ_INDIRECT2_BUSY                                 0xFFFFFF7F
+#define   S_00821C_ROQ_STATE_BUSY(x)                                  (((x) & 0x1) << 8)
+#define   G_00821C_ROQ_STATE_BUSY(x)                                  (((x) >> 8) & 0x1)
+#define   C_00821C_ROQ_STATE_BUSY                                     0xFFFFFEFF
+#define   S_00821C_ROQ_CE_RING_BUSY(x)                                (((x) & 0x1) << 9)
+#define   G_00821C_ROQ_CE_RING_BUSY(x)                                (((x) >> 9) & 0x1)
+#define   C_00821C_ROQ_CE_RING_BUSY                                   0xFFFFFDFF
+#define   S_00821C_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) & 0x1) << 10)
+#define   G_00821C_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) >> 10) & 0x1)
+#define   C_00821C_ROQ_CE_INDIRECT1_BUSY                              0xFFFFFBFF
+#define   S_00821C_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) & 0x1) << 11)
+#define   G_00821C_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) >> 11) & 0x1)
+#define   C_00821C_ROQ_CE_INDIRECT2_BUSY                              0xFFFFF7FF
+#define   S_00821C_SEMAPHORE_BUSY(x)                                  (((x) & 0x1) << 12)
+#define   G_00821C_SEMAPHORE_BUSY(x)                                  (((x) >> 12) & 0x1)
+#define   C_00821C_SEMAPHORE_BUSY                                     0xFFFFEFFF
+#define   S_00821C_INTERRUPT_BUSY(x)                                  (((x) & 0x1) << 13)
+#define   G_00821C_INTERRUPT_BUSY(x)                                  (((x) >> 13) & 0x1)
+#define   C_00821C_INTERRUPT_BUSY                                     0xFFFFDFFF
+#define   S_00821C_TCIU_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_00821C_TCIU_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_00821C_TCIU_BUSY                                          0xFFFFBFFF
+#define   S_00821C_HQD_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_00821C_HQD_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_00821C_HQD_BUSY                                           0xFFFF7FFF
+#define   S_00821C_PRT_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_00821C_PRT_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_00821C_PRT_BUSY                                           0xFFFEFFFF
+#define   S_00821C_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 17)
+#define   G_00821C_ATCL2IU_BUSY(x)                                    (((x) >> 17) & 0x1)
+#define   C_00821C_ATCL2IU_BUSY                                       0xFFFDFFFF
+#define   S_00821C_CPF_GFX_BUSY(x)                                    (((x) & 0x1) << 26)
+#define   G_00821C_CPF_GFX_BUSY(x)                                    (((x) >> 26) & 0x1)
+#define   C_00821C_CPF_GFX_BUSY                                       0xFBFFFFFF
+#define   S_00821C_CPF_CMP_BUSY(x)                                    (((x) & 0x1) << 27)
+#define   G_00821C_CPF_CMP_BUSY(x)                                    (((x) >> 27) & 0x1)
+#define   C_00821C_CPF_CMP_BUSY                                       0xF7FFFFFF
+#define   S_00821C_GRBM_CPF_STAT_BUSY(x)                              (((x) & 0x03) << 28)
+#define   G_00821C_GRBM_CPF_STAT_BUSY(x)                              (((x) >> 28) & 0x03)
+#define   C_00821C_GRBM_CPF_STAT_BUSY                                 0xCFFFFFFF
+#define   S_00821C_CPC_CPF_BUSY(x)                                    (((x) & 0x1) << 30)
+#define   G_00821C_CPC_CPF_BUSY(x)                                    (((x) >> 30) & 0x1)
+#define   C_00821C_CPC_CPF_BUSY                                       0xBFFFFFFF
+#define   S_00821C_CPF_BUSY(x)                                        (((x) & 0x1) << 31)
+#define   G_00821C_CPF_BUSY(x)                                        (((x) >> 31) & 0x1)
+#define   C_00821C_CPF_BUSY                                           0x7FFFFFFF
+#define R_008220_CP_CPF_BUSY_STAT                                       0x008220
+#define   S_008220_REG_BUS_FIFO_BUSY(x)                               (((x) & 0x1) << 0)
+#define   G_008220_REG_BUS_FIFO_BUSY(x)                               (((x) >> 0) & 0x1)
+#define   C_008220_REG_BUS_FIFO_BUSY                                  0xFFFFFFFE
+#define   S_008220_CSF_RING_BUSY(x)                                   (((x) & 0x1) << 1)
+#define   G_008220_CSF_RING_BUSY(x)                                   (((x) >> 1) & 0x1)
+#define   C_008220_CSF_RING_BUSY                                      0xFFFFFFFD
+#define   S_008220_CSF_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 2)
+#define   G_008220_CSF_INDIRECT1_BUSY(x)                              (((x) >> 2) & 0x1)
+#define   C_008220_CSF_INDIRECT1_BUSY                                 0xFFFFFFFB
+#define   S_008220_CSF_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 3)
+#define   G_008220_CSF_INDIRECT2_BUSY(x)                              (((x) >> 3) & 0x1)
+#define   C_008220_CSF_INDIRECT2_BUSY                                 0xFFFFFFF7
+#define   S_008220_CSF_STATE_BUSY(x)                                  (((x) & 0x1) << 4)
+#define   G_008220_CSF_STATE_BUSY(x)                                  (((x) >> 4) & 0x1)
+#define   C_008220_CSF_STATE_BUSY                                     0xFFFFFFEF
+#define   S_008220_CSF_CE_INDR1_BUSY(x)                               (((x) & 0x1) << 5)
+#define   G_008220_CSF_CE_INDR1_BUSY(x)                               (((x) >> 5) & 0x1)
+#define   C_008220_CSF_CE_INDR1_BUSY                                  0xFFFFFFDF
+#define   S_008220_CSF_CE_INDR2_BUSY(x)                               (((x) & 0x1) << 6)
+#define   G_008220_CSF_CE_INDR2_BUSY(x)                               (((x) >> 6) & 0x1)
+#define   C_008220_CSF_CE_INDR2_BUSY                                  0xFFFFFFBF
+#define   S_008220_CSF_ARBITER_BUSY(x)                                (((x) & 0x1) << 7)
+#define   G_008220_CSF_ARBITER_BUSY(x)                                (((x) >> 7) & 0x1)
+#define   C_008220_CSF_ARBITER_BUSY                                   0xFFFFFF7F
+#define   S_008220_CSF_INPUT_BUSY(x)                                  (((x) & 0x1) << 8)
+#define   G_008220_CSF_INPUT_BUSY(x)                                  (((x) >> 8) & 0x1)
+#define   C_008220_CSF_INPUT_BUSY                                     0xFFFFFEFF
+#define   S_008220_OUTSTANDING_READ_TAGS(x)                           (((x) & 0x1) << 9)
+#define   G_008220_OUTSTANDING_READ_TAGS(x)                           (((x) >> 9) & 0x1)
+#define   C_008220_OUTSTANDING_READ_TAGS                              0xFFFFFDFF
+#define   S_008220_HPD_PROCESSING_EOP_BUSY(x)                         (((x) & 0x1) << 11)
+#define   G_008220_HPD_PROCESSING_EOP_BUSY(x)                         (((x) >> 11) & 0x1)
+#define   C_008220_HPD_PROCESSING_EOP_BUSY                            0xFFFFF7FF
+#define   S_008220_HQD_DISPATCH_BUSY(x)                               (((x) & 0x1) << 12)
+#define   G_008220_HQD_DISPATCH_BUSY(x)                               (((x) >> 12) & 0x1)
+#define   C_008220_HQD_DISPATCH_BUSY                                  0xFFFFEFFF
+#define   S_008220_HQD_IQ_TIMER_BUSY(x)                               (((x) & 0x1) << 13)
+#define   G_008220_HQD_IQ_TIMER_BUSY(x)                               (((x) >> 13) & 0x1)
+#define   C_008220_HQD_IQ_TIMER_BUSY                                  0xFFFFDFFF
+#define   S_008220_HQD_DMA_OFFLOAD_BUSY(x)                            (((x) & 0x1) << 14)
+#define   G_008220_HQD_DMA_OFFLOAD_BUSY(x)                            (((x) >> 14) & 0x1)
+#define   C_008220_HQD_DMA_OFFLOAD_BUSY                               0xFFFFBFFF
+#define   S_008220_HQD_WAIT_SEMAPHORE_BUSY(x)                         (((x) & 0x1) << 15)
+#define   G_008220_HQD_WAIT_SEMAPHORE_BUSY(x)                         (((x) >> 15) & 0x1)
+#define   C_008220_HQD_WAIT_SEMAPHORE_BUSY                            0xFFFF7FFF
+#define   S_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x)                       (((x) & 0x1) << 16)
+#define   G_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x)                       (((x) >> 16) & 0x1)
+#define   C_008220_HQD_SIGNAL_SEMAPHORE_BUSY                          0xFFFEFFFF
+#define   S_008220_HQD_MESSAGE_BUSY(x)                                (((x) & 0x1) << 17)
+#define   G_008220_HQD_MESSAGE_BUSY(x)                                (((x) >> 17) & 0x1)
+#define   C_008220_HQD_MESSAGE_BUSY                                   0xFFFDFFFF
+#define   S_008220_HQD_PQ_FETCHER_BUSY(x)                             (((x) & 0x1) << 18)
+#define   G_008220_HQD_PQ_FETCHER_BUSY(x)                             (((x) >> 18) & 0x1)
+#define   C_008220_HQD_PQ_FETCHER_BUSY                                0xFFFBFFFF
+#define   S_008220_HQD_IB_FETCHER_BUSY(x)                             (((x) & 0x1) << 19)
+#define   G_008220_HQD_IB_FETCHER_BUSY(x)                             (((x) >> 19) & 0x1)
+#define   C_008220_HQD_IB_FETCHER_BUSY                                0xFFF7FFFF
+#define   S_008220_HQD_IQ_FETCHER_BUSY(x)                             (((x) & 0x1) << 20)
+#define   G_008220_HQD_IQ_FETCHER_BUSY(x)                             (((x) >> 20) & 0x1)
+#define   C_008220_HQD_IQ_FETCHER_BUSY                                0xFFEFFFFF
+#define   S_008220_HQD_EOP_FETCHER_BUSY(x)                            (((x) & 0x1) << 21)
+#define   G_008220_HQD_EOP_FETCHER_BUSY(x)                            (((x) >> 21) & 0x1)
+#define   C_008220_HQD_EOP_FETCHER_BUSY                               0xFFDFFFFF
+#define   S_008220_HQD_CONSUMED_RPTR_BUSY(x)                          (((x) & 0x1) << 22)
+#define   G_008220_HQD_CONSUMED_RPTR_BUSY(x)                          (((x) >> 22) & 0x1)
+#define   C_008220_HQD_CONSUMED_RPTR_BUSY                             0xFFBFFFFF
+#define   S_008220_HQD_FETCHER_ARB_BUSY(x)                            (((x) & 0x1) << 23)
+#define   G_008220_HQD_FETCHER_ARB_BUSY(x)                            (((x) >> 23) & 0x1)
+#define   C_008220_HQD_FETCHER_ARB_BUSY                               0xFF7FFFFF
+#define   S_008220_HQD_ROQ_ALIGN_BUSY(x)                              (((x) & 0x1) << 24)
+#define   G_008220_HQD_ROQ_ALIGN_BUSY(x)                              (((x) >> 24) & 0x1)
+#define   C_008220_HQD_ROQ_ALIGN_BUSY                                 0xFEFFFFFF
+#define   S_008220_HQD_ROQ_EOP_BUSY(x)                                (((x) & 0x1) << 25)
+#define   G_008220_HQD_ROQ_EOP_BUSY(x)                                (((x) >> 25) & 0x1)
+#define   C_008220_HQD_ROQ_EOP_BUSY                                   0xFDFFFFFF
+#define   S_008220_HQD_ROQ_IQ_BUSY(x)                                 (((x) & 0x1) << 26)
+#define   G_008220_HQD_ROQ_IQ_BUSY(x)                                 (((x) >> 26) & 0x1)
+#define   C_008220_HQD_ROQ_IQ_BUSY                                    0xFBFFFFFF
+#define   S_008220_HQD_ROQ_PQ_BUSY(x)                                 (((x) & 0x1) << 27)
+#define   G_008220_HQD_ROQ_PQ_BUSY(x)                                 (((x) >> 27) & 0x1)
+#define   C_008220_HQD_ROQ_PQ_BUSY                                    0xF7FFFFFF
+#define   S_008220_HQD_ROQ_IB_BUSY(x)                                 (((x) & 0x1) << 28)
+#define   G_008220_HQD_ROQ_IB_BUSY(x)                                 (((x) >> 28) & 0x1)
+#define   C_008220_HQD_ROQ_IB_BUSY                                    0xEFFFFFFF
+#define   S_008220_HQD_WPTR_POLL_BUSY(x)                              (((x) & 0x1) << 29)
+#define   G_008220_HQD_WPTR_POLL_BUSY(x)                              (((x) >> 29) & 0x1)
+#define   C_008220_HQD_WPTR_POLL_BUSY                                 0xDFFFFFFF
+#define   S_008220_HQD_PQ_BUSY(x)                                     (((x) & 0x1) << 30)
+#define   G_008220_HQD_PQ_BUSY(x)                                     (((x) >> 30) & 0x1)
+#define   C_008220_HQD_PQ_BUSY                                        0xBFFFFFFF
+#define   S_008220_HQD_IB_BUSY(x)                                     (((x) & 0x1) << 31)
+#define   G_008220_HQD_IB_BUSY(x)                                     (((x) >> 31) & 0x1)
+#define   C_008220_HQD_IB_BUSY                                        0x7FFFFFFF
+#define R_008224_CP_CPF_STALLED_STAT1                                   0x008224
+#define   S_008224_RING_FETCHING_DATA(x)                              (((x) & 0x1) << 0)
+#define   G_008224_RING_FETCHING_DATA(x)                              (((x) >> 0) & 0x1)
+#define   C_008224_RING_FETCHING_DATA                                 0xFFFFFFFE
+#define   S_008224_INDR1_FETCHING_DATA(x)                             (((x) & 0x1) << 1)
+#define   G_008224_INDR1_FETCHING_DATA(x)                             (((x) >> 1) & 0x1)
+#define   C_008224_INDR1_FETCHING_DATA                                0xFFFFFFFD
+#define   S_008224_INDR2_FETCHING_DATA(x)                             (((x) & 0x1) << 2)
+#define   G_008224_INDR2_FETCHING_DATA(x)                             (((x) >> 2) & 0x1)
+#define   C_008224_INDR2_FETCHING_DATA                                0xFFFFFFFB
+#define   S_008224_STATE_FETCHING_DATA(x)                             (((x) & 0x1) << 3)
+#define   G_008224_STATE_FETCHING_DATA(x)                             (((x) >> 3) & 0x1)
+#define   C_008224_STATE_FETCHING_DATA                                0xFFFFFFF7
+#define   S_008224_TCIU_WAITING_ON_FREE(x)                            (((x) & 0x1) << 5)
+#define   G_008224_TCIU_WAITING_ON_FREE(x)                            (((x) >> 5) & 0x1)
+#define   C_008224_TCIU_WAITING_ON_FREE                               0xFFFFFFDF
+#define   S_008224_TCIU_WAITING_ON_TAGS(x)                            (((x) & 0x1) << 6)
+#define   G_008224_TCIU_WAITING_ON_TAGS(x)                            (((x) >> 6) & 0x1)
+#define   C_008224_TCIU_WAITING_ON_TAGS                               0xFFFFFFBF
+#define   S_008224_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 7)
+#define   G_008224_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 7) & 0x1)
+#define   C_008224_ATCL2IU_WAITING_ON_FREE                            0xFFFFFF7F
+#define   S_008224_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 8)
+#define   G_008224_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 8) & 0x1)
+#define   C_008224_ATCL2IU_WAITING_ON_TAGS                            0xFFFFFEFF
+#define   S_008224_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 9)
+#define   G_008224_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 9) & 0x1)
+#define   C_008224_ATCL1_WAITING_ON_TRANS                             0xFFFFFDFF
 #define R_030230_CP_COHER_SIZE_HI                                       0x030230
 #define   S_030230_COHER_SIZE_HI_256B(x)                              (((x) & 0xFF) << 0)
 #define   G_030230_COHER_SIZE_HI_256B(x)                              (((x) >> 0) & 0xFF)
@@ -568,6 +1381,271 @@
 #define   S_008B10_CURRENT_COUNT(x)                                   (((x) & 0xFF) << 8)
 #define   G_008B10_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
 #define   C_008B10_CURRENT_COUNT                                      0xFFFF00FF
+#define R_008670_CP_STALLED_STAT3                                       0x008670
+#define   S_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 0)
+#define   G_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x)                        (((x) >> 0) & 0x1)
+#define   C_008670_CE_TO_CSF_NOT_RDY_TO_RCV                           0xFFFFFFFE
+#define   S_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x)           (((x) & 0x1) << 1)
+#define   G_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x)           (((x) >> 1) & 0x1)
+#define   C_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV              0xFFFFFFFD
+#define   S_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x)        (((x) & 0x1) << 2)
+#define   G_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x)        (((x) >> 2) & 0x1)
+#define   C_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER           0xFFFFFFFB
+#define   S_008670_CE_TO_RAM_INIT_NOT_RDY(x)                          (((x) & 0x1) << 3)
+#define   G_008670_CE_TO_RAM_INIT_NOT_RDY(x)                          (((x) >> 3) & 0x1)
+#define   C_008670_CE_TO_RAM_INIT_NOT_RDY                             0xFFFFFFF7
+#define   S_008670_CE_TO_RAM_DUMP_NOT_RDY(x)                          (((x) & 0x1) << 4)
+#define   G_008670_CE_TO_RAM_DUMP_NOT_RDY(x)                          (((x) >> 4) & 0x1)
+#define   C_008670_CE_TO_RAM_DUMP_NOT_RDY                             0xFFFFFFEF
+#define   S_008670_CE_TO_RAM_WRITE_NOT_RDY(x)                         (((x) & 0x1) << 5)
+#define   G_008670_CE_TO_RAM_WRITE_NOT_RDY(x)                         (((x) >> 5) & 0x1)
+#define   C_008670_CE_TO_RAM_WRITE_NOT_RDY                            0xFFFFFFDF
+#define   S_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x)                   (((x) & 0x1) << 6)
+#define   G_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x)                   (((x) >> 6) & 0x1)
+#define   C_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV                      0xFFFFFFBF
+#define   S_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x)                    (((x) & 0x1) << 7)
+#define   G_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x)                    (((x) >> 7) & 0x1)
+#define   C_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV                       0xFFFFFF7F
+#define   S_008670_CE_WAITING_ON_BUFFER_DATA(x)                       (((x) & 0x1) << 10)
+#define   G_008670_CE_WAITING_ON_BUFFER_DATA(x)                       (((x) >> 10) & 0x1)
+#define   C_008670_CE_WAITING_ON_BUFFER_DATA                          0xFFFFFBFF
+#define   S_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x)                    (((x) & 0x1) << 11)
+#define   G_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x)                    (((x) >> 11) & 0x1)
+#define   C_008670_CE_WAITING_ON_CE_BUFFER_FLAG                       0xFFFFF7FF
+#define   S_008670_CE_WAITING_ON_DE_COUNTER(x)                        (((x) & 0x1) << 12)
+#define   G_008670_CE_WAITING_ON_DE_COUNTER(x)                        (((x) >> 12) & 0x1)
+#define   C_008670_CE_WAITING_ON_DE_COUNTER                           0xFFFFEFFF
+#define   S_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x)              (((x) & 0x1) << 13)
+#define   G_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x)              (((x) >> 13) & 0x1)
+#define   C_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW                 0xFFFFDFFF
+#define   S_008670_TCIU_WAITING_ON_FREE(x)                            (((x) & 0x1) << 14)
+#define   G_008670_TCIU_WAITING_ON_FREE(x)                            (((x) >> 14) & 0x1)
+#define   C_008670_TCIU_WAITING_ON_FREE                               0xFFFFBFFF
+#define   S_008670_TCIU_WAITING_ON_TAGS(x)                            (((x) & 0x1) << 15)
+#define   G_008670_TCIU_WAITING_ON_TAGS(x)                            (((x) >> 15) & 0x1)
+#define   C_008670_TCIU_WAITING_ON_TAGS                               0xFFFF7FFF
+#define   S_008670_CE_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) & 0x1) << 16)
+#define   G_008670_CE_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) >> 16) & 0x1)
+#define   C_008670_CE_STALLED_ON_TC_WR_CONFIRM                        0xFFFEFFFF
+#define   S_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) & 0x1) << 17)
+#define   G_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) >> 17) & 0x1)
+#define   C_008670_CE_STALLED_ON_ATOMIC_RTN_DATA                      0xFFFDFFFF
+#define   S_008670_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 18)
+#define   G_008670_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 18) & 0x1)
+#define   C_008670_ATCL2IU_WAITING_ON_FREE                            0xFFFBFFFF
+#define   S_008670_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 19)
+#define   G_008670_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 19) & 0x1)
+#define   C_008670_ATCL2IU_WAITING_ON_TAGS                            0xFFF7FFFF
+#define   S_008670_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 20)
+#define   G_008670_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 20) & 0x1)
+#define   C_008670_ATCL1_WAITING_ON_TRANS                             0xFFEFFFFF
+#define R_008674_CP_STALLED_STAT1                                       0x008674
+#define   S_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 0)
+#define   G_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x)                      (((x) >> 0) & 0x1)
+#define   C_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV                         0xFFFFFFFE
+#define   S_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 2)
+#define   G_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x)                      (((x) >> 2) & 0x1)
+#define   C_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV                         0xFFFFFFFB
+#define   S_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x)                    (((x) & 0x1) << 4)
+#define   G_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x)                    (((x) >> 4) & 0x1)
+#define   C_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV                       0xFFFFFFEF
+#define   S_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x)                    (((x) & 0x1) << 10)
+#define   G_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x)                    (((x) >> 10) & 0x1)
+#define   C_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG                       0xFFFFFBFF
+#define   S_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x)                    (((x) & 0x1) << 11)
+#define   G_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x)                    (((x) >> 11) & 0x1)
+#define   C_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG                       0xFFFFF7FF
+#define   S_008674_ME_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) & 0x1) << 12)
+#define   G_008674_ME_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) >> 12) & 0x1)
+#define   C_008674_ME_STALLED_ON_TC_WR_CONFIRM                        0xFFFFEFFF
+#define   S_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) & 0x1) << 13)
+#define   G_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) >> 13) & 0x1)
+#define   C_008674_ME_STALLED_ON_ATOMIC_RTN_DATA                      0xFFFFDFFF
+#define   S_008674_ME_WAITING_ON_TC_READ_DATA(x)                      (((x) & 0x1) << 14)
+#define   G_008674_ME_WAITING_ON_TC_READ_DATA(x)                      (((x) >> 14) & 0x1)
+#define   C_008674_ME_WAITING_ON_TC_READ_DATA                         0xFFFFBFFF
+#define   S_008674_ME_WAITING_ON_REG_READ_DATA(x)                     (((x) & 0x1) << 15)
+#define   G_008674_ME_WAITING_ON_REG_READ_DATA(x)                     (((x) >> 15) & 0x1)
+#define   C_008674_ME_WAITING_ON_REG_READ_DATA                        0xFFFF7FFF
+#define   S_008674_RCIU_WAITING_ON_GDS_FREE(x)                        (((x) & 0x1) << 23)
+#define   G_008674_RCIU_WAITING_ON_GDS_FREE(x)                        (((x) >> 23) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_GDS_FREE                           0xFF7FFFFF
+#define   S_008674_RCIU_WAITING_ON_GRBM_FREE(x)                       (((x) & 0x1) << 24)
+#define   G_008674_RCIU_WAITING_ON_GRBM_FREE(x)                       (((x) >> 24) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_GRBM_FREE                          0xFEFFFFFF
+#define   S_008674_RCIU_WAITING_ON_VGT_FREE(x)                        (((x) & 0x1) << 25)
+#define   G_008674_RCIU_WAITING_ON_VGT_FREE(x)                        (((x) >> 25) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_VGT_FREE                           0xFDFFFFFF
+#define   S_008674_RCIU_STALLED_ON_ME_READ(x)                         (((x) & 0x1) << 26)
+#define   G_008674_RCIU_STALLED_ON_ME_READ(x)                         (((x) >> 26) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_ME_READ                            0xFBFFFFFF
+#define   S_008674_RCIU_STALLED_ON_DMA_READ(x)                        (((x) & 0x1) << 27)
+#define   G_008674_RCIU_STALLED_ON_DMA_READ(x)                        (((x) >> 27) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_DMA_READ                           0xF7FFFFFF
+#define   S_008674_RCIU_STALLED_ON_APPEND_READ(x)                     (((x) & 0x1) << 28)
+#define   G_008674_RCIU_STALLED_ON_APPEND_READ(x)                     (((x) >> 28) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_APPEND_READ                        0xEFFFFFFF
+#define   S_008674_RCIU_HALTED_BY_REG_VIOLATION(x)                    (((x) & 0x1) << 29)
+#define   G_008674_RCIU_HALTED_BY_REG_VIOLATION(x)                    (((x) >> 29) & 0x1)
+#define   C_008674_RCIU_HALTED_BY_REG_VIOLATION                       0xDFFFFFFF
+#define R_008678_CP_STALLED_STAT2                                       0x008678
+#define   S_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x)                       (((x) & 0x1) << 0)
+#define   G_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x)                       (((x) >> 0) & 0x1)
+#define   C_008678_PFP_TO_CSF_NOT_RDY_TO_RCV                          0xFFFFFFFE
+#define   S_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x)                       (((x) & 0x1) << 1)
+#define   G_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x)                       (((x) >> 1) & 0x1)
+#define   C_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV                          0xFFFFFFFD
+#define   S_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 2)
+#define   G_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x)                      (((x) >> 2) & 0x1)
+#define   C_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV                         0xFFFFFFFB
+#define   S_008678_PFP_TO_VGT_WRITES_PENDING(x)                       (((x) & 0x1) << 4)
+#define   G_008678_PFP_TO_VGT_WRITES_PENDING(x)                       (((x) >> 4) & 0x1)
+#define   C_008678_PFP_TO_VGT_WRITES_PENDING                          0xFFFFFFEF
+#define   S_008678_PFP_RCIU_READ_PENDING(x)                           (((x) & 0x1) << 5)
+#define   G_008678_PFP_RCIU_READ_PENDING(x)                           (((x) >> 5) & 0x1)
+#define   C_008678_PFP_RCIU_READ_PENDING                              0xFFFFFFDF
+#define   S_008678_PFP_WAITING_ON_BUFFER_DATA(x)                      (((x) & 0x1) << 8)
+#define   G_008678_PFP_WAITING_ON_BUFFER_DATA(x)                      (((x) >> 8) & 0x1)
+#define   C_008678_PFP_WAITING_ON_BUFFER_DATA                         0xFFFFFEFF
+#define   S_008678_ME_WAIT_ON_CE_COUNTER(x)                           (((x) & 0x1) << 9)
+#define   G_008678_ME_WAIT_ON_CE_COUNTER(x)                           (((x) >> 9) & 0x1)
+#define   C_008678_ME_WAIT_ON_CE_COUNTER                              0xFFFFFDFF
+#define   S_008678_ME_WAIT_ON_AVAIL_BUFFER(x)                         (((x) & 0x1) << 10)
+#define   G_008678_ME_WAIT_ON_AVAIL_BUFFER(x)                         (((x) >> 10) & 0x1)
+#define   C_008678_ME_WAIT_ON_AVAIL_BUFFER                            0xFFFFFBFF
+#define   S_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x)                        (((x) & 0x1) << 11)
+#define   G_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x)                        (((x) >> 11) & 0x1)
+#define   C_008678_GFX_CNTX_NOT_AVAIL_TO_ME                           0xFFFFF7FF
+#define   S_008678_ME_RCIU_NOT_RDY_TO_RCV(x)                          (((x) & 0x1) << 12)
+#define   G_008678_ME_RCIU_NOT_RDY_TO_RCV(x)                          (((x) >> 12) & 0x1)
+#define   C_008678_ME_RCIU_NOT_RDY_TO_RCV                             0xFFFFEFFF
+#define   S_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 13)
+#define   G_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x)                      (((x) >> 13) & 0x1)
+#define   C_008678_ME_TO_CONST_NOT_RDY_TO_RCV                         0xFFFFDFFF
+#define   S_008678_ME_WAITING_DATA_FROM_PFP(x)                        (((x) & 0x1) << 14)
+#define   G_008678_ME_WAITING_DATA_FROM_PFP(x)                        (((x) >> 14) & 0x1)
+#define   C_008678_ME_WAITING_DATA_FROM_PFP                           0xFFFFBFFF
+#define   S_008678_ME_WAITING_ON_PARTIAL_FLUSH(x)                     (((x) & 0x1) << 15)
+#define   G_008678_ME_WAITING_ON_PARTIAL_FLUSH(x)                     (((x) >> 15) & 0x1)
+#define   C_008678_ME_WAITING_ON_PARTIAL_FLUSH                        0xFFFF7FFF
+#define   S_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 16)
+#define   G_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) >> 16) & 0x1)
+#define   C_008678_MEQ_TO_ME_NOT_RDY_TO_RCV                           0xFFFEFFFF
+#define   S_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 17)
+#define   G_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) >> 17) & 0x1)
+#define   C_008678_STQ_TO_ME_NOT_RDY_TO_RCV                           0xFFFDFFFF
+#define   S_008678_ME_WAITING_DATA_FROM_STQ(x)                        (((x) & 0x1) << 18)
+#define   G_008678_ME_WAITING_DATA_FROM_STQ(x)                        (((x) >> 18) & 0x1)
+#define   C_008678_ME_WAITING_DATA_FROM_STQ                           0xFFFBFFFF
+#define   S_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x)                    (((x) & 0x1) << 19)
+#define   G_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x)                    (((x) >> 19) & 0x1)
+#define   C_008678_PFP_STALLED_ON_TC_WR_CONFIRM                       0xFFF7FFFF
+#define   S_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x)                  (((x) & 0x1) << 20)
+#define   G_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x)                  (((x) >> 20) & 0x1)
+#define   C_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA                     0xFFEFFFFF
+#define   S_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x)                     (((x) & 0x1) << 21)
+#define   G_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x)                     (((x) >> 21) & 0x1)
+#define   C_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE                        0xFFDFFFFF
+#define   S_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x)                      (((x) & 0x1) << 22)
+#define   G_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x)                      (((x) >> 22) & 0x1)
+#define   C_008678_EOPD_FIFO_NEEDS_WR_CONFIRM                         0xFFBFFFFF
+#define   S_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x)                   (((x) & 0x1) << 23)
+#define   G_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x)                   (((x) >> 23) & 0x1)
+#define   C_008678_STRMO_WR_OF_PRIM_DATA_PENDING                      0xFF7FFFFF
+#define   S_008678_PIPE_STATS_WR_DATA_PENDING(x)                      (((x) & 0x1) << 24)
+#define   G_008678_PIPE_STATS_WR_DATA_PENDING(x)                      (((x) >> 24) & 0x1)
+#define   C_008678_PIPE_STATS_WR_DATA_PENDING                         0xFEFFFFFF
+#define   S_008678_APPEND_RDY_WAIT_ON_CS_DONE(x)                      (((x) & 0x1) << 25)
+#define   G_008678_APPEND_RDY_WAIT_ON_CS_DONE(x)                      (((x) >> 25) & 0x1)
+#define   C_008678_APPEND_RDY_WAIT_ON_CS_DONE                         0xFDFFFFFF
+#define   S_008678_APPEND_RDY_WAIT_ON_PS_DONE(x)                      (((x) & 0x1) << 26)
+#define   G_008678_APPEND_RDY_WAIT_ON_PS_DONE(x)                      (((x) >> 26) & 0x1)
+#define   C_008678_APPEND_RDY_WAIT_ON_PS_DONE                         0xFBFFFFFF
+#define   S_008678_APPEND_WAIT_ON_WR_CONFIRM(x)                       (((x) & 0x1) << 27)
+#define   G_008678_APPEND_WAIT_ON_WR_CONFIRM(x)                       (((x) >> 27) & 0x1)
+#define   C_008678_APPEND_WAIT_ON_WR_CONFIRM                          0xF7FFFFFF
+#define   S_008678_APPEND_ACTIVE_PARTITION(x)                         (((x) & 0x1) << 28)
+#define   G_008678_APPEND_ACTIVE_PARTITION(x)                         (((x) >> 28) & 0x1)
+#define   C_008678_APPEND_ACTIVE_PARTITION                            0xEFFFFFFF
+#define   S_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x)                 (((x) & 0x1) << 29)
+#define   G_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x)                 (((x) >> 29) & 0x1)
+#define   C_008678_APPEND_WAITING_TO_SEND_MEMWRITE                    0xDFFFFFFF
+#define   S_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x)                      (((x) & 0x1) << 30)
+#define   G_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x)                      (((x) >> 30) & 0x1)
+#define   C_008678_SURF_SYNC_NEEDS_IDLE_CNTXS                         0xBFFFFFFF
+#define   S_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x)                       (((x) & 0x1) << 31)
+#define   G_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x)                       (((x) >> 31) & 0x1)
+#define   C_008678_SURF_SYNC_NEEDS_ALL_CLEAN                          0x7FFFFFFF
+#define R_008680_CP_STAT                                                0x008680
+#define   S_008680_ROQ_RING_BUSY(x)                                   (((x) & 0x1) << 9)
+#define   G_008680_ROQ_RING_BUSY(x)                                   (((x) >> 9) & 0x1)
+#define   C_008680_ROQ_RING_BUSY                                      0xFFFFFDFF
+#define   S_008680_ROQ_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 10)
+#define   G_008680_ROQ_INDIRECT1_BUSY(x)                              (((x) >> 10) & 0x1)
+#define   C_008680_ROQ_INDIRECT1_BUSY                                 0xFFFFFBFF
+#define   S_008680_ROQ_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 11)
+#define   G_008680_ROQ_INDIRECT2_BUSY(x)                              (((x) >> 11) & 0x1)
+#define   C_008680_ROQ_INDIRECT2_BUSY                                 0xFFFFF7FF
+#define   S_008680_ROQ_STATE_BUSY(x)                                  (((x) & 0x1) << 12)
+#define   G_008680_ROQ_STATE_BUSY(x)                                  (((x) >> 12) & 0x1)
+#define   C_008680_ROQ_STATE_BUSY                                     0xFFFFEFFF
+#define   S_008680_DC_BUSY(x)                                         (((x) & 0x1) << 13)
+#define   G_008680_DC_BUSY(x)                                         (((x) >> 13) & 0x1)
+#define   C_008680_DC_BUSY                                            0xFFFFDFFF
+#define   S_008680_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 14)
+#define   G_008680_ATCL2IU_BUSY(x)                                    (((x) >> 14) & 0x1)
+#define   C_008680_ATCL2IU_BUSY                                       0xFFFFBFFF
+#define   S_008680_PFP_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_008680_PFP_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_008680_PFP_BUSY                                           0xFFFF7FFF
+#define   S_008680_MEQ_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_008680_MEQ_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_008680_MEQ_BUSY                                           0xFFFEFFFF
+#define   S_008680_ME_BUSY(x)                                         (((x) & 0x1) << 17)
+#define   G_008680_ME_BUSY(x)                                         (((x) >> 17) & 0x1)
+#define   C_008680_ME_BUSY                                            0xFFFDFFFF
+#define   S_008680_QUERY_BUSY(x)                                      (((x) & 0x1) << 18)
+#define   G_008680_QUERY_BUSY(x)                                      (((x) >> 18) & 0x1)
+#define   C_008680_QUERY_BUSY                                         0xFFFBFFFF
+#define   S_008680_SEMAPHORE_BUSY(x)                                  (((x) & 0x1) << 19)
+#define   G_008680_SEMAPHORE_BUSY(x)                                  (((x) >> 19) & 0x1)
+#define   C_008680_SEMAPHORE_BUSY                                     0xFFF7FFFF
+#define   S_008680_INTERRUPT_BUSY(x)                                  (((x) & 0x1) << 20)
+#define   G_008680_INTERRUPT_BUSY(x)                                  (((x) >> 20) & 0x1)
+#define   C_008680_INTERRUPT_BUSY                                     0xFFEFFFFF
+#define   S_008680_SURFACE_SYNC_BUSY(x)                               (((x) & 0x1) << 21)
+#define   G_008680_SURFACE_SYNC_BUSY(x)                               (((x) >> 21) & 0x1)
+#define   C_008680_SURFACE_SYNC_BUSY                                  0xFFDFFFFF
+#define   S_008680_DMA_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008680_DMA_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008680_DMA_BUSY                                           0xFFBFFFFF
+#define   S_008680_RCIU_BUSY(x)                                       (((x) & 0x1) << 23)
+#define   G_008680_RCIU_BUSY(x)                                       (((x) >> 23) & 0x1)
+#define   C_008680_RCIU_BUSY                                          0xFF7FFFFF
+#define   S_008680_SCRATCH_RAM_BUSY(x)                                (((x) & 0x1) << 24)
+#define   G_008680_SCRATCH_RAM_BUSY(x)                                (((x) >> 24) & 0x1)
+#define   C_008680_SCRATCH_RAM_BUSY                                   0xFEFFFFFF
+#define   S_008680_CPC_CPG_BUSY(x)                                    (((x) & 0x1) << 25)
+#define   G_008680_CPC_CPG_BUSY(x)                                    (((x) >> 25) & 0x1)
+#define   C_008680_CPC_CPG_BUSY                                       0xFDFFFFFF
+#define   S_008680_CE_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008680_CE_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008680_CE_BUSY                                            0xFBFFFFFF
+#define   S_008680_TCIU_BUSY(x)                                       (((x) & 0x1) << 27)
+#define   G_008680_TCIU_BUSY(x)                                       (((x) >> 27) & 0x1)
+#define   C_008680_TCIU_BUSY                                          0xF7FFFFFF
+#define   S_008680_ROQ_CE_RING_BUSY(x)                                (((x) & 0x1) << 28)
+#define   G_008680_ROQ_CE_RING_BUSY(x)                                (((x) >> 28) & 0x1)
+#define   C_008680_ROQ_CE_RING_BUSY                                   0xEFFFFFFF
+#define   S_008680_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) & 0x1) << 29)
+#define   G_008680_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) >> 29) & 0x1)
+#define   C_008680_ROQ_CE_INDIRECT1_BUSY                              0xDFFFFFFF
+#define   S_008680_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) & 0x1) << 30)
+#define   G_008680_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) >> 30) & 0x1)
+#define   C_008680_ROQ_CE_INDIRECT2_BUSY                              0xBFFFFFFF
+#define   S_008680_CP_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_008680_CP_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008680_CP_BUSY                                            0x7FFFFFFF
 /* CIK */
 #define R_030800_GRBM_GFX_INDEX                                         0x030800
 #define   S_030800_INSTANCE_INDEX(x)                                  (((x) & 0xFF) << 0)

From 2d1952e2a5abd273983374b420371d263388bb20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 20:44:54 +0200
Subject: [PATCH 1171/1208] radeonsi: add VI hardware support

---
 src/gallium/drivers/radeon/r600_pipe_common.c |  6 +++
 src/gallium/drivers/radeon/r600_pipe_common.h |  1 +
 src/gallium/drivers/radeonsi/si_descriptors.c | 16 +++++--
 src/gallium/drivers/radeonsi/si_pipe.c        |  4 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 25 +++++++++--
 src/gallium/drivers/radeonsi/si_state.c       | 45 ++++++++++++++++---
 src/gallium/drivers/radeonsi/si_state_draw.c  | 41 ++++++++++++-----
 .../drivers/radeonsi/si_state_shaders.c       |  8 +++-
 8 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 51c72cc882f..c982a4d9bad 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -408,6 +408,9 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_KABINI: return "AMD KABINI";
 	case CHIP_HAWAII: return "AMD HAWAII";
 	case CHIP_MULLINS: return "AMD MULLINS";
+	case CHIP_TONGA: return "AMD TONGA";
+	case CHIP_ICELAND: return "AMD ICELAND";
+	case CHIP_CARRIZO: return "AMD CARRIZO";
 	default: return "AMD unknown";
 	}
 }
@@ -532,6 +535,9 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 #else
 		return "kabini";
 #endif
+	case CHIP_TONGA: return "tonga";
+	case CHIP_ICELAND: return "iceland";
+	case CHIP_CARRIZO: return "carrizo";
 	default: return "";
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 768fe282981..29db1cc4e07 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -242,6 +242,7 @@ struct r600_surface {
 	unsigned cb_color_pitch;	/* EG and later */
 	unsigned cb_color_slice;	/* EG and later */
 	unsigned cb_color_attrib;	/* EG and later */
+	unsigned cb_dcc_control;	/* VI and later */
 	unsigned cb_color_fmask;	/* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */
 	unsigned cb_color_fmask_slice;	/* EG and later */
 	unsigned cb_color_cmask;	/* CB_COLORn_TILE (r600 only) */
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 8d9f8f71dc5..890be071596 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -429,7 +429,8 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 		desc[0] = va & 0xFFFFFFFF;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(vb->stride);
-		if (vb->stride)
+
+		if (sctx->b.chip_class <= CIK && vb->stride)
 			/* Round up by rounding down and adding 1 */
 			desc[2] = (vb->buffer->width0 - offset -
 				   sctx->vertex_elements->format_size[i]) /
@@ -593,6 +594,9 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			break;
 		}
 
+		if (sctx->b.chip_class >= VI && stride)
+			num_records *= stride;
+
 		/* Set the descriptor. */
 		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
@@ -682,7 +686,12 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			struct pipe_resource *buffer = targets[i]->buffer;
 			uint64_t va = r600_resource(buffer)->gpu_address;
 
-			/* Set the descriptor. */
+			/* Set the descriptor.
+			 *
+			 * On VI, the format must be non-INVALID, otherwise
+			 * the buffer will be considered not bound and store
+			 * instructions will be no-ops.
+			 */
 			uint32_t *desc = buffers->desc.list + bufidx*4;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
@@ -690,7 +699,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
 				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
 				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
 			/* Set the resource. */
 			pipe_resource_reference(&buffers->buffers[bufidx],
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9b5cdd8dc12..97cf24bf715 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -184,7 +184,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	r600_target = radeon_llvm_get_r600_target(triple);
 	sctx->tm = LLVMCreateTargetMachine(r600_target, triple,
 					   r600_get_llvm_processor_name(sscreen->b.family),
-					   "+DumpCode,+vgpr-spilling",
+					   sctx->b.chip_class >= VI ?
+						   "+DumpCode" :
+						   "+DumpCode,+vgpr-spilling",
 					   LLVMCodeGenLevelDefault,
 					   LLVMRelocDefault,
 					   LLVMCodeModelDefault);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ac1c1be7a79..4288e9b2ab1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2787,6 +2787,7 @@ static void txq_fetch_args(
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
 	unsigned target = inst->Texture.Texture;
 	LLVMValueRef res_ptr;
 
@@ -2807,10 +2808,26 @@ static void txq_fetch_args(
 		LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
 
 		/* Read the size from the buffer descriptor directly. */
-		LLVMValueRef size = res_ptr;
-		size = LLVMBuildBitCast(gallivm->builder, size, v8i32, "");
-		size = LLVMBuildExtractElement(gallivm->builder, size,
-					      lp_build_const_int32(gallivm, 6), "");
+		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, "");
+		LLVMValueRef size = LLVMBuildExtractElement(builder, res,
+						lp_build_const_int32(gallivm, 6), "");
+
+		if (si_shader_ctx->screen->b.chip_class >= VI) {
+			/* On VI, the descriptor contains the size in bytes,
+			 * but TXQ must return the size in elements.
+			 * The stride is always non-zero for resources using TXQ.
+			 */
+			LLVMValueRef stride =
+				LLVMBuildExtractElement(builder, res,
+							lp_build_const_int32(gallivm, 5), "");
+			stride = LLVMBuildLShr(builder, stride,
+					       lp_build_const_int32(gallivm, 16), "");
+			stride = LLVMBuildAnd(builder, stride,
+					      lp_build_const_int32(gallivm, 0x3FFF), "");
+
+			size = LLVMBuildUDiv(builder, size, stride, "");
+		}
+
 		emit_data->args[0] = size;
 		return;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 51ade5248a4..6a8d786282c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -61,7 +61,7 @@ unsigned si_array_mode(unsigned mode)
 
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex)
 {
-	if (sscreen->b.chip_class == CIK &&
+	if (sscreen->b.chip_class >= CIK &&
 	    sscreen->b.info.cik_macrotile_mode_array_valid) {
 		unsigned index, tileb;
 
@@ -1846,6 +1846,9 @@ static void si_initialize_color_surface(struct si_context *sctx,
 	surf->cb_color_info = color_info;
 	surf->cb_color_attrib = color_attrib;
 
+	if (sctx->b.chip_class >= VI)
+		surf->cb_dcc_control = S_028C78_OVERWRITE_COMBINER_DISABLE(1);
+
 	if (rtex->fmask.size) {
 		surf->cb_color_fmask = (offset + rtex->fmask.offset) >> 8;
 		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
@@ -1991,6 +1994,10 @@ static void si_init_depth_surface(struct si_context *sctx,
 		db_htile_surface = 0;
 	}
 
+	/* Bug workaround. */
+	if (sctx->b.chip_class >= VI)
+		s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+
 	assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
 
 	surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
@@ -2084,7 +2091,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	si_update_fb_rs_state(sctx);
 	si_update_fb_blend_state(sctx);
 
-	sctx->framebuffer.atom.num_dw = state->nr_cbufs*15 + (8 - state->nr_cbufs)*3;
+	sctx->framebuffer.atom.num_dw = state->nr_cbufs*16 + (8 - state->nr_cbufs)*3;
 	sctx->framebuffer.atom.num_dw += state->zsbuf ? 26 : 4;
 	sctx->framebuffer.atom.num_dw += 3; /* WINDOW_SCISSOR_BR */
 	sctx->framebuffer.atom.num_dw += 18; /* MSAA sample locations */
@@ -2163,20 +2170,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 				RADEON_PRIO_COLOR_META);
 		}
 
-		r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 13);
+		r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
+					   sctx->b.chip_class >= VI ? 14 : 13);
 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
 		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
 		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
 		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
 		radeon_emit(cs, cb->cb_color_info | tex->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
 		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
-		radeon_emit(cs, 0);			/* R_028C78 unused */
+		radeon_emit(cs, cb->cb_dcc_control);	/* R_028C78_CB_COLOR0_DCC_CONTROL */
 		radeon_emit(cs, tex->cmask.base_address_reg);	/* R_028C7C_CB_COLOR0_CMASK */
 		radeon_emit(cs, tex->cmask.slice_tile_max);	/* R_028C80_CB_COLOR0_CMASK_SLICE */
 		radeon_emit(cs, cb->cb_color_fmask);		/* R_028C84_CB_COLOR0_FMASK */
 		radeon_emit(cs, cb->cb_color_fmask_slice);	/* R_028C88_CB_COLOR0_FMASK_SLICE */
 		radeon_emit(cs, tex->color_clear_value[0]);	/* R_028C8C_CB_COLOR0_CLEAR_WORD0 */
 		radeon_emit(cs, tex->color_clear_value[1]);	/* R_028C90_CB_COLOR0_CLEAR_WORD1 */
+
+		if (sctx->b.chip_class >= VI)
+			radeon_emit(cs, 0);	/* R_028C94_CB_COLOR0_DCC_BASE */
 	}
 	/* set CB_COLOR1_INFO for possible dual-src blending */
 	if (i == 1 && state->cbufs[0]) {
@@ -2332,7 +2343,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 
 	/* Buffer resource. */
 	if (texture->target == PIPE_BUFFER) {
-		unsigned stride;
+		unsigned stride, num_records;
 
 		desc = util_format_description(state->format);
 		first_non_void = util_format_get_first_non_void_channel(state->format);
@@ -2341,10 +2352,16 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 		format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
 		num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
 
+		num_records = state->u.buf.last_element + 1 - state->u.buf.first_element;
+		num_records = MIN2(num_records, texture->width0 / stride);
+
+		if (sctx->b.chip_class >= VI)
+			num_records *= stride;
+
 		view->state[4] = va;
 		view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 				 S_008F04_STRIDE(stride);
-		view->state[6] = state->u.buf.last_element + 1 - state->u.buf.first_element;
+		view->state[6] = num_records;
 		view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
 				 S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
 				 S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
@@ -3167,6 +3184,15 @@ static void si_init_config(struct si_context *sctx)
 			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x3a00161a);
 			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002e);
 			break;
+		case CHIP_TONGA:
+			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012);
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002a);
+			break;
+		case CHIP_ICELAND:
+		case CHIP_CARRIZO:
+			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
+			break;
 		case CHIP_KAVERI:
 			/* XXX todo */
 		case CHIP_KABINI:
@@ -3261,5 +3287,12 @@ static void si_init_config(struct si_context *sctx)
 		si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
 	}
 
+	if (sctx->b.chip_class >= VI) {
+		si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL,
+			       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1));
+		si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30);
+		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
+	}
+
 	sctx->init_config = pm4;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index f136a1c94d8..4c21655596c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -318,7 +318,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
 		S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
 		S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
-		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0);
+		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
+		S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ? 2 : 0);
 }
 
 static unsigned si_get_ls_hs_config(struct si_context *sctx,
@@ -473,12 +474,24 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	if (info->indexed) {
 		radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
 
-		if (ib->index_size == 4) {
-			radeon_emit(cs, V_028A7C_VGT_INDEX_32 | (SI_BIG_ENDIAN ?
-					V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
-		} else {
-			radeon_emit(cs, V_028A7C_VGT_INDEX_16 | (SI_BIG_ENDIAN ?
-					V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
+		/* index type */
+		switch (ib->index_size) {
+		case 1:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_8);
+			break;
+		case 2:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_16 |
+				    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+					     V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
+			break;
+		case 4:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_32 |
+				    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+					     V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
+			break;
+		default:
+			assert(!"unreachable");
+			return;
 		}
 	}
 
@@ -604,9 +617,14 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato
 
 	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2)
+	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 
+		/* TODO: this might not be needed. */
+		if (sctx->chip_class >= VI)
+			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
+	}
+
 	if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
 		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
 				 S_0085F0_CB0_DEST_BASE_ENA(1) |
@@ -754,7 +772,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		ib.offset = sctx->index_buffer.offset;
 
 		/* Translate or upload, if needed. */
-		if (ib.index_size == 1) {
+		/* 8-bit indices are supported on VI. */
+		if (sctx->b.chip_class <= CIK && ib.index_size == 1) {
 			struct pipe_resource *out_buffer = NULL;
 			unsigned out_offset, start, count, start_offset;
 			void *ptr;
@@ -789,6 +808,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		}
 	}
 
+	/* TODO: VI should read index buffers through TC, so this shouldn't be
+	 * needed on VI. */
 	if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
 		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
@@ -822,7 +843,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if (sctx->b.family == CHIP_HAWAII &&
+	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 044acd8fcea..0347014948d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1016,7 +1016,7 @@ bcolor:
 static void si_init_gs_rings(struct si_context *sctx)
 {
 	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 64 * 1024 * 1024;
+	unsigned gsvs_ring_size = 60 * 1024 * 1024;
 
 	assert(!sctx->gs_rings);
 	sctx->gs_rings = CALLOC_STRUCT(si_pm4_state);
@@ -1028,6 +1028,12 @@ static void si_init_gs_rings(struct si_context *sctx)
 					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
 
 	if (sctx->b.chip_class >= CIK) {
+		if (sctx->b.chip_class >= VI) {
+			/* The maximum sizes are 63.999 MB on VI, because
+			 * the register fields only have 18 bits. */
+			assert(esgs_ring_size / 256 < (1 << 18));
+			assert(gsvs_ring_size / 256 < (1 << 18));
+		}
 		si_pm4_set_reg(sctx->gs_rings, R_030900_VGT_ESGS_RING_SIZE,
 			       esgs_ring_size / 256);
 		si_pm4_set_reg(sctx->gs_rings, R_030904_VGT_GSVS_RING_SIZE,

From bf2c3422d7c12bdead944c3de8b37b809f4cbcbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 30 Apr 2015 17:02:38 +0200
Subject: [PATCH 1172/1208] radeonsi: add amdgpu support for querying the GPU
 reset state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 97cf24bf715..473a2e9ad12 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -84,6 +84,14 @@ static void si_destroy_context(struct pipe_context *context)
 	FREE(sctx);
 }
 
+static enum pipe_reset_status
+si_amdgpu_get_reset_status(struct pipe_context *ctx)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	return sctx->b.ws->ctx_query_reset_status(sctx->b.ctx);
+}
+
 static struct pipe_context *si_create_context(struct pipe_screen *screen, void *priv)
 {
 	struct si_context *sctx = CALLOC_STRUCT(si_context);
@@ -107,6 +115,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	if (!r600_common_context_init(&sctx->b, &sscreen->b))
 		goto fail;
 
+	if (sscreen->b.info.drm_major == 3)
+		sctx->b.b.get_device_reset_status = si_amdgpu_get_reset_status;
+
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
 	si_init_cp_dma_functions(sctx);
@@ -268,7 +279,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-		return sscreen->b.info.drm_major == 2 && sscreen->b.info.drm_minor >= 43;
+		return (sscreen->b.info.drm_major == 2 &&
+			sscreen->b.info.drm_minor >= 43) ||
+		       sscreen->b.info.drm_major == 3;
 
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 		/* 2D tiling on CIK is supported since DRM 2.35.0 */

From 649975e7162cc4ee0586ee76d24321cd7250581f Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 10 Jun 2015 11:39:30 -0400
Subject: [PATCH 1173/1208] radeonsi: properly set the raster_config for KV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This enables the second RB on asics that support it which
should boost performance.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/radeonsi/si_state.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 6a8d786282c..c95fbf84047 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3143,6 +3143,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 
 static void si_init_config(struct si_context *sctx)
 {
+	unsigned num_rb = sctx->screen->b.info.r600_num_backends;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
 	if (pm4 == NULL)
@@ -3194,14 +3195,17 @@ static void si_init_config(struct si_context *sctx)
 			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
 			break;
 		case CHIP_KAVERI:
-			/* XXX todo */
+			if (num_rb > 1)
+				si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);
+			else
+				si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000000);
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
+			break;
 		case CHIP_KABINI:
-			/* XXX todo */
 		case CHIP_MULLINS:
-			/* XXX todo */
 		default:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0);
+			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000000);
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
 			break;
 		}
 	} else {

From f0e24a7beae57f24501fa9d3b6b947fc20ca23bb Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 10 Jun 2015 11:43:24 -0400
Subject: [PATCH 1174/1208] radeonsi: properly handler raster_config setup on
 CZ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Need to take into account the number of RBs.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index c95fbf84047..83b7db2b1dd 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3190,10 +3190,10 @@ static void si_init_config(struct si_context *sctx)
 			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002a);
 			break;
 		case CHIP_ICELAND:
-		case CHIP_CARRIZO:
 			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);
 			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
 			break;
+		case CHIP_CARRIZO:
 		case CHIP_KAVERI:
 			if (num_rb > 1)
 				si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);

From 933d24b1768d769f1847a023ea3c70b6c9723e33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Jun 2015 13:57:25 +0200
Subject: [PATCH 1175/1208] gallium/radeon: enable the GPU load query for
 amdgpu

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index c982a4d9bad..a08e841f11f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -719,17 +719,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
+		{"GPU-load", R600_QUERY_GPU_LOAD, {100}},
 		{"temperature", R600_QUERY_GPU_TEMPERATURE, {100}},
 		{"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
 		{"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
-		{"GPU-load", R600_QUERY_GPU_LOAD, {100}}
 	};
 	unsigned num_queries;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
 		num_queries = Elements(list);
+	else if (rscreen->info.drm_major == 3)
+		num_queries = Elements(list) - 3;
 	else
-		num_queries = 9;
+		num_queries = Elements(list) - 4;
 
 	if (!info)
 		return num_queries;

From d69686f1d375c3a65a4398f69da843e833987b0e Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 8 Jul 2015 22:19:55 -0400
Subject: [PATCH 1176/1208] radeonsi: add harvest support for CI/VI parts (v3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Properly calculate the PA_SC_RASTER_CONFIG[_1] settings
for harvest chips.

v2: - fix default raster config settings for CZ and KV
    - Suggestions from Michel
v3: - handle multiple packers properly for CI+
    - GRBM_GFX_INDEX is privileged on VI+

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com> (v2)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 208 +++++++++++++-----------
 1 file changed, 116 insertions(+), 92 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 83b7db2b1dd..34ef9e49d5c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3040,19 +3040,24 @@ void si_init_state_functions(struct si_context *sctx)
 static void
 si_write_harvested_raster_configs(struct si_context *sctx,
 				  struct si_pm4_state *pm4,
-				  unsigned raster_config)
+				  unsigned raster_config,
+				  unsigned raster_config_1)
 {
 	unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1);
 	unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1);
 	unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
-	unsigned num_rb = sctx->screen->b.info.r600_num_backends;
-	unsigned rb_per_pkr = num_rb / num_se / sh_per_se;
+	unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
+	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
 	unsigned rb_per_se = num_rb / num_se;
-	unsigned se0_mask = (1 << rb_per_se) - 1;
-	unsigned se1_mask = se0_mask << rb_per_se;
+	unsigned se_mask[4];
 	unsigned se;
 
-	assert(num_se == 1 || num_se == 2);
+	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
+	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
+	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+	assert(num_se == 1 || num_se == 2 || num_se == 4);
 	assert(sh_per_se == 1 || sh_per_se == 2);
 	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
 
@@ -3060,17 +3065,16 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 	 * fields are for, so I'm leaving them as their default
 	 * values. */
 
-	se0_mask &= rb_mask;
-	se1_mask &= rb_mask;
-	if (num_se == 2 && (!se0_mask || !se1_mask)) {
-		raster_config &= C_028350_SE_MAP;
+	if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
+			     (!se_mask[2] && !se_mask[3]))) {
+		raster_config_1 &= C_028354_SE_PAIR_MAP;
 
-		if (!se0_mask) {
-			raster_config |=
-				S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+		if (!se_mask[0] && !se_mask[1]) {
+			raster_config_1 |=
+				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
 		} else {
-			raster_config |=
-				S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			raster_config_1 |=
+				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
 		}
 	}
 
@@ -3078,10 +3082,23 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 		unsigned raster_config_se = raster_config;
 		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
 		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+		int idx = (se / 2) * 2;
+
+		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+			raster_config_se &= C_028350_SE_MAP;
+
+			if (!se_mask[idx]) {
+				raster_config_se |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+			} else {
+				raster_config_se |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			}
+		}
 
 		pkr0_mask &= rb_mask;
 		pkr1_mask &= rb_mask;
-		if (sh_per_se == 2 && (!pkr0_mask || !pkr1_mask)) {
+		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
 			raster_config_se &= C_028350_PKR_MAP;
 
 			if (!pkr0_mask) {
@@ -3093,7 +3110,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 			}
 		}
 
-		if (rb_per_pkr == 2) {
+		if (rb_per_se >= 2) {
 			unsigned rb0_mask = 1 << (se * rb_per_se);
 			unsigned rb1_mask = rb0_mask << 1;
 
@@ -3111,7 +3128,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 				}
 			}
 
-			if (sh_per_se == 2) {
+			if (rb_per_se > 2) {
 				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
 				rb1_mask = rb0_mask << 1;
 				rb0_mask &= rb_mask;
@@ -3130,20 +3147,28 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 			}
 		}
 
-		si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
-			       SE_INDEX(se) | SH_BROADCAST_WRITES |
-			       INSTANCE_BROADCAST_WRITES);
+		/* GRBM_GFX_INDEX is privileged on VI */
+		if (sctx->b.chip_class <= CIK)
+			si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
+				       SE_INDEX(se) | SH_BROADCAST_WRITES |
+				       INSTANCE_BROADCAST_WRITES);
 		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
+		if (sctx->b.chip_class >= CIK)
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
 	}
 
-	si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
-		       SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
-		       INSTANCE_BROADCAST_WRITES);
+	/* GRBM_GFX_INDEX is privileged on VI */
+	if (sctx->b.chip_class <= CIK)
+		si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
+			       SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
+			       INSTANCE_BROADCAST_WRITES);
 }
 
 static void si_init_config(struct si_context *sctx)
 {
-	unsigned num_rb = sctx->screen->b.info.r600_num_backends;
+	unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
+	unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
+	unsigned raster_config, raster_config_1;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
 	if (pm4 == NULL)
@@ -3175,74 +3200,73 @@ static void si_init_config(struct si_context *sctx)
 
 	si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
 
-	if (sctx->b.chip_class >= CIK) {
-		switch (sctx->screen->b.family) {
-		case CHIP_BONAIRE:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0);
-			break;
-		case CHIP_HAWAII:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x3a00161a);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002e);
-			break;
-		case CHIP_TONGA:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002a);
-			break;
-		case CHIP_ICELAND:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
-			break;
-		case CHIP_CARRIZO:
-		case CHIP_KAVERI:
-			if (num_rb > 1)
-				si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000002);
-			else
-				si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000000);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
-			break;
-		case CHIP_KABINI:
-		case CHIP_MULLINS:
-		default:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000000);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x00000000);
-			break;
-		}
+	switch (sctx->screen->b.family) {
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+		raster_config = 0x2a00126a;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_VERDE:
+		raster_config = 0x0000124a;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_OLAND:
+		raster_config = 0x00000082;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_HAINAN:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_BONAIRE:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_HAWAII:
+		raster_config = 0x3a00161a;
+		raster_config_1 = 0x0000002e;
+		break;
+	case CHIP_TONGA:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	case CHIP_ICELAND:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_CARRIZO:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_KAVERI:
+		/* KV should be 0x00000002, but that causes problems with radeon */
+		raster_config = 0x00000000; /* 0x00000002 */
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	default:
+		fprintf(stderr,
+			"radeonsi: Unknown GPU, using 0 for raster_config\n");
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	}
+
+	/* Always use the default config when all backends are enabled
+	 * (or when we failed to determine the enabled backends).
+	 */
+	if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
+		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
+			       raster_config);
+		if (sctx->b.chip_class >= CIK)
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1,
+				       raster_config_1);
 	} else {
-		unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
-		unsigned num_rb = sctx->screen->b.info.r600_num_backends;
-		unsigned raster_config;
-
-		switch (sctx->screen->b.family) {
-		case CHIP_TAHITI:
-		case CHIP_PITCAIRN:
-			raster_config = 0x2a00126a;
-			break;
-		case CHIP_VERDE:
-			raster_config = 0x0000124a;
-			break;
-		case CHIP_OLAND:
-			raster_config = 0x00000082;
-			break;
-		case CHIP_HAINAN:
-			raster_config = 0;
-			break;
-		default:
-			fprintf(stderr,
-				"radeonsi: Unknown GPU, using 0 for raster_config\n");
-			raster_config = 0;
-			break;
-		}
-
-		/* Always use the default config when all backends are enabled
-		 * (or when we failed to determine the enabled backends).
-		 */
-		if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
-				       raster_config);
-		} else {
-			si_write_harvested_raster_configs(sctx, pm4, raster_config);
-		}
+		si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
 	}
 
 	si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));

From 528a6ff5992e6710921d6e4157a8a51884bc277f Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 7 Jul 2015 22:18:13 -0400
Subject: [PATCH 1177/1208] winsys/amdgpu: add addrlib support for Fiji (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix tonga chip check

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: David Zhang <david1.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp | 5 +++++
 src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h   | 1 +
 src/gallium/winsys/amdgpu/drm/amdgpu_id.h                | 6 +++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
index 264e2efb8b2..7393953c120 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -350,6 +350,7 @@ AddrChipFamily CIAddrLib::HwlConvertChipFamily(
             m_settings.isVolcanicIslands = 1;
             m_settings.isIceland         = ASICREV_IS_ICELAND_M(uChipRevision);
             m_settings.isTonga           = ASICREV_IS_TONGA_P(uChipRevision);
+            m_settings.isFiji            = ASICREV_IS_FIJI_P(uChipRevision);
             break;
         case FAMILY_CZ:
             m_settings.isCarrizo         = 1;
@@ -410,6 +411,10 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
     {
         m_pipes = 2;
     }
+    else if (m_settings.isFiji)
+    {
+        m_pipes = 16;
+    }
 
     if (valid)
     {
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
index 0220736c7bc..451508619f9 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -59,6 +59,7 @@ struct CIChipSettings
         UINT_32 isVolcanicIslands : 1;
         UINT_32 isIceland         : 1;
         UINT_32 isTonga           : 1;
+        UINT_32 isFiji            : 1;
         // VI fusion (Carrizo)
         UINT_32 isCarrizo         : 1;
     };
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
index 08a1591496e..8882c418e12 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -136,6 +136,8 @@ enum {
 	VI_TONGA_P_A0     = 20,
 	VI_TONGA_P_A1     = 21,
 
+	VI_FIJI_P_A0      = 60,
+
 	VI_UNKNOWN        = 0xFF
 };
 
@@ -143,7 +145,9 @@ enum {
 #define ASICREV_IS_ICELAND_M(eChipRev)	\
 	(eChipRev < VI_TONGA_P_A0)
 #define ASICREV_IS_TONGA_P(eChipRev)	\
-	(eChipRev >= VI_TONGA_P_A0)
+	((eChipRev >= VI_TONGA_P_A0) && (eChipRev < VI_FIJI_P_A0))
+#define ASICREV_IS_FIJI_P(eChipRev)	\
+	(eChipRev >= VI_FIJI_P_A0)
 
 /* CZ specific rev IDs */
 enum {

From 767ad50a10d01274b1d1a877add12b5552ba6984 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 29 Jul 2015 15:40:46 -0400
Subject: [PATCH 1178/1208] radeonsi: add support for FIJI (v4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: incorporate comments from Marek
v3: add missing fiji case in winsys init
    use tonga raster config (double check this)
v4: rebase on harvest patch

Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v3)
Reviewed-by: Christian König <christian.koenig@amd.com> (v3)
Reviewed-by: David Zhang <david1.zhang@amd.com> (v3)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 2 ++
 src/gallium/drivers/radeon/radeon_winsys.h    | 1 +
 src/gallium/drivers/radeonsi/si_state.c       | 5 +++++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 4 ++++
 4 files changed, 12 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index a08e841f11f..ed5d1dabdc3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -411,6 +411,7 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_TONGA: return "AMD TONGA";
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
+	case CHIP_FIJI: return "AMD FIJI";
 	default: return "AMD unknown";
 	}
 }
@@ -538,6 +539,7 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
+	case CHIP_FIJI: return "fiji";
 	default: return "";
 	}
 }
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 616816ebcea..d7424eef27e 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -136,6 +136,7 @@ enum radeon_family {
     CHIP_TONGA,
     CHIP_ICELAND,
     CHIP_CARRIZO,
+    CHIP_FIJI,
     CHIP_LAST,
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 34ef9e49d5c..21689e71b8b 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3226,6 +3226,11 @@ static void si_init_config(struct si_context *sctx)
 		raster_config = 0x3a00161a;
 		raster_config_1 = 0x0000002e;
 		break;
+	case CHIP_FIJI:
+		/* Fiji should be same as Hawaii, but that causes corruption in some cases */
+		raster_config = 0x16000012; /* 0x3a00161a */
+		raster_config_1 = 0x0000002a; /* 0x0000002e */
+		break;
 	case CHIP_TONGA:
 		raster_config = 0x16000012;
 		raster_config_1 = 0x0000002a;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 9020b1cefc7..f57f45b618f 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -227,6 +227,10 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
       ws->family = FAMILY_CZ;
       ws->rev_id = CZ_CARRIZO_A0;
       break;
+   case CHIP_FIJI:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_FIJI_P_A0;
+      break;
    default:
       fprintf(stderr, "amdgpu: Unknown family.\n");
       goto fail;

From 0248c13a8b1e10e2c8c8d614473c701239627a71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 9 Apr 2014 19:41:06 +0200
Subject: [PATCH 1179/1208] gallium/radeon: use VM for UVD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: (leo) add checking for driver backend
v3: (leo) change variable name from use_amdgpu to use_vm
v4: rebase by Marek

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_uvd.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 79fc0c726c4..b203e64f815 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -82,6 +82,7 @@ struct ruvd_decoder {
 	unsigned			bs_size;
 
 	struct rvid_buffer		dpb;
+	bool				use_legacy;
 };
 
 /* flush IB to the hardware */
@@ -107,8 +108,16 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
 
 	reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain,
 					  RADEON_PRIO_MIN);
-	set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
-	set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
+	if (!dec->use_legacy) {
+		uint64_t addr;
+		addr = dec->ws->buffer_get_virtual_address(cs_buf);
+		addr = addr + off;
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
+	} else {
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
+	}
 	set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1);
 }
 
@@ -792,6 +801,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	if (!dec)
 		return NULL;
 
+	if (info.drm_major < 3)
+		dec->use_legacy = TRUE;
+
 	dec->base = *templ;
 	dec->base.context = context;
 	dec->base.width = width;

From 67586c4b40881940535658c3c89b5b1a42f94027 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 10 Apr 2014 17:18:32 +0200
Subject: [PATCH 1180/1208] gallium/radeon: use VM for VCE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: (leo) add checking for driver backend
v3: (leo) change variable name from use_amdgpu to use_vm
v4: rebase by Marek

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c       | 24 +++++++++++++++++++
 src/gallium/drivers/radeon/radeon_vce.h       | 15 +++++++-----
 .../drivers/radeon/radeon_vce_40_2_2.c        | 17 ++++++-------
 src/gallium/drivers/radeon/radeon_vce_50.c    |  8 +++----
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 4557ce55556..c7b8952403a 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -396,6 +396,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	if (!enc)
 		return NULL;
 
+	if (rscreen->info.drm_major == 3)
+		enc->use_vm = true;
 	if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
 		enc->use_vui = true;
 
@@ -485,3 +487,25 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 		rscreen->info.vce_fw_version == FW_50_0_1 ||
 		rscreen->info.vce_fw_version == FW_50_1_2;
 }
+
+/**
+ * Add the buffer as relocation to the current command submission
+ */
+void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+                     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
+                     uint32_t offset)
+{
+	int reloc_idx;
+
+	reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN);
+	if (enc->use_vm) {
+		uint64_t addr;
+		addr = enc->ws->buffer_get_virtual_address(buf);
+		addr = addr + offset;
+		RVCE_CS(addr >> 32);
+		RVCE_CS(addr);
+	} else {
+		RVCE_CS(reloc_idx * 4);
+		RVCE_CS(offset);
+	}
+}
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 8319ef48cd5..66c547855b1 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -36,13 +36,11 @@
 
 #include "util/list.h"
 
-#define RVCE_RELOC(buf, usage, domain) (enc->ws->cs_add_reloc(enc->cs, (buf), (usage), domain, RADEON_PRIO_MIN))
-
 #define RVCE_CS(value) (enc->cs->buf[enc->cs->cdw++] = (value))
 #define RVCE_BEGIN(cmd) { uint32_t *begin = &enc->cs->buf[enc->cs->cdw++]; RVCE_CS(cmd)
-#define RVCE_READ(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READ, domain) * 4)
-#define RVCE_WRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_WRITE, domain) * 4)
-#define RVCE_READWRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READWRITE, domain) * 4)
+#define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off))
+#define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off))
+#define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off))
 #define RVCE_END() *begin = (&enc->cs->buf[enc->cs->cdw] - begin) * 4; }
 
 struct r600_common_screen;
@@ -101,7 +99,8 @@ struct rvce_encoder {
 	struct rvid_buffer		*fb;
 	struct rvid_buffer		cpb;
 	struct pipe_h264_enc_picture_desc pic;
-	bool use_vui;
+	bool				use_vm;
+	bool				use_vui;
 };
 
 /* CPB handling functions */
@@ -118,6 +117,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 
+void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+		     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
+		     uint32_t offset);
+
 /* init vce fw 40.2.2 specific callbacks */
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 51b17b5f6a8..71b5e89fd59 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -68,8 +68,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
 static void feedback(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x05000005); // feedback buffer
-	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains); // feedbackRingAddressHi
-	RVCE_CS(0x00000000); // feedbackRingAddressLo
+	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
 	RVCE_CS(0x00000001); // feedbackRingSize
 	RVCE_END();
 }
@@ -280,13 +279,11 @@ static void encode(struct rvce_encoder *enc)
 	task_info(enc, 0x00000003);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
-	RVCE_CS(0x00000000); // encodeContextAddressLo
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
-	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0x0); // videoBitstreamRingAddressHi/Lo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
@@ -298,10 +295,10 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // insertAUD
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
-	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
-	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		  enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		  enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
 	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 84a2bfb117e..2cdddf2e5b7 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -96,12 +96,12 @@ static void encode(struct rvce_encoder *enc)
 	task_info(enc, 0x00000003);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi
 	RVCE_CS(0x00000000); // encodeContextAddressLo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0); // videoBitstreamRingAddressHi
 	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
@@ -114,9 +114,9 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // insertAUD
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, 0); // inputPictureLumaAddressHi
 	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, 0); // inputPictureChromaAddressHi
 	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch

From 261ed775475db8d328a772fc4ff151d63969c84a Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 15 Dec 2014 12:51:50 -0500
Subject: [PATCH 1181/1208] gallium/radeon: add h264 performance HW decoder
 support

v2: -make tonga use new h264 performance HW decoder;
    -integrate it scaling buffer to msg_fb buffer

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_uvd.c | 63 ++++++++++++++++---------
 src/gallium/drivers/radeon/radeon_uvd.h |  2 +
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index b203e64f815..d86086b936f 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -57,6 +57,7 @@
 
 #define FB_BUFFER_OFFSET 0x1000
 #define FB_BUFFER_SIZE 2048
+#define IT_SCALING_TABLE_SIZE 224
 
 /* UVD decoder representation */
 struct ruvd_decoder {
@@ -65,6 +66,7 @@ struct ruvd_decoder {
 	ruvd_set_dtb			set_dtb;
 
 	unsigned			stream_handle;
+	unsigned			stream_type;
 	unsigned			frame_number;
 
 	struct pipe_screen		*screen;
@@ -73,9 +75,10 @@ struct ruvd_decoder {
 
 	unsigned			cur_buffer;
 
-	struct rvid_buffer		msg_fb_buffers[NUM_BUFFERS];
+	struct rvid_buffer		msg_fb_it_buffers[NUM_BUFFERS];
 	struct ruvd_msg			*msg;
 	uint32_t			*fb;
+	uint8_t				*it;
 
 	struct rvid_buffer		bs_buffers[NUM_BUFFERS];
 	void*				bs_ptr;
@@ -121,14 +124,14 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
 	set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1);
 }
 
-/* map the next available message/feedback buffer */
-static void map_msg_fb_buf(struct ruvd_decoder *dec)
+/* map the next available message/feedback/itscaling buffer */
+static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 {
 	struct rvid_buffer* buf;
 	uint8_t *ptr;
 
 	/* grab the current message/feedback buffer */
-	buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* and map it for CPU access */
 	ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE);
@@ -136,6 +139,8 @@ static void map_msg_fb_buf(struct ruvd_decoder *dec)
 	/* calc buffer offsets */
 	dec->msg = (struct ruvd_msg *)ptr;
 	dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
+	if (dec->stream_type == RUVD_CODEC_H264_PERF)
+		dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);
 }
 
 /* unmap and send a message command to the VCPU */
@@ -148,12 +153,14 @@ static void send_msg_buf(struct ruvd_decoder *dec)
 		return;
 
 	/* grab the current message buffer */
-	buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* unmap the buffer */
 	dec->ws->buffer_unmap(buf->res->cs_buf);
 	dec->msg = NULL;
 	dec->fb = NULL;
+	if (dec->stream_type == RUVD_CODEC_H264_PERF)
+		dec->it = NULL;
 
 	/* and send it to the hardware */
 	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
@@ -168,11 +175,12 @@ static void next_buffer(struct ruvd_decoder *dec)
 }
 
 /* convert the profile into something UVD understands */
-static uint32_t profile2stream_type(enum pipe_video_profile profile)
+static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 {
-	switch (u_reduce_video_profile(profile)) {
+	switch (u_reduce_video_profile(dec->base.profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-		return RUVD_CODEC_H264;
+		return (family >= CHIP_TONGA) ?
+			RUVD_CODEC_H264_PERF : RUVD_CODEC_H264;
 
 	case PIPE_VIDEO_FORMAT_VC1:
 		return RUVD_CODEC_VC1;
@@ -565,7 +573,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
 
 	assert(decoder);
 
-	map_msg_fb_buf(dec);
+	map_msg_fb_it_buf(dec);
 	memset(dec->msg, 0, sizeof(*dec->msg));
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_DESTROY;
@@ -577,7 +585,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
 	dec->ws->cs_destroy(dec->cs);
 
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		rvid_destroy_buffer(&dec->msg_fb_buffers[i]);
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]);
 		rvid_destroy_buffer(&dec->bs_buffers[i]);
 	}
 
@@ -679,7 +687,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 {
 	struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder;
 	struct radeon_winsys_cs_handle *dt;
-	struct rvid_buffer *msg_fb_buf, *bs_buf;
+	struct rvid_buffer *msg_fb_it_buf, *bs_buf;
 	unsigned bs_size;
 
 	assert(decoder);
@@ -687,32 +695,37 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	if (!dec->bs_ptr)
 		return;
 
-	msg_fb_buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 	bs_buf = &dec->bs_buffers[dec->cur_buffer];
 
 	bs_size = align(dec->bs_size, 128);
 	memset(dec->bs_ptr, 0, bs_size - dec->bs_size);
 	dec->ws->buffer_unmap(bs_buf->res->cs_buf);
 
-	map_msg_fb_buf(dec);
+	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_DECODE;
 	dec->msg->stream_handle = dec->stream_handle;
 	dec->msg->status_report_feedback_number = dec->frame_number;
 
-	dec->msg->body.decode.stream_type = profile2stream_type(dec->base.profile);
+	dec->msg->body.decode.stream_type = dec->stream_type;
 	dec->msg->body.decode.decode_flags = 0x1;
 	dec->msg->body.decode.width_in_samples = dec->base.width;
 	dec->msg->body.decode.height_in_samples = dec->base.height;
 
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
+	dec->msg->body.decode.db_pitch = dec->base.width;
 
 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
 
 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
 		dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture);
+		if (dec->stream_type == RUVD_CODEC_H264_PERF) {
+			memcpy(dec->it, dec->msg->body.decode.codec.h264.scaling_list_4x4, 6*16);
+			memcpy((dec->it + 96), dec->msg->body.decode.codec.h264.scaling_list_8x8, 2*64);
+		}
 		break;
 
 	case PIPE_VIDEO_FORMAT_VC1:
@@ -746,8 +759,11 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
 		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
-	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
 		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
+	if (dec->stream_type == RUVD_CODEC_H264_PERF)
+		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
+			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	set_reg(dec, RUVD_ENGINE_CNTL, 1);
 
 	flush(dec);
@@ -816,6 +832,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->base.end_frame = ruvd_end_frame;
 	dec->base.flush = ruvd_flush;
 
+	dec->stream_type = profile2stream_type(dec, info.family);
 	dec->set_dtb = set_dtb;
 	dec->stream_handle = rvid_alloc_stream_handle();
 	dec->screen = context->screen;
@@ -828,10 +845,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 
 	bs_buf_size = width * height * 512 / (16 * 16);
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		unsigned msg_fb_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
+		unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
 		STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET);
-		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_buffers[i],
-					msg_fb_size, PIPE_USAGE_STAGING)) {
+		if (dec->stream_type == RUVD_CODEC_H264_PERF)
+			msg_fb_it_size += IT_SCALING_TABLE_SIZE;
+		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i],
+					msg_fb_it_size, PIPE_USAGE_STAGING)) {
 			RVID_ERR("Can't allocated message buffers.\n");
 			goto error;
 		}
@@ -842,7 +861,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 			goto error;
 		}
 
-		rvid_clear_buffer(context, &dec->msg_fb_buffers[i]);
+		rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]);
 		rvid_clear_buffer(context, &dec->bs_buffers[i]);
 	}
 
@@ -853,11 +872,11 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 
 	rvid_clear_buffer(context, &dec->dpb);
 
-	map_msg_fb_buf(dec);
+	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_CREATE;
 	dec->msg->stream_handle = dec->stream_handle;
-	dec->msg->body.create.stream_type = profile2stream_type(dec->base.profile);
+	dec->msg->body.create.stream_type = dec->stream_type;
 	dec->msg->body.create.width_in_samples = dec->base.width;
 	dec->msg->body.create.height_in_samples = dec->base.height;
 	dec->msg->body.create.dpb_size = dec->dpb.res->buf->size;
@@ -871,7 +890,7 @@ error:
 	if (dec->cs) dec->ws->cs_destroy(dec->cs);
 
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		rvid_destroy_buffer(&dec->msg_fb_buffers[i]);
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]);
 		rvid_destroy_buffer(&dec->bs_buffers[i]);
 	}
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 7442865c9ec..5b6c65c81b0 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -62,6 +62,7 @@
 #define RUVD_CMD_DECODING_TARGET_BUFFER	0x00000002
 #define RUVD_CMD_FEEDBACK_BUFFER	0x00000003
 #define RUVD_CMD_BITSTREAM_BUFFER	0x00000100
+#define RUVD_CMD_ITSCALING_TABLE_BUFFER	0x00000204
 
 /* UVD message types */
 #define RUVD_MSG_CREATE		0
@@ -73,6 +74,7 @@
 #define RUVD_CODEC_VC1		0x00000001
 #define RUVD_CODEC_MPEG2	0x00000003
 #define RUVD_CODEC_MPEG4	0x00000004
+#define RUVD_CODEC_H264_PERF	0x00000007
 
 /* UVD decode target buffer tiling mode */
 #define RUVD_TILE_LINEAR	0x00000000

From c29f0d4722832a9d284aba899875955e60a41c03 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 9 Mar 2015 16:24:48 -0400
Subject: [PATCH 1182/1208] radeon/video: add 4K support for decode/encode
 parameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_video.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 826e0763c08..65949fba3b9 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -214,9 +214,9 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	        case PIPE_VIDEO_CAP_NPOT_TEXTURES:
         	        return 1;
 	        case PIPE_VIDEO_CAP_MAX_WIDTH:
-        	        return 2048;
+			return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
 	        case PIPE_VIDEO_CAP_MAX_HEIGHT:
-        	        return 1152;
+			return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
 	        case PIPE_VIDEO_CAP_PREFERED_FORMAT:
         	        return PIPE_FORMAT_NV12;
 	        case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
@@ -268,9 +268,9 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_NPOT_TEXTURES:
 		return 1;
 	case PIPE_VIDEO_CAP_MAX_WIDTH:
-		return 2048;
+		return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
 	case PIPE_VIDEO_CAP_MAX_HEIGHT:
-		return 1152;
+		return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:

From baecc518c9adcd073e725268421a049dd610d22f Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 12 Mar 2015 16:13:44 -0400
Subject: [PATCH 1183/1208] radeon/uvd: recalculate dbp buffer size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_uvd.c | 79 ++++++++++++++++++-------
 1 file changed, 58 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index d86086b936f..7b0eb1110d3 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -198,16 +198,16 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 }
 
 /* calculate size of reference picture buffer */
-static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
+static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 {
 	unsigned width_in_mb, height_in_mb, image_size, dpb_size;
 
 	// always align them to MB size for dpb calculation
-	unsigned width = align(templ->width, VL_MACROBLOCK_WIDTH);
-	unsigned height = align(templ->height, VL_MACROBLOCK_HEIGHT);
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
 
 	// always one more for currently decoded picture
-	unsigned max_references = templ->max_references + 1;
+	unsigned max_references = dec->base.max_references + 1;
 
 	// aligned size of a single frame
 	image_size = width * height;
@@ -218,20 +218,57 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
 	width_in_mb = width / VL_MACROBLOCK_WIDTH;
 	height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2);
 
-	switch (u_reduce_video_profile(templ->profile)) {
-	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-		// the firmware seems to allways assume a minimum of ref frames
-		max_references = MAX2(NUM_H264_REFS, max_references);
+	switch (u_reduce_video_profile(dec->base.profile)) {
+	case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+		if (!dec->use_legacy) {
+			unsigned fs_in_mb = width_in_mb * height_in_mb;
+			unsigned alignment = 64, num_dpb_buffer;
 
-		// reference picture buffer
-		dpb_size = image_size * max_references;
-
-		// macroblock context buffer
-		dpb_size += width_in_mb * height_in_mb * max_references * 192;
-
-		// IT surface buffer
-		dpb_size += width_in_mb * height_in_mb * 32;
+			if (dec->stream_type == RUVD_CODEC_H264_PERF)
+				alignment = 256;
+			switch(dec->base.level) {
+			case 30:
+				num_dpb_buffer = 8100 / fs_in_mb;
+				break;
+			case 31:
+				num_dpb_buffer = 18000 / fs_in_mb;
+				break;
+			case 32:
+				num_dpb_buffer = 20480 / fs_in_mb;
+				break;
+			case 41:
+				num_dpb_buffer = 32768 / fs_in_mb;
+				break;
+			case 42:
+				num_dpb_buffer = 34816 / fs_in_mb;
+				break;
+			case 50:
+				num_dpb_buffer = 110400 / fs_in_mb;
+				break;
+			case 51:
+				num_dpb_buffer = 184320 / fs_in_mb;
+				break;
+			default:
+				num_dpb_buffer = 184320 / fs_in_mb;
+				break;
+			}
+			num_dpb_buffer++;
+			max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references);
+			dpb_size = image_size * max_references;
+			dpb_size += max_references * align(width_in_mb * height_in_mb  * 192, alignment);
+			dpb_size += align(width_in_mb * height_in_mb * 32, alignment);
+		} else {
+			// the firmware seems to allways assume a minimum of ref frames
+			max_references = MAX2(NUM_H264_REFS, max_references);
+			// reference picture buffer
+			dpb_size = image_size * max_references;
+			// macroblock context buffer
+			dpb_size += width_in_mb * height_in_mb * max_references * 192;
+			// IT surface buffer
+			dpb_size += width_in_mb * height_in_mb * 32;
+		}
 		break;
+	}
 
 	case PIPE_VIDEO_FORMAT_VC1:
 		// the firmware seems to allways assume a minimum of ref frames
@@ -303,10 +340,8 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 		assert(0);
 		break;
 	}
-	if (((dec->base.width * dec->base.height) >> 8) <= 1620)
-		result.level = 30;
-	else
-		result.level = 41;
+
+	result.level = dec->base.level;
 
 	result.sps_info_flags = 0;
 	result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0;
@@ -785,8 +820,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 					     ruvd_set_dtb set_dtb)
 {
 	struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws;
-	unsigned dpb_size = calc_dpb_size(templ);
 	struct r600_common_context *rctx = (struct r600_common_context*)context;
+	unsigned dpb_size;
 	unsigned width = templ->width, height = templ->height;
 	unsigned bs_buf_size;
 	struct radeon_info info;
@@ -865,6 +900,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 		rvid_clear_buffer(context, &dec->bs_buffers[i]);
 	}
 
+	dpb_size = calc_dpb_size(dec);
+
 	if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) {
 		RVID_ERR("Can't allocated dpb.\n");
 		goto error;

From 22f71dbf7976d1803940bc2a0429c3d302dae9fa Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 12 Mar 2015 16:24:57 -0400
Subject: [PATCH 1184/1208] radeon/uvd: make 30M as minimum for MPEG4 dpb
 buffer size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_uvd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 7b0eb1110d3..375b5c06e12 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -304,6 +304,8 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 
 		// IT surface buffer
 		dpb_size += align(width_in_mb * height_in_mb * 32, 64);
+
+		dpb_size = MAX2(dpb_size, 30 * 1024 * 1024);
 		break;
 
 	default:

From 1550790b3fab901c697e9d8e5b01ea67d8843e99 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Wed, 15 Apr 2015 12:36:32 -0400
Subject: [PATCH 1185/1208] radeon/vce: implement VCE two pipe support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: rebase by Marek

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c        |  5 +++++
 src/gallium/drivers/radeon/radeon_vce.h        |  4 ++++
 src/gallium/drivers/radeon/radeon_vce_40_2_2.c | 17 +++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index c7b8952403a..02991b2afd7 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -400,6 +400,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 		enc->use_vm = true;
 	if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
 		enc->use_vui = true;
+	if (rscreen->info.family >= CHIP_TONGA)
+		enc->use_2p = true;
 
 	enc->base = *templ;
 	enc->base.context = context;
@@ -439,6 +441,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	cpb_size = cpb_size * align(tmp_surf->npix_y, 16);
 	cpb_size = cpb_size * 3 / 2;
 	cpb_size = cpb_size * enc->cpb_num;
+	if (enc->use_2p)
+		cpb_size +=  RVCE_MAX_AUX_BUFFER_NUM *
+			RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
 	tmp_buf->destroy(tmp_buf);
 	if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) {
 		RVID_ERR("Can't create CPB buffer.\n");
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 66c547855b1..a8448108a55 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -43,6 +43,9 @@
 #define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off))
 #define RVCE_END() *begin = (&enc->cs->buf[enc->cs->cdw] - begin) * 4; }
 
+#define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 2.5)
+#define RVCE_MAX_AUX_BUFFER_NUM 4
+
 struct r600_common_screen;
 
 /* driver dependent callback */
@@ -101,6 +104,7 @@ struct rvce_encoder {
 	struct pipe_h264_enc_picture_desc pic;
 	bool				use_vm;
 	bool				use_vui;
+	bool				use_2p;
 };
 
 /* CPB handling functions */
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 71b5e89fd59..904781525b5 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -287,6 +287,23 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
+	if (enc->use_2p) {
+		unsigned aux_offset = enc->cpb.res->buf->size -
+			RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+		RVCE_BEGIN(0x05000002); // auxiliary buffer
+		for (i = 0; i < 4; ++i) {
+			RVCE_CS(aux_offset);
+			aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+		}
+		for (i = 0; i < 4; ++i)
+			RVCE_CS(0x00000000);
+		for (i = 0; i < 4; ++i)
+			RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE);
+		for (i = 0; i < 4; ++i)
+			RVCE_CS(0x00000000);
+		RVCE_END();
+	}
+
 	RVCE_BEGIN(0x03000001); // encode
 	RVCE_CS(0x00000000); // insertHeaders
 	RVCE_CS(0x00000000); // pictureStructure

From 468fcdcb4fafeba466bb1006ece1f16cc38805c7 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Fri, 29 May 2015 13:43:00 -0400
Subject: [PATCH 1186/1208] radeon/vce: add new firmware support for VI and CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 02991b2afd7..4af044303d2 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -47,6 +47,8 @@
 #define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8))
 #define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8))
 #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
+#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
+#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
 
 /**
  * flush commands to the hardware
@@ -463,6 +465,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	case FW_50_0_1:
 	case FW_50_1_2:
+	case FW_50_10_2:
+	case FW_50_17_3:
 		radeon_vce_50_init(enc);
 		break;
 
@@ -490,7 +494,9 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 {
 	return rscreen->info.vce_fw_version == FW_40_2_2 ||
 		rscreen->info.vce_fw_version == FW_50_0_1 ||
-		rscreen->info.vce_fw_version == FW_50_1_2;
+		rscreen->info.vce_fw_version == FW_50_1_2 ||
+		rscreen->info.vce_fw_version == FW_50_10_2 ||
+		rscreen->info.vce_fw_version == FW_50_17_3;
 }
 
 /**

From fa80c1fe20f1fc33864f04fd9cf49f8bddfa4448 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 15 Jun 2015 14:11:57 -0400
Subject: [PATCH 1187/1208] radeon/vce: add dual pipe support for VI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c        |  6 +++---
 src/gallium/drivers/radeon/radeon_vce.h        |  2 +-
 src/gallium/drivers/radeon/radeon_vce_40_2_2.c | 17 -----------------
 src/gallium/drivers/radeon/radeon_vce_50.c     | 18 +++++++++++++++++-
 4 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 4af044303d2..644958a05e8 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -403,7 +403,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
 		enc->use_vui = true;
 	if (rscreen->info.family >= CHIP_TONGA)
-		enc->use_2p = true;
+		enc->dual_pipe = true;
 
 	enc->base = *templ;
 	enc->base.context = context;
@@ -443,9 +443,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	cpb_size = cpb_size * align(tmp_surf->npix_y, 16);
 	cpb_size = cpb_size * 3 / 2;
 	cpb_size = cpb_size * enc->cpb_num;
-	if (enc->use_2p)
+	if (enc->dual_pipe)
 		cpb_size +=  RVCE_MAX_AUX_BUFFER_NUM *
-			RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+			RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2;
 	tmp_buf->destroy(tmp_buf);
 	if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) {
 		RVID_ERR("Can't create CPB buffer.\n");
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index a8448108a55..ae7b2b431be 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -104,7 +104,7 @@ struct rvce_encoder {
 	struct pipe_h264_enc_picture_desc pic;
 	bool				use_vm;
 	bool				use_vui;
-	bool				use_2p;
+	bool				dual_pipe;
 };
 
 /* CPB handling functions */
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 904781525b5..71b5e89fd59 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -287,23 +287,6 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
-	if (enc->use_2p) {
-		unsigned aux_offset = enc->cpb.res->buf->size -
-			RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
-		RVCE_BEGIN(0x05000002); // auxiliary buffer
-		for (i = 0; i < 4; ++i) {
-			RVCE_CS(aux_offset);
-			aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
-		}
-		for (i = 0; i < 4; ++i)
-			RVCE_CS(0x00000000);
-		for (i = 0; i < 4; ++i)
-			RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE);
-		for (i = 0; i < 4; ++i)
-			RVCE_CS(0x00000000);
-		RVCE_END();
-	}
-
 	RVCE_BEGIN(0x03000001); // encode
 	RVCE_CS(0x00000000); // insertHeaders
 	RVCE_CS(0x00000000); // pictureStructure
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 2cdddf2e5b7..80a69b5442b 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -106,6 +106,19 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
+	if (enc->dual_pipe) {
+		unsigned aux_offset = enc->cpb.res->buf->size -
+			RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2;
+		RVCE_BEGIN(0x05000002); // auxiliary buffer
+		for (i = 0; i < 8; ++i) {
+			RVCE_CS(aux_offset);
+			aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+		}
+		for (i = 0; i < 8; ++i)
+			RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE);
+		RVCE_END();
+	}
+
 	RVCE_BEGIN(0x03000001); // encode
 	RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
 	RVCE_CS(0x00000000); // pictureStructure
@@ -121,7 +134,10 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
 	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
-	RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+	if (enc->dual_pipe)
+		RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+	else
+		RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
 	RVCE_CS(0x00000000); // encInputPicTileConfig
 	RVCE_CS(enc->pic.picture_type); // encPicType
 	RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag

From e91a67abfa5112acd481ee4a3f07c03f6ff2708c Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 15 Jun 2015 15:20:20 -0400
Subject: [PATCH 1188/1208] radeon/vce: fix VCE fail after rebase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce_50.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 80a69b5442b..a11cc6a2860 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -96,13 +96,11 @@ static void encode(struct rvce_encoder *enc)
 	task_info(enc, 0x00000003);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi
-	RVCE_CS(0x00000000); // encodeContextAddressLo
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0); // videoBitstreamRingAddressHi
-	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0); // videoBitstreamRingAddressHi/Lo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
@@ -127,10 +125,10 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // insertAUD
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, 0); // inputPictureLumaAddressHi
-	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, 0); // inputPictureChromaAddressHi
-	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
 	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch

From 57fabe9f3a21a2a370284575833637d37e987cb5 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 1 Jun 2015 13:48:24 -0400
Subject: [PATCH 1189/1208] radeon/vce: add config task and put task info into
 encoder v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The config task has own task ID, extract the configuration functions
into config task.

v2 (chk): calculate offset automatically

Signed-off-by: Leo Liu <leo.liu@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c       | 21 +++++-----
 src/gallium/drivers/radeon/radeon_vce.h       |  6 +++
 .../drivers/radeon/radeon_vce_40_2_2.c        | 39 +++++++++++++++----
 src/gallium/drivers/radeon/radeon_vce_50.c    | 14 +------
 4 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 644958a05e8..e53ef3f8486 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -56,6 +56,7 @@
 static void flush(struct rvce_encoder *enc)
 {
 	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	enc->task_info_idx = 0;
 }
 
 #if 0
@@ -280,24 +281,19 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder,
 		enc->fb = &fb;
 		enc->session(enc);
 		enc->create(enc);
-		enc->rate_control(enc);
-		need_rate_control = false;
-		enc->config_extension(enc);
-		enc->motion_estimation(enc);
-		enc->rdo(enc);
-		if (enc->use_vui)
-			enc->vui(enc);
-		enc->pic_control(enc);
+		enc->config(enc);
 		enc->feedback(enc);
 		flush(enc);
 		//dump_feedback(enc, &fb);
 		rvid_destroy_buffer(&fb);
+		need_rate_control = false;
 	}
 
-	enc->session(enc);
-
-	if (need_rate_control)
-		enc->rate_control(enc);
+	if (need_rate_control) {
+		enc->session(enc);
+		enc->config(enc);
+		flush(enc);
+	}
 }
 
 static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
@@ -314,6 +310,7 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
 		RVID_ERR("Can't create feedback buffer.\n");
 		return;
 	}
+	enc->session(enc);
 	enc->encode(enc);
 	enc->feedback(enc);
 }
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index ae7b2b431be..28629043bc3 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -77,8 +77,12 @@ struct rvce_encoder {
 	void (*motion_estimation)(struct rvce_encoder *enc);
 	void (*rdo)(struct rvce_encoder *enc);
 	void (*vui)(struct rvce_encoder *enc);
+	void (*config)(struct rvce_encoder *enc);
 	void (*encode)(struct rvce_encoder *enc);
 	void (*destroy)(struct rvce_encoder *enc);
+	void (*task_info)(struct rvce_encoder *enc, uint32_t op,
+			  uint32_t dep, uint32_t fb_idx,
+			  uint32_t ring_idx);
 
 	unsigned			stream_handle;
 
@@ -102,6 +106,8 @@ struct rvce_encoder {
 	struct rvid_buffer		*fb;
 	struct rvid_buffer		cpb;
 	struct pipe_h264_enc_picture_desc pic;
+	unsigned			task_info_idx;
+
 	bool				use_vm;
 	bool				use_vui;
 	bool				dual_pipe;
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 71b5e89fd59..ede540d7ac8 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -53,15 +53,24 @@ static void session(struct rvce_encoder *enc)
 	RVCE_END();
 }
 
-static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
+static void task_info(struct rvce_encoder *enc, uint32_t op,
+		      uint32_t dep, uint32_t fb_idx, uint32_t ring_idx)
 {
 	RVCE_BEGIN(0x00000002); // task info
+	if (op == 0x3) {
+		if (enc->task_info_idx) {
+			uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3;
+			// Update offsetOfNextTaskInfo
+			enc->cs->buf[enc->task_info_idx] = offs;
+		}
+		enc->task_info_idx = enc->cs->cdw;
+	}
 	RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
-	RVCE_CS(taskOperation); // taskOperation
-	RVCE_CS(0x00000000); // referencePictureDependency
+	RVCE_CS(op); // taskOperation
+	RVCE_CS(dep); // referencePictureDependency
 	RVCE_CS(0x00000000); // collocateFlagDependency
-	RVCE_CS(0x00000000); // feedbackIndex
-	RVCE_CS(0x00000000); // videoBitstreamRingIndex
+	RVCE_CS(fb_idx); // feedbackIndex
+	RVCE_CS(ring_idx); // videoBitstreamRingIndex
 	RVCE_END();
 }
 
@@ -75,7 +84,7 @@ static void feedback(struct rvce_encoder *enc)
 
 static void create(struct rvce_encoder *enc)
 {
-	task_info(enc, 0x00000000);
+	enc->task_info(enc, 0x00000000, 0, 0, 0);
 
 	RVCE_BEGIN(0x01000001); // create cmd
 	RVCE_CS(0x00000000); // encUseCircularBuffer
@@ -271,12 +280,24 @@ static void vui(struct rvce_encoder *enc)
 	RVCE_END();
 }
 
+static void config(struct rvce_encoder *enc)
+{
+	enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0);
+	enc->rate_control(enc);
+	enc->config_extension(enc);
+	enc->motion_estimation(enc);
+	enc->rdo(enc);
+	if (enc->use_vui)
+		enc->vui(enc);
+	enc->pic_control(enc);
+}
+
 static void encode(struct rvce_encoder *enc)
 {
 	int i;
 	unsigned luma_offset, chroma_offset;
 
-	task_info(enc, 0x00000003);
+	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
 	RVCE_BEGIN(0x05000001); // context buffer
 	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
@@ -401,7 +422,7 @@ static void encode(struct rvce_encoder *enc)
 
 static void destroy(struct rvce_encoder *enc)
 {
-	task_info(enc, 0x00000001);
+	enc->task_info(enc, 0x00000001, 0, 0, 0);
 
 	RVCE_BEGIN(0x02000001); // destroy
 	RVCE_END();
@@ -410,6 +431,7 @@ static void destroy(struct rvce_encoder *enc)
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc)
 {
 	enc->session = session;
+	enc->task_info = task_info;
 	enc->create = create;
 	enc->feedback = feedback;
 	enc->rate_control = rate_control;
@@ -418,6 +440,7 @@ void radeon_vce_40_2_2_init(struct rvce_encoder *enc)
 	enc->motion_estimation = motion_estimation;
 	enc->rdo = rdo;
 	enc->vui = vui;
+	enc->config = config;
 	enc->encode = encode;
 	enc->destroy = destroy;
 }
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index a11cc6a2860..377e1543186 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -44,18 +44,6 @@
 #include "radeon_video.h"
 #include "radeon_vce.h"
 
-static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
-{
-	RVCE_BEGIN(0x00000002); // task info
-	RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
-	RVCE_CS(taskOperation); // taskOperation
-	RVCE_CS(0x00000000); // referencePictureDependency
-	RVCE_CS(0x00000000); // collocateFlagDependency
-	RVCE_CS(0x00000000); // feedbackIndex
-	RVCE_CS(0x00000000); // videoBitstreamRingIndex
-	RVCE_END();
-}
-
 static void rate_control(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x04000005); // rate control
@@ -93,7 +81,7 @@ static void encode(struct rvce_encoder *enc)
 	int i;
 	unsigned luma_offset, chroma_offset;
 
-	task_info(enc, 0x00000003);
+	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
 	RVCE_BEGIN(0x05000001); // context buffer
 	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo

From 42bc4e6be434b398d9edaff0ed10dfb5bf89b6a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Mon, 15 Jun 2015 20:19:48 +0200
Subject: [PATCH 1190/1208] radeon/vce: make reloc offset signed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need a negative offset for FW 50.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c        | 4 ++--
 src/gallium/drivers/radeon/radeon_vce.h        | 4 ++--
 src/gallium/drivers/radeon/radeon_vce_40_2_2.c | 2 +-
 src/gallium/drivers/radeon/radeon_vce_50.c     | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index e53ef3f8486..0b667fd489b 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -217,7 +217,7 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
  * Calculate the offsets into the CPB
  */
 void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
-		       unsigned *luma_offset, unsigned *chroma_offset)
+		       signed *luma_offset, signed *chroma_offset)
 {
 	unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
 	unsigned vpitch = align(enc->luma->npix_y, 16);
@@ -501,7 +501,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
  */
 void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
                      enum radeon_bo_usage usage, enum radeon_bo_domain domain,
-                     uint32_t offset)
+                     signed offset)
 {
 	int reloc_idx;
 
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 28629043bc3..06e9868ca96 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -118,7 +118,7 @@ struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc);
 struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc);
 struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc);
 void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
-		       unsigned *luma_offset, unsigned *chroma_offset);
+		       signed *luma_offset, signed *chroma_offset);
 
 struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 					     const struct pipe_video_codec *templat,
@@ -129,7 +129,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 
 void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
 		     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
-		     uint32_t offset);
+		     signed offset);
 
 /* init vce fw 40.2.2 specific callbacks */
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index ede540d7ac8..e64fbc7afb0 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -294,8 +294,8 @@ static void config(struct rvce_encoder *enc)
 
 static void encode(struct rvce_encoder *enc)
 {
+	signed luma_offset, chroma_offset;
 	int i;
-	unsigned luma_offset, chroma_offset;
 
 	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 377e1543186..5bf22194418 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -78,8 +78,8 @@ static void rate_control(struct rvce_encoder *enc)
 
 static void encode(struct rvce_encoder *enc)
 {
+	signed luma_offset, chroma_offset;
 	int i;
-	unsigned luma_offset, chroma_offset;
 
 	enc->task_info(enc, 0x00000003, 0, 0, 0);
 

From 4dfcf6e3a91be97fcf9d3f44e76a7a389f8f40b2 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 25 Jun 2015 12:12:12 -0400
Subject: [PATCH 1191/1208] radeon/video: config encode stacked frame number
 based on HW
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

since VCE 3.0 with dual instances, we need stack frames for them.

Signed-off-by: Leo Liu <leo.liu@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_video.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 65949fba3b9..5a8d18762d9 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -225,6 +225,8 @@ int rvid_get_video_param(struct pipe_screen *screen,
         	        return false;
 	        case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
         	        return true;
+	        case PIPE_VIDEO_CAP_STACKED_FRAMES:
+			return (rscreen->family < CHIP_TONGA) ? 1 : 2;
 	        default:
         	        return 0;
 		}

From 09def7e1e06827ab1eae091f0e765d91c6715cf9 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Thu, 25 Jun 2015 10:14:14 -0400
Subject: [PATCH 1192/1208] radeon/vce: implement VCE dual instance support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VCE dual instances are encoding in parallel, it needs two frames for
encoding with their own parameters in one IB. Master instance will check
the task info to find another frame, assign it to the slave instance

Signed-off-by: Leo Liu <leo.liu@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c    | 13 +++++++++++--
 src/gallium/drivers/radeon/radeon_vce.h    |  3 +++
 src/gallium/drivers/radeon/radeon_vce_50.c | 19 ++++++++++++++++---
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 0b667fd489b..bc8f36db355 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -57,6 +57,7 @@ static void flush(struct rvce_encoder *enc)
 {
 	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
 	enc->task_info_idx = 0;
+	enc->bs_idx = 0;
 }
 
 #if 0
@@ -310,7 +311,8 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
 		RVID_ERR("Can't create feedback buffer.\n");
 		return;
 	}
-	enc->session(enc);
+	if (!enc->cs->cdw)
+		enc->session(enc);
 	enc->encode(enc);
 	enc->feedback(enc);
 }
@@ -323,7 +325,8 @@ static void rvce_end_frame(struct pipe_video_codec *encoder,
 	struct rvce_cpb_slot *slot = LIST_ENTRY(
 		struct rvce_cpb_slot, enc->cpb_slots.prev, list);
 
-	flush(enc);
+	if (!enc->dual_inst || enc->bs_idx > 1)
+		flush(enc);
 
 	/* update the CPB backtrack with the just encoded frame */
 	slot->picture_type = enc->pic.picture_type;
@@ -362,6 +365,9 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
  */
 static void rvce_flush(struct pipe_video_codec *encoder)
 {
+	struct rvce_encoder *enc = (struct rvce_encoder*)encoder;
+
+	flush(enc);
 }
 
 static void rvce_cs_flush(void *ctx, unsigned flags,
@@ -401,6 +407,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 		enc->use_vui = true;
 	if (rscreen->info.family >= CHIP_TONGA)
 		enc->dual_pipe = true;
+	/* TODO enable B frame with dual instance */
+	if ((rscreen->info.family >= CHIP_TONGA) && (templ->max_references == 1))
+		enc->dual_inst = true;
 
 	enc->base = *templ;
 	enc->base.context = context;
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 06e9868ca96..624bda479f8 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -106,11 +106,14 @@ struct rvce_encoder {
 	struct rvid_buffer		*fb;
 	struct rvid_buffer		cpb;
 	struct pipe_h264_enc_picture_desc pic;
+
 	unsigned			task_info_idx;
+	unsigned			bs_idx;
 
 	bool				use_vm;
 	bool				use_vui;
 	bool				dual_pipe;
+	bool				dual_inst;
 };
 
 /* CPB handling functions */
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 5bf22194418..afdab18c0d3 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -78,17 +78,30 @@ static void rate_control(struct rvce_encoder *enc)
 
 static void encode(struct rvce_encoder *enc)
 {
-	signed luma_offset, chroma_offset;
+	signed luma_offset, chroma_offset, bs_offset;
+	unsigned dep, bs_idx = enc->bs_idx++;
 	int i;
 
-	enc->task_info(enc, 0x00000003, 0, 0, 0);
+	if (enc->dual_inst) {
+		if (bs_idx == 0)
+			dep = 1;
+		else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR)
+			dep = 0;
+		else
+			dep = 2;
+	} else
+		dep = 0;
+
+	enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
 
 	RVCE_BEGIN(0x05000001); // context buffer
 	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
+	bs_offset = -(signed)(bs_idx * enc->bs_size);
+
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0); // videoBitstreamRingAddressHi/Lo
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, bs_offset); // videoBitstreamRingAddressHi/Lo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 

From 0654a9ca17c17fe140f70d126c878a0ce4736b76 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 13 Jul 2015 13:36:27 -0400
Subject: [PATCH 1193/1208] radeon/vce: disable VCE dual instance for harvest
 part
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c       | 4 +++-
 src/gallium/drivers/radeon/radeon_winsys.h    | 1 +
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index bc8f36db355..7eab974a3df 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -408,7 +408,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	if (rscreen->info.family >= CHIP_TONGA)
 		enc->dual_pipe = true;
 	/* TODO enable B frame with dual instance */
-	if ((rscreen->info.family >= CHIP_TONGA) && (templ->max_references == 1))
+	if ((rscreen->info.family >= CHIP_TONGA) &&
+		(templ->max_references == 1) &&
+		(rscreen->info.vce_harvest_config == 0))
 		enc->dual_inst = true;
 
 	enc->base = *templ;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index d7424eef27e..7ab6e56e099 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -244,6 +244,7 @@ struct radeon_info {
 
     boolean                     cik_macrotile_mode_array_valid;
     uint32_t                    cik_macrotile_mode_array[16];
+    uint32_t                    vce_harvest_config;
 };
 
 enum radeon_feature_id {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index f57f45b618f..012c9003b69 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -164,6 +164,7 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
 
    /* Set chip identification. */
    ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
+   ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
 
    switch (ws->info.pci_id) {
 #define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;

From 839bf82606ae9c7b1c7d8d5055ab5e3cadae9bf9 Mon Sep 17 00:00:00 2001
From: Boyuan Zhang <boyuan.zhang@amd.com>
Date: Wed, 8 Jul 2015 16:54:48 -0400
Subject: [PATCH 1194/1208] radeon/uvd: implement HEVC support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add context buffer to fix H265 uvd decode issue.
fix H265 corruption issue caused by incorrect assigned ref_pic_list.

v2: disable interlace for HEVC
    add CZ sps flag workaround
    fix coding style

Signed-off-by: Christian KÃ¶nig <christian.koenig@amd.com>
Signed-off-by: Boyuan Zhang <boyuan.zhang@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/gallium/drivers/radeon/radeon_uvd.c   | 242 ++++++++++++++++++++--
 src/gallium/drivers/radeon/radeon_uvd.h   |  63 ++++++
 src/gallium/drivers/radeon/radeon_video.c |  10 +
 3 files changed, 298 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 375b5c06e12..16ee5410273 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -57,7 +57,7 @@
 
 #define FB_BUFFER_OFFSET 0x1000
 #define FB_BUFFER_SIZE 2048
-#define IT_SCALING_TABLE_SIZE 224
+#define IT_SCALING_TABLE_SIZE 992
 
 /* UVD decoder representation */
 struct ruvd_decoder {
@@ -86,6 +86,7 @@ struct ruvd_decoder {
 
 	struct rvid_buffer		dpb;
 	bool				use_legacy;
+	struct rvid_buffer		ctx;
 };
 
 /* flush IB to the hardware */
@@ -124,6 +125,13 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
 	set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1);
 }
 
+/* do the codec needs an IT buffer ?*/
+static bool have_it(struct ruvd_decoder *dec)
+{
+	return dec->stream_type == RUVD_CODEC_H264_PERF ||
+		dec->stream_type == RUVD_CODEC_H265;
+}
+
 /* map the next available message/feedback/itscaling buffer */
 static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 {
@@ -139,7 +147,7 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 	/* calc buffer offsets */
 	dec->msg = (struct ruvd_msg *)ptr;
 	dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
-	if (dec->stream_type == RUVD_CODEC_H264_PERF)
+	if (have_it(dec))
 		dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);
 }
 
@@ -159,8 +167,7 @@ static void send_msg_buf(struct ruvd_decoder *dec)
 	dec->ws->buffer_unmap(buf->res->cs_buf);
 	dec->msg = NULL;
 	dec->fb = NULL;
-	if (dec->stream_type == RUVD_CODEC_H264_PERF)
-		dec->it = NULL;
+	dec->it = NULL;
 
 	/* and send it to the hardware */
 	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
@@ -191,12 +198,35 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 	case PIPE_VIDEO_FORMAT_MPEG4:
 		return RUVD_CODEC_MPEG4;
 
+	case PIPE_VIDEO_FORMAT_HEVC:
+		return RUVD_CODEC_H265;
+
 	default:
 		assert(0);
 		return 0;
 	}
 }
 
+static unsigned calc_ctx_size(struct ruvd_decoder *dec)
+{
+	unsigned width_in_mb, height_in_mb, ctx_size;
+
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+
+	unsigned max_references = dec->base.max_references + 1;
+
+	if (dec->base.width * dec->base.height >= 4096*2000)
+		max_references = MAX2(max_references, 8);
+	else
+		max_references = MAX2(max_references, 17);
+
+	width = align (width, 16);
+	height = align (height, 16);
+	ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024;
+	return ctx_size;
+}
+
 /* calculate size of reference picture buffer */
 static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 {
@@ -270,6 +300,17 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 		break;
 	}
 
+	case PIPE_VIDEO_FORMAT_HEVC:
+		if (dec->base.width * dec->base.height >= 4096*2000)
+			max_references = MAX2(max_references, 8);
+		else
+			max_references = MAX2(max_references, 17);
+
+		width = align (width, 16);
+		height = align (height, 16);
+		dpb_size = align((width * height * 3) / 2, 256) * max_references;
+		break;
+
 	case PIPE_VIDEO_FORMAT_VC1:
 		// the firmware seems to allways assume a minimum of ref frames
 		max_references = MAX2(NUM_VC1_REFS, max_references);
@@ -319,6 +360,12 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 	return dpb_size;
 }
 
+/* free associated data in the video buffer callback */
+static void ruvd_destroy_associated_data(void *data)
+{
+	/* NOOP, since we only use an intptr */
+}
+
 /* get h264 specific message bits */
 static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_picture_desc *pic)
 {
@@ -392,6 +439,11 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 	memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16);
 	memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64);
 
+	if (dec->stream_type == RUVD_CODEC_H264_PERF) {
+		memcpy(dec->it, result.scaling_list_4x4, 6*16);
+		memcpy((dec->it + 96), result.scaling_list_8x8, 2*64);
+	}
+
 	result.num_ref_frames = pic->num_ref_frames;
 
 	result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1;
@@ -408,6 +460,151 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 	return result;
 }
 
+/* get h265 specific message bits */
+static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target,
+				     struct pipe_h265_picture_desc *pic)
+{
+	struct ruvd_h265 result;
+	unsigned i;
+
+	memset(&result, 0, sizeof(result));
+
+	result.sps_info_flags = 0;
+	result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0;
+	result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1;
+	result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2;
+	result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3;
+	result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4;
+	result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5;
+	result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6;
+	result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7;
+	result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
+	if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO)
+		result.sps_info_flags |= 1 << 9;
+
+	result.chroma_format = pic->pps->sps->chroma_format_idc;
+	result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
+	result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8;
+	result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4;
+	result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1;
+	result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size;
+	result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2;
+	result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size;
+	result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter;
+	result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra;
+	result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1;
+	result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1;
+	result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size;
+	result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets;
+
+	result.pps_info_flags = 0;
+	result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0;
+	result.pps_info_flags |= pic->pps->output_flag_present_flag << 1;
+	result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2;
+	result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3;
+	result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4;
+	result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5;
+	result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6;
+	result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7;
+	result.pps_info_flags |= pic->pps->weighted_pred_flag << 8;
+	result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9;
+	result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10;
+	result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11;
+	result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12;
+	result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13;
+	result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14;
+	result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15;
+	result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16;
+	result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17;
+	result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18;
+	result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19;
+	//result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ???
+
+	result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits;
+	result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps;
+	result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1;
+	result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1;
+	result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset;
+	result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset;
+	result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2;
+	result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2;
+	result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth;
+	result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1;
+	result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1;
+	result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2;
+	result.init_qp_minus26 = pic->pps->init_qp_minus26;
+
+	for (i = 0; i < 19; ++i)
+		result.column_width_minus1[i] = pic->pps->column_width_minus1[i];
+
+	for (i = 0; i < 21; ++i)
+		result.row_height_minus1[i] = pic->pps->row_height_minus1[i];
+
+	result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx;
+	result.curr_idx = pic->CurrPicOrderCntVal;
+	result.curr_poc = pic->CurrPicOrderCntVal;
+
+	vl_video_buffer_set_associated_data(target, &dec->base,
+					    (void *)(uintptr_t)pic->CurrPicOrderCntVal,
+					    &ruvd_destroy_associated_data);
+
+	for (i = 0; i < 16; ++i) {
+		struct pipe_video_buffer *ref = pic->ref[i];
+		uintptr_t ref_pic = 0;
+
+		result.poc_list[i] = pic->PicOrderCntVal[i];
+
+		if (ref)
+			ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base);
+		else
+			ref_pic = 0x7F;
+		result.ref_pic_list[i] = ref_pic;
+	}
+
+	for (i = 0; i < 8; ++i) {
+		result.ref_pic_set_st_curr_before[i] = 0xFF;
+		result.ref_pic_set_st_curr_after[i] = 0xFF;
+		result.ref_pic_set_lt_curr[i] = 0xFF;
+	}
+
+	for (i = 0; i < pic->NumPocStCurrBefore; ++i)
+		result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i];
+
+	for (i = 0; i < pic->NumPocStCurrAfter; ++i)
+		result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i];
+
+	for (i = 0; i < pic->NumPocLtCurr; ++i)
+		result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i];
+
+	for (i = 0; i < 6; ++i)
+		result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i];
+
+	for (i = 0; i < 2; ++i)
+		result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i];
+
+	memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16);
+	memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64);
+	memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64);
+	memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64);
+
+	/* TODO
+	result.highestTid;
+	result.isNonRef;
+
+	IDRPicFlag;
+	RAPPicFlag;
+	NumPocTotalCurr;
+	NumShortTermPictureSliceHeaderBits;
+	NumLongTermPictureSliceHeaderBits;
+
+	IsLongTerm[16];
+	*/
+
+	return result;
+}
+
 /* get vc1 specific message bits */
 static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic)
 {
@@ -627,16 +824,12 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
 	}
 
 	rvid_destroy_buffer(&dec->dpb);
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
+		rvid_destroy_buffer(&dec->ctx);
 
 	FREE(dec);
 }
 
-/* free associated data in the video buffer callback */
-static void ruvd_destroy_associated_data(void *data)
-{
-	/* NOOP, since we only use an intptr */
-}
-
 /**
  * start decoding of a new frame
  */
@@ -759,10 +952,10 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
 		dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture);
-		if (dec->stream_type == RUVD_CODEC_H264_PERF) {
-			memcpy(dec->it, dec->msg->body.decode.codec.h264.scaling_list_4x4, 6*16);
-			memcpy((dec->it + 96), dec->msg->body.decode.codec.h264.scaling_list_8x8, 2*64);
-		}
+		break;
+
+	case PIPE_VIDEO_FORMAT_HEVC:
+		dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture);
 		break;
 
 	case PIPE_VIDEO_FORMAT_VC1:
@@ -792,13 +985,17 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0,
 		 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) {
+		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0,
+			RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	}
 	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf,
 		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
 		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
 	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
 		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
-	if (dec->stream_type == RUVD_CODEC_H264_PERF)
+	if (have_it(dec))
 		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
 			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	set_reg(dec, RUVD_ENGINE_CNTL, 1);
@@ -884,7 +1081,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	for (i = 0; i < NUM_BUFFERS; ++i) {
 		unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
 		STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET);
-		if (dec->stream_type == RUVD_CODEC_H264_PERF)
+		if (have_it(dec))
 			msg_fb_it_size += IT_SCALING_TABLE_SIZE;
 		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i],
 					msg_fb_it_size, PIPE_USAGE_STAGING)) {
@@ -911,6 +1108,15 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 
 	rvid_clear_buffer(context, &dec->dpb);
 
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
+		unsigned ctx_size = calc_ctx_size(dec);
+		if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) {
+			RVID_ERR("Can't allocated context buffer.\n");
+			goto error;
+		}
+		rvid_clear_buffer(context, &dec->ctx);
+	}
+
 	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_CREATE;
@@ -918,7 +1124,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->msg->body.create.stream_type = dec->stream_type;
 	dec->msg->body.create.width_in_samples = dec->base.width;
 	dec->msg->body.create.height_in_samples = dec->base.height;
-	dec->msg->body.create.dpb_size = dec->dpb.res->buf->size;
+	dec->msg->body.create.dpb_size = dpb_size;
 	send_msg_buf(dec);
 	flush(dec);
 	next_buffer(dec);
@@ -934,6 +1140,8 @@ error:
 	}
 
 	rvid_destroy_buffer(&dec->dpb);
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
+		rvid_destroy_buffer(&dec->ctx);
 
 	FREE(dec);
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 5b6c65c81b0..452fbd60880 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -63,6 +63,7 @@
 #define RUVD_CMD_FEEDBACK_BUFFER	0x00000003
 #define RUVD_CMD_BITSTREAM_BUFFER	0x00000100
 #define RUVD_CMD_ITSCALING_TABLE_BUFFER	0x00000204
+#define RUVD_CMD_CONTEXT_BUFFER		0x00000206
 
 /* UVD message types */
 #define RUVD_MSG_CREATE		0
@@ -75,6 +76,7 @@
 #define RUVD_CODEC_MPEG2	0x00000003
 #define RUVD_CODEC_MPEG4	0x00000004
 #define RUVD_CODEC_H264_PERF	0x00000007
+#define RUVD_CODEC_H265		0x00000010
 
 /* UVD decode target buffer tiling mode */
 #define RUVD_TILE_LINEAR	0x00000000
@@ -173,6 +175,66 @@ struct ruvd_h264 {
 	} mvc;
 };
 
+struct ruvd_h265 {
+	uint32_t	sps_info_flags;
+	uint32_t	pps_info_flags;
+
+	uint8_t		chroma_format;
+	uint8_t		bit_depth_luma_minus8;
+	uint8_t		bit_depth_chroma_minus8;
+	uint8_t		log2_max_pic_order_cnt_lsb_minus4;
+
+	uint8_t		sps_max_dec_pic_buffering_minus1;
+	uint8_t		log2_min_luma_coding_block_size_minus3;
+	uint8_t		log2_diff_max_min_luma_coding_block_size;
+	uint8_t		log2_min_transform_block_size_minus2;
+
+	uint8_t		log2_diff_max_min_transform_block_size;
+	uint8_t		max_transform_hierarchy_depth_inter;
+	uint8_t		max_transform_hierarchy_depth_intra;
+	uint8_t		pcm_sample_bit_depth_luma_minus1;
+
+	uint8_t		pcm_sample_bit_depth_chroma_minus1;
+	uint8_t		log2_min_pcm_luma_coding_block_size_minus3;
+	uint8_t		log2_diff_max_min_pcm_luma_coding_block_size;
+	uint8_t		num_extra_slice_header_bits;
+
+	uint8_t		num_short_term_ref_pic_sets;
+	uint8_t		num_long_term_ref_pic_sps;
+	uint8_t		num_ref_idx_l0_default_active_minus1;
+	uint8_t		num_ref_idx_l1_default_active_minus1;
+
+	int8_t		pps_cb_qp_offset;
+	int8_t		pps_cr_qp_offset;
+	int8_t		pps_beta_offset_div2;
+	int8_t		pps_tc_offset_div2;
+
+	uint8_t		diff_cu_qp_delta_depth;
+	uint8_t		num_tile_columns_minus1;
+	uint8_t		num_tile_rows_minus1;
+	uint8_t		log2_parallel_merge_level_minus2;
+
+	uint16_t	column_width_minus1[19];
+	uint16_t	row_height_minus1[21];
+
+	int8_t		init_qp_minus26;
+	uint8_t		num_delta_pocs_ref_rps_idx;
+	uint8_t		curr_idx;
+	uint8_t		reserved1;
+	int32_t		curr_poc;
+	uint8_t		ref_pic_list[16];
+	int32_t		poc_list[16];
+	uint8_t		ref_pic_set_st_curr_before[8];
+	uint8_t		ref_pic_set_st_curr_after[8];
+	uint8_t		ref_pic_set_lt_curr[8];
+
+	uint8_t		ucScalingListDCCoefSizeID2[6];
+	uint8_t		ucScalingListDCCoefSizeID3[2];
+
+	uint8_t		highestTid;
+	uint8_t		isNonRef;
+};
+
 struct ruvd_vc1 {
 	uint32_t	profile;
 	uint32_t	level;
@@ -329,6 +391,7 @@ struct ruvd_msg {
 
 			union {
 				struct ruvd_h264	h264;
+				struct ruvd_h265	h265;
 				struct ruvd_vc1		vc1;
 				struct ruvd_mpeg2	mpeg2;
 				struct ruvd_mpeg4	mpeg4;
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 5a8d18762d9..3a1834b948f 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -264,6 +264,10 @@ int rvid_get_video_param(struct pipe_screen *screen,
 			/* FIXME: VC-1 simple/main profile is broken */
 			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED &&
 			       entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+		case PIPE_VIDEO_FORMAT_HEVC:
+			/* Carrizo only supports HEVC Main */
+			return rscreen->family >= CHIP_CARRIZO &&
+				   profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
 		default:
 			return false;
 		}
@@ -276,8 +280,12 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+			return false; //The hardware doesn't support interlaced HEVC.
 		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+			return false; //The hardware doesn't support interlaced HEVC.
 		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 		return true;
@@ -302,6 +310,8 @@ int rvid_get_video_param(struct pipe_screen *screen,
 		case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
 		case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
 			return 41;
+		case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+			return 186;
 		default:
 			return 0;
 		}

From f47c59322e614d6304091207fc81cfa5beba6ea9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 10 Aug 2015 16:23:53 +0200
Subject: [PATCH 1195/1208] radeonsi: revert a wrong DB bug workaround for VI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug was misunderstood. Besides that, the bug affects a DB feature we
don't use yet.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 21689e71b8b..c923ea7e154 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1994,10 +1994,6 @@ static void si_init_depth_surface(struct si_context *sctx,
 		db_htile_surface = 0;
 	}
 
-	/* Bug workaround. */
-	if (sctx->b.chip_class >= VI)
-		s_info |= S_028044_TILE_STENCIL_DISABLE(1);
-
 	assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
 
 	surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |

From 7bfb9ee5ee0551ef2c2056e7fe2e63e35c629e3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 16 Apr 2015 22:59:41 +0200
Subject: [PATCH 1196/1208] radeonsi: add all new VI PCI IDs including Fiji

---
 include/pci_ids/radeonsi_pci_ids.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index f451b7d761a..52eada1d3d5 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -157,3 +157,27 @@ CHIPSET(0x67B8, HAWAII_67B8, HAWAII)
 CHIPSET(0x67B9, HAWAII_67B9, HAWAII)
 CHIPSET(0x67BA, HAWAII_67BA, HAWAII)
 CHIPSET(0x67BE, HAWAII_67BE, HAWAII)
+
+CHIPSET(0x6900, ICELAND_, ICELAND)
+CHIPSET(0x6901, ICELAND_, ICELAND)
+CHIPSET(0x6902, ICELAND_, ICELAND)
+CHIPSET(0x6903, ICELAND_, ICELAND)
+CHIPSET(0x6907, ICELAND_, ICELAND)
+
+CHIPSET(0x6920, TONGA_, TONGA)
+CHIPSET(0x6921, TONGA_, TONGA)
+CHIPSET(0x6928, TONGA_, TONGA)
+CHIPSET(0x6929, TONGA_, TONGA)
+CHIPSET(0x692B, TONGA_, TONGA)
+CHIPSET(0x692F, TONGA_, TONGA)
+CHIPSET(0x6930, TONGA_, TONGA)
+CHIPSET(0x6938, TONGA_, TONGA)
+CHIPSET(0x6939, TONGA_, TONGA)
+
+CHIPSET(0x9870, CARRIZO_, CARRIZO)
+CHIPSET(0x9874, CARRIZO_, CARRIZO)
+CHIPSET(0x9875, CARRIZO_, CARRIZO)
+CHIPSET(0x9876, CARRIZO_, CARRIZO)
+CHIPSET(0x9877, CARRIZO_, CARRIZO)
+
+CHIPSET(0x7300, FIJI_, FIJI)

From a90aa54fde37cbdf162bf909a9e895b764eb41ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 13 Aug 2015 23:46:13 +0200
Subject: [PATCH 1197/1208] docs/relnotes: document amdgpu, GL 4.1 and other
 new features

---
 docs/relnotes/11.0.0.html | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
index ae3a2be2d36..75967ac7eec 100644
--- a/docs/relnotes/11.0.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -44,6 +44,8 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
+<li>New hardware support for AMD GCN 1.2 GPUs: Tonga, Iceland, Carrizo, Fiji</li>
+<li>OpenGL 4.1 on radeonsi, nvc0</li>
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
 <li>GL_ARB_conditional_render_inverted on r600, radeonsi</li>
 <li>GL_ARB_derivative_control on radeonsi</li>
@@ -53,12 +55,16 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_gpu_shader5 on radeonsi</li>
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_image_load_store on i965</li>
+<li>GL_ARB_shader_precision on radeonsi, nvc0</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_shader_subroutine on core profile all drivers</li>
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GL_EXT_depth_bounds_test on radeonsi, nv30, nv50, nvc0</li>
+<li>GL_NV_read_depth (GLES) on all drivers</li>
+<li>GL_NV_read_depth_stencil (GLES) on all drivers</li>
+<li>GL_NV_read_stencil (GLES) on all drivers</li>
 <li>GL_OES_texture_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_half_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>

From d4087265f656c1998e20cbe2c9b6beaff6762b76 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 11 Aug 2015 11:46:22 -0400
Subject: [PATCH 1198/1208] nvc0: add depth bounds test support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c    | 7 +++++++
 src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 54b469bc03e..c211e99c60a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -177,6 +177,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -199,7 +200,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 693b14fa15f..2a33857d9df 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -351,6 +351,13 @@ nvc0_zsa_state_create(struct pipe_context *pipe,
       SB_DATA    (so, nvgl_comparison_op(cso->depth.func));
    }
 
+   SB_IMMED_3D(so, DEPTH_BOUNDS_EN, cso->depth.bounds_test);
+   if (cso->depth.bounds_test) {
+      SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2);
+      SB_DATA    (so, fui(cso->depth.bounds_min));
+      SB_DATA    (so, fui(cso->depth.bounds_max));
+   }
+
    if (cso->stencil[0].enabled) {
       SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
       SB_DATA    (so, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 9b69bf091d0..18fcc12dea3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -29,7 +29,7 @@ struct nvc0_rasterizer_stateobj {
 struct nvc0_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
    int size;
-   uint32_t state[26];
+   uint32_t state[30];
 };
 
 struct nvc0_constbuf {

From a6bf20d153f06639e1ae7d52d37ace9df440354d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 11 Aug 2015 11:59:56 -0400
Subject: [PATCH 1199/1208] nv50: add depth bounds test support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  2 +-
 src/gallium/drivers/nouveau/nv50/nv50_state.c    | 10 ++++++++++
 src/gallium/drivers/nouveau/nv50/nv50_stateobj.h |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 2479cbd664e..30e6e042fbf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -178,6 +178,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -213,7 +214,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 186d126305a..9505a0b4085 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -373,6 +373,16 @@ nv50_zsa_state_create(struct pipe_context *pipe,
       SB_DATA    (so, 0);
    }
 
+   SB_BEGIN_3D(so, DEPTH_BOUNDS_EN, 1);
+   if (cso->depth.bounds_test) {
+      SB_DATA    (so, 1);
+      SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2);
+      SB_DATA    (so, fui(cso->depth.bounds_min));
+      SB_DATA    (so, fui(cso->depth.bounds_max));
+   } else {
+      SB_DATA    (so, 0);
+   }
+
    if (cso->stencil[0].enabled) {
       SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
       SB_DATA    (so, 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index 853df5497ec..cf75d1eb11b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -31,7 +31,7 @@ struct nv50_rasterizer_stateobj {
 struct nv50_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
    int size;
-   uint32_t state[29];
+   uint32_t state[34];
 };
 
 struct nv50_constbuf {

From 7ff7d5d799a54f2b08a3019df7fd531501174182 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 11 Aug 2015 12:19:54 -0400
Subject: [PATCH 1200/1208] nv30: add depth bounds test support for hw that has
 it

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_screen.c | 4 +++-
 src/gallium/drivers/nouveau/nv30/nv30_state.c  | 8 ++++++++
 src/gallium/drivers/nouveau/nv30/nv30_state.h  | 4 +++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index efa766d4a69..7aad26ba18b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -98,6 +98,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return 1;
+   /* nv35 capabilities */
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
+      return eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS;
    /* nv4x capabilities */
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_NPOT_TEXTURES:
@@ -166,7 +169,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c
index 708ba34c1e5..fd604c2266d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c
@@ -211,6 +211,7 @@ static void *
 nv30_zsa_state_create(struct pipe_context *pipe,
                       const struct pipe_depth_stencil_alpha_state *cso)
 {
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
    struct nv30_zsa_stateobj *so;
 
    so = CALLOC_STRUCT(nv30_zsa_stateobj);
@@ -223,6 +224,13 @@ nv30_zsa_state_create(struct pipe_context *pipe,
    SB_DATA  (so, cso->depth.writemask);
    SB_DATA  (so, cso->depth.enabled);
 
+   if (eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS) {
+      SB_MTHD35(so, DEPTH_BOUNDS_TEST_ENABLE, 3);
+      SB_DATA  (so, cso->depth.bounds_test);
+      SB_DATA  (so, fui(cso->depth.bounds_min));
+      SB_DATA  (so, fui(cso->depth.bounds_max));
+   }
+
    if (cso->stencil[0].enabled) {
       SB_MTHD30(so, STENCIL_ENABLE(0), 3);
       SB_DATA  (so, 1);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h
index 1f13d24f349..ed3b8103a00 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h
@@ -13,6 +13,8 @@
 #define SB_DATA(so, u)        (so)->data[(so)->size++] = (u)
 #define SB_MTHD30(so, mthd, size)                                          \
    SB_DATA((so), ((size) << 18) | (7 << 13) | NV30_3D_##mthd)
+#define SB_MTHD35(so, mthd, size)                                          \
+   SB_DATA((so), ((size) << 18) | (7 << 13) | NV35_3D_##mthd)
 #define SB_MTHD40(so, mthd, size)                                          \
    SB_DATA((so), ((size) << 18) | (7 << 13) | NV40_3D_##mthd)
 
@@ -30,7 +32,7 @@ struct nv30_rasterizer_stateobj {
 
 struct nv30_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
-   unsigned data[32];
+   unsigned data[36];
    unsigned size;
 };
 

From b346a84e270a50f0a8f1a6e474a51da04dd72f0e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 14 Aug 2015 14:10:36 -0400
Subject: [PATCH 1201/1208] gm107/ir: indirect handle goes first on maxwell
 also

Fixes fs-simple-texture-size.shader_test

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.6" <mesa-stable@lists.freedesktop.org>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp        | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index c632e30afae..c3c302da5c8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -993,14 +993,10 @@ NVC0LoweringPass::handleTXQ(TexInstruction *txq)
       txq->tex.r = 0xff;
       txq->tex.s = 0x1f;
 
-      if (chipset < NVISA_GM107_CHIPSET) {
-         txq->setIndirectR(NULL);
-         txq->moveSources(0, 1);
-         txq->setSrc(0, hnd);
-         txq->tex.rIndirectSrc = 0;
-      } else {
-         txq->setIndirectR(hnd);
-      }
+      txq->setIndirectR(NULL);
+      txq->moveSources(0, 1);
+      txq->setSrc(0, hnd);
+      txq->tex.rIndirectSrc = 0;
    }
 
    return true;

From 9e6dc5b64d5e931c7ebc272096eccab102b75d76 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 4 Aug 2015 16:25:24 -0700
Subject: [PATCH 1202/1208] nir: Add a nir_opt_undef() to handle csels with
 undef.

We may find a cause to do more undef optimization in the future, but for
now this fixes up things after if flattening.  vc4 was handling this
internally most of the time, but a GLB2.7 shader that did a conditional
discard and assign gl_FragColor in the else was still emitting some extra
code.

total instructions in shared programs: 100809 -> 100795 (-0.01%)
instructions in affected programs:     37 -> 23 (-37.84%)

v2: Use nir_instr_rewrite_src() to update def/use on src[0] (by Thomas
    Helland).
v3: Make sure to flag metadata dirties, and copy the swizzle and abs/neg
    over to src[0], too (by anholt).

Reviewed-by: Thomas Helland <thomashelland90@gmail.com> (v2)
Tested-by: Thomas Helland <thomashelland90@gmail.com> (v2)
---
 src/gallium/drivers/vc4/vc4_program.c |   1 +
 src/glsl/Makefile.sources             |   1 +
 src/glsl/nir/nir.h                    |   2 +
 src/glsl/nir/nir_opt_undef.c          | 104 ++++++++++++++++++++++++++
 4 files changed, 108 insertions(+)
 create mode 100644 src/glsl/nir/nir_opt_undef.c

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index e9120b7a1ff..4a3a277a9fb 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1627,6 +1627,7 @@ vc4_optimize_nir(struct nir_shader *s)
                 progress = nir_opt_peephole_select(s) || progress;
                 progress = nir_opt_algebraic(s) || progress;
                 progress = nir_opt_constant_folding(s) || progress;
+                progress = nir_opt_undef(s) || progress;
         } while (progress);
 }
 
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index a0e85ed9e4a..0b77244ac03 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -56,6 +56,7 @@ NIR_FILES = \
 	nir/nir_opt_peephole_ffma.c \
 	nir/nir_opt_peephole_select.c \
 	nir/nir_opt_remove_phis.c \
+	nir/nir_opt_undef.c \
 	nir/nir_print.c \
 	nir/nir_remove_dead_variables.c \
 	nir/nir_search.c \
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 9aae6d70354..222a219d0e6 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1704,6 +1704,8 @@ bool nir_opt_peephole_ffma(nir_shader *shader);
 
 bool nir_opt_remove_phis(nir_shader *shader);
 
+bool nir_opt_undef(nir_shader *shader);
+
 void nir_sweep(nir_shader *shader);
 
 #ifdef __cplusplus
diff --git a/src/glsl/nir/nir_opt_undef.c b/src/glsl/nir/nir_opt_undef.c
new file mode 100644
index 00000000000..4ab27a8c9d5
--- /dev/null
+++ b/src/glsl/nir/nir_opt_undef.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+
+/** @file nir_opt_undef.c
+ *
+ * Handles optimization of operations involving ssa_undef.  For now, we just
+ * make sure that csels between undef and some other value just give the other
+ * value (on the assumption that the condition's going to be choosing the
+ * defined value).  This reduces work after if flattening when each side of
+ * the if is defining a variable.
+ *
+ * Some day, we may find some use for making other operations consuming an
+ * undef arg output undef, but I don't know of any cases currently.
+ */
+
+static bool
+opt_undef_alu(nir_alu_instr *instr)
+{
+   if (instr->op != nir_op_bcsel && instr->op != nir_op_fcsel)
+      return false;
+
+   assert(instr->dest.dest.is_ssa);
+
+   for (int i = 1; i <= 2; i++) {
+      if (!instr->src[i].src.is_ssa)
+         continue;
+
+      nir_instr *parent = instr->src[i].src.ssa->parent_instr;
+      if (parent->type != nir_instr_type_ssa_undef)
+         continue;
+
+      /* We can't just use nir_alu_src_copy, because we need the def/use
+       * updated.
+       */
+      nir_instr_rewrite_src(&instr->instr, &instr->src[0].src,
+                            instr->src[i == 1 ? 2 : 1].src);
+      nir_alu_src_copy(&instr->src[0], &instr->src[i == 1 ? 2 : 1],
+                       ralloc_parent(instr));
+
+      nir_src empty_src;
+      memset(&empty_src, 0, sizeof(empty_src));
+      nir_instr_rewrite_src(&instr->instr, &instr->src[1].src, empty_src);
+      nir_instr_rewrite_src(&instr->instr, &instr->src[2].src, empty_src);
+      instr->op = nir_op_imov;
+
+      return true;
+   }
+
+   return false;
+}
+
+static bool
+opt_undef_block(nir_block *block, void *data)
+{
+   bool *progress = data;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type == nir_instr_type_alu)
+         if (opt_undef_alu(nir_instr_as_alu(instr)))
+             (*progress) = true;
+   }
+
+   return true;
+}
+
+bool
+nir_opt_undef(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_overload(shader, overload) {
+      if (overload->impl) {
+         nir_foreach_block(overload->impl, opt_undef_block, &progress);
+         if (progress)
+            nir_metadata_preserve(overload->impl,
+                                  nir_metadata_block_index |
+                                  nir_metadata_dominance);
+      }
+   }
+
+   return progress;
+}

From d50c182671a6aa315c83b5e203655e902996c0e7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 15:35:22 -0700
Subject: [PATCH 1203/1208] nir: Don't try to scalarize unpack ops.

Avoids regressions in vc4 when trying to do our blending in NIR.

v2: Add the other unpack ops I meant to when writing the original commit
    message.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/nir/nir_lower_alu_to_scalar.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 5d15fb2bc32..efbe9e7175f 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -100,6 +100,21 @@ lower_alu_instr_scalar(nir_alu_instr *instr, void *mem_ctx)
        */
       return;
 
+   case nir_op_unpack_unorm_4x8:
+   case nir_op_unpack_snorm_4x8:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_unpack_snorm_2x16:
+      /* There is no scalar version of these ops, unless we were to break it
+       * down to bitshifts and math (which is definitely not intended).
+       */
+      return;
+
+   case nir_op_unpack_half_2x16:
+      /* We could split this into unpack_half_2x16_split_[xy], but should
+       * we?
+       */
+      return;
+
       LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd);
       LOWER_REDUCTION(nir_op_ball_fequal, nir_op_feq, nir_op_iand);
       LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand);

From 38c6c0f5b499e2bcff2cc9607f67c0f1836f305b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 09:02:01 -0700
Subject: [PATCH 1204/1208] vc4: Add a helper for making driver-specific NIR
 load_uniform for GL state

In order to move more of our lowering into NIR, we need the ability to
reference various pipeline state (like texture rectangle scaling factors
or blend colors), so we just set those up as a load_uniform with a big
offset to indicate that it's not within the shader's uniform storage and
is one of our state values.
---
 src/gallium/drivers/vc4/vc4_program.c | 25 +++++++++++++++++++++++--
 src/gallium/drivers/vc4/vc4_qir.h     |  7 +++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 4a3a277a9fb..fb1726c0d1e 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -33,8 +33,9 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_lowering.h"
 #include "tgsi/tgsi_parse.h"
+#include "glsl/nir/nir.h"
+#include "glsl/nir/nir_builder.h"
 #include "nir/tgsi_to_nir.h"
-
 #include "vc4_context.h"
 #include "vc4_qpu.h"
 #include "vc4_qir.h"
@@ -109,6 +110,19 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
         return qir_TEX_RESULT(c);
 }
 
+nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
+                                       enum quniform_contents contents)
+{
+        nir_intrinsic_instr *intr =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_load_uniform);
+        intr->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents;
+        intr->num_components = 1;
+        nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL);
+        nir_builder_instr_insert(b, &intr->instr);
+        return &intr->dest.ssa;
+}
+
 static struct qreg *
 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
@@ -1808,7 +1822,14 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
                 assert(instr->num_components == 1);
-                *dest = qir_uniform(c, QUNIFORM_UNIFORM, instr->const_index[0]);
+                if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) {
+                        *dest = qir_uniform(c, QUNIFORM_UNIFORM,
+                                            instr->const_index[0]);
+                } else {
+                        *dest = qir_uniform(c, instr->const_index[0] -
+                                            VC4_NIR_STATE_UNIFORM_OFFSET,
+                                            0);
+                }
                 break;
 
         case nir_intrinsic_load_uniform_indirect:
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 7a74018d9af..57e25de1b94 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -414,6 +414,11 @@ struct vc4_compile {
         uint32_t variant_id;
 };
 
+/* Special offset for nir_load_uniform values to get a QUNIFORM_*
+ * state-dependent value.
+ */
+#define VC4_NIR_STATE_UNIFORM_OFFSET		2000000000
+
 struct vc4_compile *qir_compile_init(void);
 void qir_compile_destroy(struct vc4_compile *c);
 struct qinst *qir_inst(enum qop op, struct qreg dst,
@@ -454,6 +459,8 @@ bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
 void vc4_nir_lower_io(struct vc4_compile *c);
+nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
+                                       enum quniform_contents contents);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);

From bf3c50fba221f216e38d3f60f89161ced4c684c0 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 13 Apr 2015 21:36:24 -0700
Subject: [PATCH 1205/1208] vc4: Move all of our fixed function fragment color
 handling to NIR.

This massively reduces our dependency on VC4-specific optimization passes.

shader-db:
total uniforms in shared programs: 32077 -> 32067 (-0.03%)
uniforms in affected programs:     149 -> 139 (-6.71%)
total instructions in shared programs: 98208 -> 98182 (-0.03%)
instructions in affected programs:     2154 -> 2128 (-1.21%)
---
 src/gallium/drivers/vc4/Makefile.sources      |   1 +
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 431 +++++++++++++++++
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |  24 +-
 src/gallium/drivers/vc4/vc4_program.c         | 440 +++---------------
 src/gallium/drivers/vc4/vc4_qir.h             |  16 +-
 src/gallium/drivers/vc4/vc4_uniforms.c        |  14 +-
 6 files changed, 538 insertions(+), 388 deletions(-)
 create mode 100644 src/gallium/drivers/vc4/vc4_nir_lower_blend.c

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index b09ffa60c39..6fb40c20562 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -19,6 +19,7 @@ C_SOURCES := \
 	vc4_fence.c \
 	vc4_formats.c \
 	vc4_job.c \
+	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
new file mode 100644
index 00000000000..a372a6c0cdc
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements most of the fixed function fragment pipeline in shader code.
+ *
+ * VC4 doesn't have any hardware support for blending, alpha test, logic ops,
+ * or color mask.  Instead, you read the current contents of the destination
+ * from the tile buffer after having waited for the scoreboard (which is
+ * handled by vc4_qpu_emit.c), then do math using your output color and that
+ * destination value, and update the output color appropriately.
+ */
+
+/**
+ * Lowers fixed-function blending to a load of the destination color and a
+ * series of ALU operations before the store of the output.
+ */
+#include "util/u_format.h"
+#include "vc4_qir.h"
+#include "glsl/nir/nir_builder.h"
+#include "vc4_context.h"
+
+/** Emits a load of the previous fragment color from the tile buffer. */
+static nir_ssa_def *
+vc4_nir_get_dst_color(nir_builder *b)
+{
+        nir_intrinsic_instr *load =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_load_input);
+        load->num_components = 1;
+        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT;
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_builder_instr_insert(b, &load->instr);
+        return &load->dest.ssa;
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb)
+{
+        nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045));
+        nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92));
+        nir_ssa_def *high = nir_fpow(b,
+                                     nir_fmul(b,
+                                              nir_fadd(b, srgb,
+                                                       nir_imm_float(b, 0.055)),
+                                              nir_imm_float(b, 1.0 / 1.055)),
+                                     nir_imm_float(b, 2.4));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
+{
+        nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308));
+        nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92));
+        nir_ssa_def *high = nir_fsub(b,
+                                     nir_fmul(b,
+                                              nir_imm_float(b, 1.055),
+                                              nir_fpow(b,
+                                                       linear,
+                                                       nir_imm_float(b, 0.41666))),
+                                     nir_imm_float(b, 0.055));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static nir_ssa_def *
+vc4_blend_channel(nir_builder *b,
+                  nir_ssa_def **src,
+                  nir_ssa_def **dst,
+                  unsigned factor,
+                  int channel)
+{
+        switch(factor) {
+        case PIPE_BLENDFACTOR_ONE:
+                return nir_imm_float(b, 1.0);
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return src[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return src[3];
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return dst[3];
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return dst[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                if (channel != 3) {
+                        return nir_fmin(b,
+                                        src[3],
+                                        nir_fsub(b,
+                                                 nir_imm_float(b, 1.0),
+                                                 dst[3]));
+                } else {
+                        return nir_imm_float(b, 1.0);
+                }
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel);
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W);
+        case PIPE_BLENDFACTOR_ZERO:
+                return nir_imm_float(b, 0.0);
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]);
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[3]);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]);
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]);
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel));
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W));
+
+        default:
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend factor %d\n", factor);
+                return nir_imm_float(b, 1.0);
+        }
+}
+
+static nir_ssa_def *
+vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+               unsigned func)
+{
+        switch (func) {
+        case PIPE_BLEND_ADD:
+                return nir_fadd(b, src, dst);
+        case PIPE_BLEND_SUBTRACT:
+                return nir_fsub(b, src, dst);
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+                return nir_fsub(b, dst, src);
+        case PIPE_BLEND_MIN:
+                return nir_fmin(b, src, dst);
+        case PIPE_BLEND_MAX:
+                return nir_fmax(b, src, dst);
+
+        default:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend func %d\n", func);
+                return src;
+
+        }
+}
+
+static void
+vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
+                nir_ssa_def **src_color, nir_ssa_def **dst_color)
+{
+        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
+
+        if (!blend->blend_enable) {
+                for (int i = 0; i < 4; i++)
+                        result[i] = src_color[i];
+                return;
+        }
+
+        /* Clamp the src color to [0, 1].  Dest is already clamped. */
+        for (int i = 0; i < 4; i++)
+                src_color[i] = nir_fsat(b, src_color[i]);
+
+        nir_ssa_def *src_blend[4], *dst_blend[4];
+        for (int i = 0; i < 4; i++) {
+                int src_factor = ((i != 3) ? blend->rgb_src_factor :
+                                  blend->alpha_src_factor);
+                int dst_factor = ((i != 3) ? blend->rgb_dst_factor :
+                                  blend->alpha_dst_factor);
+                src_blend[i] = nir_fmul(b, src_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          src_factor, i));
+                dst_blend[i] = nir_fmul(b, dst_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          dst_factor, i));
+        }
+
+        for (int i = 0; i < 4; i++) {
+                result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i],
+                                           ((i != 3) ? blend->rgb_func :
+                                            blend->alpha_func));
+        }
+}
+
+static nir_ssa_def *
+vc4_logicop(nir_builder *b, int logicop_func,
+            nir_ssa_def *src, nir_ssa_def *dst)
+{
+        switch (logicop_func) {
+        case PIPE_LOGICOP_CLEAR:
+                return nir_imm_int(b, 0);
+        case PIPE_LOGICOP_NOR:
+                return nir_inot(b, nir_ior(b, src, dst));
+        case PIPE_LOGICOP_AND_INVERTED:
+                return nir_iand(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_COPY_INVERTED:
+                return nir_inot(b, src);
+        case PIPE_LOGICOP_AND_REVERSE:
+                return nir_iand(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_INVERT:
+                return nir_inot(b, dst);
+        case PIPE_LOGICOP_XOR:
+                return nir_ixor(b, src, dst);
+        case PIPE_LOGICOP_NAND:
+                return nir_inot(b, nir_iand(b, src, dst));
+        case PIPE_LOGICOP_AND:
+                return nir_iand(b, src, dst);
+        case PIPE_LOGICOP_EQUIV:
+                return nir_inot(b, nir_ixor(b, src, dst));
+        case PIPE_LOGICOP_NOOP:
+                return dst;
+        case PIPE_LOGICOP_OR_INVERTED:
+                return nir_ior(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_OR_REVERSE:
+                return nir_ior(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_OR:
+                return nir_ior(b, src, dst);
+        case PIPE_LOGICOP_SET:
+                return nir_imm_int(b, ~0);
+        default:
+                fprintf(stderr, "Unknown logic op %d\n", logicop_func);
+                /* FALLTHROUGH */
+        case PIPE_LOGICOP_COPY:
+                return src;
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_pipe_compare_func(nir_builder *b, int func,
+                          nir_ssa_def *src0, nir_ssa_def *src1)
+{
+        switch (func) {
+        default:
+                fprintf(stderr, "Unknown compare func %d\n", func);
+                /* FALLTHROUGH */
+        case PIPE_FUNC_NEVER:
+                return nir_imm_int(b, 0);
+        case PIPE_FUNC_ALWAYS:
+                return nir_imm_int(b, ~0);
+        case PIPE_FUNC_EQUAL:
+                return nir_feq(b, src0, src1);
+        case PIPE_FUNC_NOTEQUAL:
+                return nir_fne(b, src0, src1);
+        case PIPE_FUNC_GREATER:
+                return nir_flt(b, src1, src0);
+        case PIPE_FUNC_GEQUAL:
+                return nir_fge(b, src0, src1);
+        case PIPE_FUNC_LESS:
+                return nir_flt(b, src0, src1);
+        case PIPE_FUNC_LEQUAL:
+                return nir_fge(b, src1, src0);
+        }
+}
+
+static void
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
+                                nir_ssa_def *alpha)
+{
+        if (!c->fs_key->alpha_test)
+                return;
+
+        nir_ssa_def *alpha_ref =
+                vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF);
+        nir_ssa_def *condition =
+                vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func,
+                                          alpha, alpha_ref);
+
+        nir_intrinsic_instr *discard =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_discard_if);
+        discard->num_components = 1;
+        discard->src[0] = nir_src_for_ssa(nir_inot(b, condition));
+        nir_builder_instr_insert(b, &discard->instr);
+}
+
+static void
+vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+
+        /* Pull out the float src/dst color components. */
+        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
+        nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
+        nir_ssa_def *src_color[4], *unpacked_dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false);
+                unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
+        }
+
+        /* Unswizzle the destination color. */
+        nir_ssa_def *dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                dst_color[i] = vc4_nir_get_swizzled_channel(b,
+                                                            unpacked_dst_color,
+                                                            format_swiz[i]);
+        }
+
+        vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
+
+        /* Turn dst color to linear. */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
+        }
+
+        nir_ssa_def *blend_color[4];
+        vc4_do_blending(c, b, blend_color, src_color, dst_color);
+
+        /* sRGB encode the output color */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
+        }
+
+        nir_ssa_def *swizzled_outputs[4];
+        for (int i = 0; i < 4; i++) {
+                swizzled_outputs[i] =
+                        vc4_nir_get_swizzled_channel(b, blend_color,
+                                                     format_swiz[i]);
+        }
+
+        nir_ssa_def *packed_color =
+                nir_pack_unorm_4x8(b,
+                                   nir_vec4(b,
+                                            swizzled_outputs[0],
+                                            swizzled_outputs[1],
+                                            swizzled_outputs[2],
+                                            swizzled_outputs[3]));
+
+        packed_color = vc4_logicop(b, c->fs_key->logicop_func,
+                                   packed_color, packed_dst_color);
+
+        /* If the bit isn't set in the color mask, then just return the
+         * original dst color, instead.
+         */
+        uint32_t colormask = 0xffffffff;
+        for (int i = 0; i < 4; i++) {
+                if (format_swiz[i] < 4 &&
+                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
+                        colormask &= ~(0xff << (i * 8));
+                }
+        }
+        packed_color = nir_ior(b,
+                               nir_iand(b, packed_color,
+                                        nir_imm_int(b, colormask)),
+                               nir_iand(b, packed_dst_color,
+                                        nir_imm_int(b, ~colormask)));
+
+        /* Turn the old vec4 output into a store of the packed color. */
+        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+                              nir_src_for_ssa(packed_color));
+        intr->num_components = 1;
+}
+
+static bool
+vc4_nir_lower_blend_block(nir_block *block, void *state)
+{
+        struct vc4_compile *c = state;
+
+        nir_foreach_instr(block, instr) {
+                if (instr->type != nir_instr_type_intrinsic)
+                        continue;
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                if (intr->intrinsic != nir_intrinsic_store_output)
+                        continue;
+
+                nir_variable *output_var = NULL;
+                foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                        if (var->data.driver_location == intr->const_index[0]) {
+                                output_var = var;
+                                break;
+                        }
+                }
+                assert(output_var);
+                unsigned semantic_name = output_var->data.location;
+
+                if (semantic_name != TGSI_SEMANTIC_COLOR)
+                        continue;
+
+                nir_function_impl *impl =
+                        nir_cf_node_get_function(&block->cf_node);
+                nir_builder b;
+                nir_builder_init(&b, impl);
+                nir_builder_insert_before_instr(&b, &intr->instr);
+                vc4_nir_lower_blend_instr(c, &b, intr);
+        }
+        return true;
+}
+
+void
+vc4_nir_lower_blend(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl) {
+                        nir_foreach_block(overload->impl,
+                                          vc4_nir_lower_blend_block, c);
+
+                        nir_metadata_preserve(overload->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+}
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index ffc120e8865..229d41147d8 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -56,11 +56,14 @@ static void
 vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                     nir_intrinsic_instr *intr)
 {
-        /* All TGSI-to-NIR inputs are vec4. */
-        assert(intr->num_components == 4);
-
         nir_builder_insert_before_instr(b, &intr->instr);
 
+        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
+            VC4_NIR_TLB_COLOR_READ_INPUT) {
+                /* This doesn't need any lowering. */
+                return;
+        }
+
         nir_variable *input_var = NULL;
         foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
@@ -72,6 +75,9 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
         int semantic_name = input_var->data.location;
         int semantic_index = input_var->data.index;
 
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
         for (unsigned i = 0; i < intr->num_components; i++) {
@@ -145,6 +151,12 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                 return;
         }
 
+        /* Color output is lowered by vc4_nir_lower_blend(). */
+        if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) {
+                intr->const_index[0] *= 4;
+                return;
+        }
+
         /* All TGSI-to-NIR outputs are VEC4. */
         assert(intr->num_components == 4);
 
@@ -170,7 +182,11 @@ static void
 vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                       nir_intrinsic_instr *intr)
 {
-        /* All TGSI-to-NIR uniform loads are vec4. */
+        /* All TGSI-to-NIR uniform loads are vec4, but we may create dword
+         * loads in our lowering passes.
+         */
+        if (intr->num_components == 1)
+                return;
         assert(intr->num_components == 4);
 
         nir_builder_insert_before_instr(b, &intr->instr);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index fb1726c0d1e..13c472152d8 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -123,6 +123,26 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         return &intr->dest.ssa;
 }
 
+nir_ssa_def *
+vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+        switch (swiz) {
+        default:
+        case UTIL_FORMAT_SWIZZLE_NONE:
+                fprintf(stderr, "warning: unknown swizzle\n");
+                /* FALLTHROUGH */
+        case UTIL_FORMAT_SWIZZLE_0:
+                return nir_imm_float(b, 0.0);
+        case UTIL_FORMAT_SWIZZLE_1:
+                return nir_imm_float(b, 1.0);
+        case UTIL_FORMAT_SWIZZLE_X:
+        case UTIL_FORMAT_SWIZZLE_Y:
+        case UTIL_FORMAT_SWIZZLE_Z:
+        case UTIL_FORMAT_SWIZZLE_W:
+                return srcs[swiz];
+        }
+}
+
 static struct qreg *
 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
@@ -258,22 +278,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
         return qir_SEL_X_Y_NS(c, low, high);
 }
 
-static struct qreg
-qir_srgb_encode(struct vc4_compile *c, struct qreg linear)
-{
-        struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92));
-        struct qreg high = qir_FSUB(c,
-                                    qir_FMUL(c,
-                                             qir_uniform_f(c, 1.055),
-                                             qir_POW(c,
-                                                     linear,
-                                                     qir_uniform_f(c, 0.41666))),
-                                    qir_uniform_f(c, 0.055));
-
-        qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308)));
-        return qir_SEL_X_Y_NS(c, low, high);
-}
-
 static struct qreg
 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
 {
@@ -834,6 +838,32 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 return;
         }
 
+        if (instr->op == nir_op_pack_unorm_4x8) {
+                struct qreg result;
+                for (int i = 0; i < 4; i++) {
+                        struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                                      instr->src[0].swizzle[i]);
+                        if (i == 0)
+                                result = qir_PACK_8888_F(c, src);
+                        else
+                                result = qir_PACK_8_F(c, result, src, i);
+                }
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                *dest = result;
+                return;
+        }
+
+        if (instr->op == nir_op_unpack_unorm_4x8) {
+                struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                              instr->src[0].swizzle[0]);
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                for (int i = 0; i < 4; i++) {
+                        if (instr->dest.write_mask & (1 << i))
+                                dest[i] = qir_UNPACK_8_F(c, src, i);
+                }
+                return;
+        }
+
         /* General case: We can just grab the one used channel per src. */
         struct qreg src[nir_op_infos[instr->op].num_inputs];
         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
@@ -1036,161 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 }
 
-static struct qreg
-vc4_blend_channel(struct vc4_compile *c,
-                  struct qreg *dst,
-                  struct qreg *src,
-                  struct qreg val,
-                  unsigned factor,
-                  int channel)
-{
-        switch(factor) {
-        case PIPE_BLENDFACTOR_ONE:
-                return val;
-        case PIPE_BLENDFACTOR_SRC_COLOR:
-                return qir_FMUL(c, val, src[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA:
-                return qir_FMUL(c, val, src[3]);
-        case PIPE_BLENDFACTOR_DST_ALPHA:
-                return qir_FMUL(c, val, dst[3]);
-        case PIPE_BLENDFACTOR_DST_COLOR:
-                return qir_FMUL(c, val, dst[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-                if (channel != 3) {
-                        return qir_FMUL(c,
-                                        val,
-                                        qir_FMIN(c,
-                                                 src[3],
-                                                 qir_FSUB(c,
-                                                          qir_uniform_f(c, 1.0),
-                                                          dst[3])));
-                } else {
-                        return val;
-                }
-        case PIPE_BLENDFACTOR_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR,
-                                            channel));
-        case PIPE_BLENDFACTOR_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3));
-        case PIPE_BLENDFACTOR_ZERO:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[channel]));
-        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[3]));
-        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[3]));
-        case PIPE_BLENDFACTOR_INV_DST_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[channel]));
-        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     channel)));
-        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     3)));
-
-        default:
-        case PIPE_BLENDFACTOR_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_SRC1_ALPHA:
-        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend factor %d\n", factor);
-                return val;
-        }
-}
-
-static struct qreg
-vc4_blend_func(struct vc4_compile *c,
-               struct qreg src, struct qreg dst,
-               unsigned func)
-{
-        switch (func) {
-        case PIPE_BLEND_ADD:
-                return qir_FADD(c, src, dst);
-        case PIPE_BLEND_SUBTRACT:
-                return qir_FSUB(c, src, dst);
-        case PIPE_BLEND_REVERSE_SUBTRACT:
-                return qir_FSUB(c, dst, src);
-        case PIPE_BLEND_MIN:
-                return qir_FMIN(c, src, dst);
-        case PIPE_BLEND_MAX:
-                return qir_FMAX(c, src, dst);
-
-        default:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend func %d\n", func);
-                return src;
-
-        }
-}
-
-/**
- * Implements fixed function blending in shader code.
- *
- * VC4 doesn't have any hardware support for blending.  Instead, you read the
- * current contents of the destination from the tile buffer after having
- * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do
- * math using your output color and that destination value, and update the
- * output color appropriately.
- */
-static void
-vc4_blend(struct vc4_compile *c, struct qreg *result,
-          struct qreg *dst_color, struct qreg *src_color)
-{
-        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
-
-        if (!blend->blend_enable) {
-                for (int i = 0; i < 4; i++)
-                        result[i] = src_color[i];
-                return;
-        }
-
-        for (int i = 0; i < 4; i++)
-                src_color[i] = qir_SAT(c, src_color[i]);
-
-        struct qreg src_blend[4], dst_blend[4];
-        for (int i = 0; i < 3; i++) {
-                src_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 src_color[i],
-                                                 blend->rgb_src_factor, i);
-                dst_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 dst_color[i],
-                                                 blend->rgb_dst_factor, i);
-        }
-        src_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         src_color[3],
-                                         blend->alpha_src_factor, 3);
-        dst_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         dst_color[3],
-                                         blend->alpha_dst_factor, 3);
-
-        for (int i = 0; i < 3; i++) {
-                result[i] = vc4_blend_func(c,
-                                           src_blend[i], dst_blend[i],
-                                           blend->rgb_func);
-        }
-        result[3] = vc4_blend_func(c,
-                                   src_blend[3], dst_blend[3],
-                                   blend->alpha_func);
-}
-
 static void
 clip_distance_discard(struct vc4_compile *c)
 {
@@ -1213,217 +1088,17 @@ clip_distance_discard(struct vc4_compile *c)
         }
 }
 
-static void
-alpha_test_discard(struct vc4_compile *c)
-{
-        struct qreg src_alpha;
-        struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0);
-
-        if (!c->fs_key->alpha_test)
-                return;
-
-        if (c->output_color_index != -1)
-                src_alpha = c->outputs[c->output_color_index + 3];
-        else
-                src_alpha = qir_uniform_f(c, 1.0);
-
-        if (c->discard.file == QFILE_NULL)
-                c->discard = qir_uniform_ui(c, 0);
-
-        switch (c->fs_key->alpha_test_func) {
-        case PIPE_FUNC_NEVER:
-                c->discard = qir_uniform_ui(c, ~0);
-                break;
-        case PIPE_FUNC_ALWAYS:
-                break;
-        case PIPE_FUNC_EQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_NOTEQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GREATER:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LESS:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        }
-}
-
-static struct qreg
-vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst)
-{
-        switch (c->fs_key->logicop_func) {
-        case PIPE_LOGICOP_CLEAR:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_LOGICOP_NOR:
-                return qir_NOT(c, qir_OR(c, src, dst));
-        case PIPE_LOGICOP_AND_INVERTED:
-                return qir_AND(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_COPY_INVERTED:
-                return qir_NOT(c, src);
-        case PIPE_LOGICOP_AND_REVERSE:
-                return qir_AND(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_INVERT:
-                return qir_NOT(c, dst);
-        case PIPE_LOGICOP_XOR:
-                return qir_XOR(c, src, dst);
-        case PIPE_LOGICOP_NAND:
-                return qir_NOT(c, qir_AND(c, src, dst));
-        case PIPE_LOGICOP_AND:
-                return qir_AND(c, src, dst);
-        case PIPE_LOGICOP_EQUIV:
-                return qir_NOT(c, qir_XOR(c, src, dst));
-        case PIPE_LOGICOP_NOOP:
-                return dst;
-        case PIPE_LOGICOP_OR_INVERTED:
-                return qir_OR(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_OR_REVERSE:
-                return qir_OR(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_OR:
-                return qir_OR(c, src, dst);
-        case PIPE_LOGICOP_SET:
-                return qir_uniform_ui(c, ~0);
-        case PIPE_LOGICOP_COPY:
-        default:
-                return src;
-        }
-}
-
-/**
- * Applies the GL blending pipeline and returns the packed (8888) output
- * color.
- */
-static struct qreg
-blend_pipeline(struct vc4_compile *c)
-{
-        enum pipe_format color_format = c->fs_key->color_format;
-        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
-        struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg packed_dst_color = c->undef;
-
-        if (c->fs_key->blend.blend_enable ||
-            c->fs_key->blend.colormask != 0xf ||
-            c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                packed_dst_color = qir_TLB_COLOR_READ(c);
-                for (int i = 0; i < 4; i++)
-                        tlb_read_color[i] = qir_UNPACK_8_F(c,
-                                                           packed_dst_color, i);
-                for (int i = 0; i < 4; i++) {
-                        dst_color[i] = get_swizzled_channel(c,
-                                                            tlb_read_color,
-                                                            format_swiz[i]);
-                        if (util_format_is_srgb(color_format) && i != 3) {
-                                linear_dst_color[i] =
-                                        qir_srgb_decode(c, dst_color[i]);
-                        } else {
-                                linear_dst_color[i] = dst_color[i];
-                        }
-                }
-        }
-
-        struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
-        const struct qreg *output_colors = (c->output_color_index != -1 ?
-                                            c->outputs + c->output_color_index :
-                                            undef_array);
-        struct qreg blend_src_color[4];
-        for (int i = 0; i < 4; i++)
-                blend_src_color[i] = output_colors[i];
-
-        struct qreg blend_color[4];
-        vc4_blend(c, blend_color, linear_dst_color, blend_src_color);
-
-        if (util_format_is_srgb(color_format)) {
-                for (int i = 0; i < 3; i++)
-                        blend_color[i] = qir_srgb_encode(c, blend_color[i]);
-        }
-
-        /* Debug: Sometimes you're getting a black output and just want to see
-         * if the FS is getting executed at all.  Spam magenta into the color
-         * output.
-         */
-        if (0) {
-                blend_color[0] = qir_uniform_f(c, 1.0);
-                blend_color[1] = qir_uniform_f(c, 0.0);
-                blend_color[2] = qir_uniform_f(c, 1.0);
-                blend_color[3] = qir_uniform_f(c, 0.5);
-        }
-
-        struct qreg swizzled_outputs[4];
-        for (int i = 0; i < 4; i++) {
-                swizzled_outputs[i] = get_swizzled_channel(c, blend_color,
-                                                           format_swiz[i]);
-        }
-
-        struct qreg packed_color = c->undef;
-        for (int i = 0; i < 4; i++) {
-                if (swizzled_outputs[i].file == QFILE_NULL)
-                        continue;
-                if (packed_color.file == QFILE_NULL) {
-                        packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]);
-                } else {
-                        packed_color = qir_PACK_8_F(c,
-                                                    packed_color,
-                                                    swizzled_outputs[i],
-                                                    i);
-                }
-        }
-
-        if (packed_color.file == QFILE_NULL)
-                packed_color = qir_uniform_ui(c, 0);
-
-        if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                packed_color = vc4_logicop(c, packed_color, packed_dst_color);
-        }
-
-        /* If the bit isn't set in the color mask, then just return the
-         * original dst color, instead.
-         */
-        uint32_t colormask = 0xffffffff;
-        for (int i = 0; i < 4; i++) {
-                if (format_swiz[i] < 4 &&
-                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
-                        colormask &= ~(0xff << (i * 8));
-                }
-        }
-        if (colormask != 0xffffffff) {
-                packed_color = qir_OR(c,
-                                      qir_AND(c, packed_color,
-                                              qir_uniform_ui(c, colormask)),
-                                      qir_AND(c, packed_dst_color,
-                                              qir_uniform_ui(c, ~colormask)));
-        }
-
-        return packed_color;
-}
-
 static void
 emit_frag_end(struct vc4_compile *c)
 {
         clip_distance_discard(c);
-        alpha_test_discard(c);
-        struct qreg color = blend_pipeline(c);
+
+        struct qreg color;
+        if (c->output_color_index != -1) {
+                color = c->outputs[c->output_color_index];
+        } else {
+                color = qir_uniform_ui(c, 0);
+        }
 
         if (c->discard.file != QFILE_NULL)
                 qir_TLB_DISCARD_SETUP(c, c->discard);
@@ -1839,8 +1514,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_input:
                 assert(instr->num_components == 1);
-                *dest = c->inputs[instr->const_index[0]];
-
+                if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
+                        *dest = qir_TLB_COLOR_READ(c);
+                } else {
+                        *dest = c->inputs[instr->const_index[0]];
+                }
                 break;
 
         case nir_intrinsic_store_output:
@@ -2052,6 +1730,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         c->s = tgsi_to_nir(tokens, &nir_options);
         nir_opt_global_to_local(c->s);
         nir_convert_to_ssa(c->s);
+        if (stage == QSTAGE_FRAG)
+                vc4_nir_lower_blend(c);
         vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
         nir_lower_load_const_to_scalar(c->s);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 57e25de1b94..cade795c12a 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -39,6 +39,8 @@
 #include "vc4_screen.h"
 #include "pipe/p_state.h"
 
+struct nir_builder;
+
 enum qfile {
         QFILE_NULL,
         QFILE_TEMP,
@@ -242,7 +244,11 @@ enum quniform_contents {
 
         QUNIFORM_TEXTURE_BORDER_COLOR,
 
-        QUNIFORM_BLEND_CONST_COLOR,
+        QUNIFORM_BLEND_CONST_COLOR_X,
+        QUNIFORM_BLEND_CONST_COLOR_Y,
+        QUNIFORM_BLEND_CONST_COLOR_Z,
+        QUNIFORM_BLEND_CONST_COLOR_W,
+
         QUNIFORM_STENCIL,
 
         QUNIFORM_ALPHA_REF,
@@ -414,6 +420,11 @@ struct vc4_compile {
         uint32_t variant_id;
 };
 
+/* Special nir_load_input intrinsic index for loading the current TLB
+ * destination color.
+ */
+#define VC4_NIR_TLB_COLOR_READ_INPUT		2000000000
+
 /* Special offset for nir_load_uniform values to get a QUNIFORM_*
  * state-dependent value.
  */
@@ -458,9 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
+void vc4_nir_lower_blend(struct vc4_compile *c);
 void vc4_nir_lower_io(struct vc4_compile *c);
 nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                        enum quniform_contents contents);
+nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
+                                          nir_ssa_def **srcs, int swiz);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 3bf6672a88a..85d6998205e 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -257,9 +257,14 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                                                          uinfo->data[i]));
                         break;
 
-                case QUNIFORM_BLEND_CONST_COLOR:
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
                         cl_aligned_f(&uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
+                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
+                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
+                                           0, 1));
                         break;
 
                 case QUNIFORM_STENCIL:
@@ -321,7 +326,10 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                         dirty |= VC4_DIRTY_TEXSTATE;
                         break;
 
-                case QUNIFORM_BLEND_CONST_COLOR:
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
                         dirty |= VC4_DIRTY_BLEND_COLOR;
                         break;
 

From fb2425a641dd7f891964e6f51b10cce63dff7d2c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 11 Aug 2015 17:10:35 -0700
Subject: [PATCH 1206/1208] nir: Zero out texture instructions when creating
 them.

There are so many flags in textures, that the CSE pass would have a hard
time referencing the correct set when figuring out if two texture ops are
the same.  By zeroing, we can avoid that fragility.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 78ff886218d..2f7cbae42be 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -450,7 +450,7 @@ nir_call_instr_create(nir_shader *shader, nir_function_overload *callee)
 nir_tex_instr *
 nir_tex_instr_create(nir_shader *shader, unsigned num_srcs)
 {
-   nir_tex_instr *instr = ralloc(shader, nir_tex_instr);
+   nir_tex_instr *instr = rzalloc(shader, nir_tex_instr);
    instr_init(&instr->instr, nir_instr_type_tex);
 
    dest_init(&instr->dest);

From a6e75e3cd74fd60200cc8dddc672a2d88495eb06 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 6 Feb 2015 16:24:36 -0800
Subject: [PATCH 1207/1208] nir: Add support for CSE on textures.

NIR instruction count results on i965:
total instructions in shared programs: 1261954 -> 1261937 (-0.00%)
instructions in affected programs:     455 -> 438 (-3.74%)

One in yofrankie, two in tropics.  Apparently i965 had also optimized all
of these out anyway.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/glsl/nir/nir_opt_cse.c | 43 ++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c
index 553906e1291..864795ce5ed 100644
--- a/src/glsl/nir/nir_opt_cse.c
+++ b/src/glsl/nir/nir_opt_cse.c
@@ -86,8 +86,41 @@ nir_instrs_equal(nir_instr *instr1, nir_instr *instr2)
       }
       return true;
    }
-   case nir_instr_type_tex:
-      return false;
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex1 = nir_instr_as_tex(instr1);
+      nir_tex_instr *tex2 = nir_instr_as_tex(instr2);
+
+      if (tex1->op != tex2->op)
+         return false;
+
+      if (tex1->num_srcs != tex2->num_srcs)
+         return false;
+      for (unsigned i = 0; i < tex1->num_srcs; i++) {
+         if (tex1->src[i].src_type != tex2->src[i].src_type ||
+             !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) {
+            return false;
+         }
+      }
+
+      if (tex1->coord_components != tex2->coord_components ||
+          tex1->sampler_dim != tex2->sampler_dim ||
+          tex1->is_array != tex2->is_array ||
+          tex1->is_shadow != tex2->is_shadow ||
+          tex1->is_new_style_shadow != tex2->is_new_style_shadow ||
+          memcmp(tex1->const_offset, tex2->const_offset,
+                 sizeof(tex1->const_offset)) != 0 ||
+          tex1->component != tex2->component ||
+         tex1->sampler_index != tex2->sampler_index ||
+         tex1->sampler_array_size != tex2->sampler_array_size) {
+         return false;
+      }
+
+      /* Don't support un-lowered sampler derefs currently. */
+      if (tex1->sampler || tex2->sampler)
+         return false;
+
+      return true;
+   }
    case nir_instr_type_load_const: {
       nir_load_const_instr *load1 = nir_instr_as_load_const(instr1);
       nir_load_const_instr *load2 = nir_instr_as_load_const(instr2);
@@ -181,11 +214,10 @@ nir_instr_can_cse(nir_instr *instr)
 
    switch (instr->type) {
    case nir_instr_type_alu:
+   case nir_instr_type_tex:
    case nir_instr_type_load_const:
    case nir_instr_type_phi:
       return true;
-   case nir_instr_type_tex:
-      return false; /* TODO */
    case nir_instr_type_intrinsic: {
       const nir_intrinsic_info *info =
          &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
@@ -212,6 +244,9 @@ nir_instr_get_dest_ssa_def(nir_instr *instr)
    case nir_instr_type_alu:
       assert(nir_instr_as_alu(instr)->dest.dest.is_ssa);
       return &nir_instr_as_alu(instr)->dest.dest.ssa;
+   case nir_instr_type_tex:
+      assert(nir_instr_as_tex(instr)->dest.is_ssa);
+      return &nir_instr_as_tex(instr)->dest.ssa;
    case nir_instr_type_load_const:
       return &nir_instr_as_load_const(instr)->def;
    case nir_instr_type_phi:

From d3e23f1ff915c01541f8df375b50b93b3da565a8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 14 Aug 2015 15:58:28 -0400
Subject: [PATCH 1208/1208] nvc0: disable tessellation on maxwell

The address calculations are all different (e.g. see GP), there appear
to be sync's in programs, and probably a bunch of other differences.
Just disable it for now.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index c211e99c60a..ab19b26f156 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -232,11 +232,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 
    switch (shader) {
    case PIPE_SHADER_VERTEX:
-   case PIPE_SHADER_TESS_CTRL:
-   case PIPE_SHADER_TESS_EVAL:
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
       break;
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+      if (class_3d >= GM107_3D_CLASS)
+         return 0;
+      break;
    case PIPE_SHADER_COMPUTE:
       if (class_3d != NVE4_3D_CLASS)
          return 0;